1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package org.zilverline.core;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.InputStream;
27 import java.io.InputStreamReader;
28 import java.io.StringReader;
29 import java.text.DateFormat;
30 import java.text.SimpleDateFormat;
31 import java.util.Date;
32 import java.util.Properties;
33
34 import javax.activation.DataHandler;
35 import javax.mail.Address;
36 import javax.mail.BodyPart;
37 import javax.mail.FetchProfile;
38 import javax.mail.Flags;
39 import javax.mail.Folder;
40 import javax.mail.Message;
41 import javax.mail.MessagingException;
42 import javax.mail.NoSuchProviderException;
43 import javax.mail.Part;
44 import javax.mail.Session;
45 import javax.mail.Store;
46 import javax.mail.UIDFolder;
47 import javax.mail.internet.MimeMessage;
48 import javax.mail.internet.MimeMultipart;
49 import javax.mail.internet.MimePart;
50
51 import org.apache.commons.logging.Log;
52 import org.apache.commons.logging.LogFactory;
53
54 import org.springframework.util.StringUtils;
55
56 import org.apache.lucene.document.DateTools;
57 import org.apache.lucene.document.Document;
58 import org.apache.lucene.document.Field;
59 import org.apache.lucene.index.IndexWriter;
60
61 import org.zilverline.extractors.HTMLExtractor;
62 import org.zilverline.util.StopWatch;
63
64 /***
* An IMAPCollection is a number of mail messages in an IMAP folder (and its subfolders) that are indexed together.
66 *
67 * @author Michael Franken
68 * @version $Revision: 1.15 $
69 */
70 public class IMAPCollection extends AbstractCollection {
/** Logger for Commons logging; never reassigned, hence final. */
private static final Log log = LogFactory.getLog(IMAPCollection.class);

/* Names of the Lucene fields that are filled from a message. */

private static final String F_CD = "content-description";

private static final String F_CONTENTS = "contents";

private static final String F_CT = "content-type";

private static final String F_FOLDER = "folder";

private static final String F_FROM = "from";

private static final String F_RECEIVED = "received";

private static final String F_REPLY_TO = "reply-to";

private static final String F_SENT = "sent";

private static final String F_SIZE = "size";

private static final String F_SUBJECT = "subject";

private static final String F_TO = "to";

private static final String F_UID = "uid";

private static final String F_URL = "url";

/** Message flags stored with every document; must stay index-aligned with {@link #SFLAGS}. */
private static final Flags.Flag[] FLAGS = new Flags.Flag[] { Flags.Flag.ANSWERED, Flags.Flag.DELETED, Flags.Flag.DRAFT,
    Flags.Flag.FLAGGED, Flags.Flag.RECENT, Flags.Flag.SEEN };

/** Field name used for the flag at the same position in {@link #FLAGS}. */
private static final String[] SFLAGS = new String[] { "answered", "deleted", "draft", "flagged", "recent", "seen" };

/** Items requested in one bulk fetch for all messages of a folder before they are indexed. */
private static final FetchProfile PROFILE = new FetchProfile();

static {
    PROFILE.add(FetchProfile.Item.ENVELOPE);
    PROFILE.add(FetchProfile.Item.CONTENT_INFO);
    PROFILE.add(UIDFolder.FetchProfileItem.UID);
    PROFILE.add(com.sun.mail.imap.IMAPFolder.FetchProfileItem.HEADERS);
    PROFILE.add(com.sun.mail.imap.IMAPFolder.FetchProfileItem.SIZE);
}

/** Name of the folder to index; when it has no text the store's default folder is used. */
private String folder;

/** Host name of the IMAP server. */
private String host;

/** Password used to connect to the IMAP server. */
private String password;

/** User name used to connect to the IMAP server. */
private String user;
125
/**
 * Creates an IMAP collection whose inherited attributes all carry neutral defaults: empty
 * strings for the textual ones, zero for the counters and {@code false} for the flags.
 * {@code lastIndexed} is the one attribute that is explicitly left at {@code null}.
 */
public IMAPCollection() {
    // the no-arg superclass constructor is invoked implicitly

    // flags
    this.isKeepCacheSet = false;
    this.keepCache = false;
    this.existsOnDisk = false;

    // counters and timestamps
    this.lastIndexed = null;
    this.version = 0;
    this.numberOfDocs = 0;

    // textual attributes
    this.description = "";
    this.url = "";
    this.name = "";
}
141
142 /***
143 * Gets the origin from where this collection's documents can be retrieved.
144 *
145 * @return location such as e:/docs or InBox
146 */
147 public final String getRoot() {
148 if (!StringUtils.hasText(folder)) {
149 return "all folders";
150 }
151 return getFolder();
152 }
153
154 public String getFolder() {
155 return folder;
156 }
157
158 /***
159 * @return Returns the host.
160 */
161 public String getHost() {
162 return host;
163 }
164
165 /***
166 * @return Returns the password.
167 */
168 public String getPassword() {
169 return password;
170 }
171
172 /***
173 * @return Returns the user.
174 */
175 public String getUser() {
176 return user;
177 }
178
179 /***
180 * Index the given Collection.
181 *
182 * @param fullIndex indicates whether a full or incremental index should be created
183 * @throws IndexException if the Collections can not be indexed
184 */
185 public final void index(final boolean fullIndex) throws IndexException {
186 resetCache(fullIndex);
187 doIndex(fullIndex);
188 }
189
190 /***
191 * Index the given Collection.
192 *
193 * @param fullIndex indicates whether a full or incremental index should be created
194 * @throws IndexException if the Collections can not be indexed
195 * @return true if succesfull
196 */
197 private final boolean doIndex(boolean fullIndex) throws IndexException {
198 IndexWriter writer = null;
199 Store store = null;
200 try {
201
202 StopWatch watch = new StopWatch();
203
204 watch.start();
205
206
207 File indexDirectory = this.getIndexDirWithManagerDefaults();
208
209
210 boolean mustReindex = fullIndex;
211 if (!this.isIndexValid()) {
212 mustReindex = true;
213 indexDirectory.mkdirs();
214 }
215
216
217 writer = new IndexWriter(indexDirectory, this.createAnalyzer(), mustReindex);
218
219 if (manager.getMergeFactor() != null) {
220 writer.setMergeFactor(manager.getMergeFactor().intValue());
221 }
222 if (manager.getMinMergeDocs() != null) {
223 writer.setMaxBufferedDocs(manager.getMinMergeDocs().intValue());
224 }
225
226 if (manager.getMaxMergeDocs() != null) {
227 writer.setMaxMergeDocs(manager.getMaxMergeDocs().intValue());
228 }
229
230 resetCache(fullIndex);
231
232 log.debug("Connecting to IMAP server: " + host);
233 Properties props = System.getProperties();
234 Session session = Session.getDefaultInstance(props, null);
235 store = session.getStore("imap");
236 log.debug("Connecting to " + host + " as " + user);
237 store.connect(host, user, password);
238 log.debug("Connected");
239
240 Folder topFolder = null;
241 if (StringUtils.hasText(folder)) {
242 topFolder = store.getFolder(folder);
243 } else {
244 topFolder = store.getDefaultFolder();
245 }
246 indexFolder(writer, topFolder);
247
248 watch.stop();
249 log.info("Indexed " + writer.docCount() + " documents in " + watch.elapsedTime());
250 return true;
251 }
252 catch (NoSuchProviderException e) {
253 throw new IndexException("Can't connect to " + host, e);
254 }
255 catch (MessagingException e) {
256 throw new IndexException("Error while accessing IMAP server " + host, e);
257 }
258 catch (IOException e) {
259 throw new IndexException("Error indexing '" + this.getName() + "'. Possibly unable to remove old index", e);
260 }
261 catch (Exception e) {
262 throw new IndexException("Error indexing '" + this.getName() + "'", e);
263 }
264 finally {
265 if (writer != null) {
266 try {
267 writer.optimize();
268 log.debug("Optimizing index for " + name);
269 writer.close();
270 log.debug("Closing index for " + name);
271 }
272 catch (IOException e1) {
273 log.error("Error closing Index for " + name, e1);
274 }
275 }
276 if (store != null) {
277 try {
278 store.close();
279 }
280 catch (MessagingException e1) {
281 log.error("Error closing IMAP server " + host, e1);
282 }
283 }
284 init();
285 }
286 }
287
288 private final boolean indexFolder(IndexWriter writer, Folder thisFolder) throws MessagingException {
289 if (stopRequested) {
290 log.info("Indexing stops, due to request");
291 return false;
292 }
293 if ((thisFolder.getType() & Folder.HOLDS_MESSAGES) != 0) {
294 thisFolder.open(Folder.READ_ONLY);
295 Message[] messages = thisFolder.getMessages();
296 if (messages == null) {
297
298 messages = new Message[0];
299 }
300
301 thisFolder.fetch(messages, PROFILE);
302
303 log.debug("FOLDER: " + thisFolder.getFullName() + " messages=" + messages.length);
304
305 for (int i = 0; i < messages.length; i++) {
306 try {
307 String msgID = null;
308 if (messages[i] instanceof MimeMessage) {
309 MimeMessage mm = (MimeMessage) messages[i];
310 msgID = mm.getMessageID();
311 }
312 if (!md5DocumentCache.contains(msgID)) {
313 log.debug("new message added for message: " + msgID);
314 final Document doc = new Document();
315 doc.add(Field.Keyword(F_FOLDER, thisFolder.getFullName()));
316 doc.add(Field.Keyword("collection", name));
317
318 indexMessage(doc, messages[i]);
319
320 writer.addDocument(doc);
321 md5DocumentCache.add(msgID);
322 } else {
323 log.debug("existing message skipped for message: " + msgID);
324 }
325 }
326 catch (Exception ioe) {
327
328 log.warn("Bad Message: " + messages[i], ioe);
329 continue;
330 }
331 }
332
333 }
334
335 if ((thisFolder.getType() & Folder.HOLDS_FOLDERS) != 0) {
336 Folder[] far = thisFolder.list();
337 if (far != null) {
338 for (int i = 0; i < far.length; i++) {
339 indexFolder(writer, far[i]);
340 }
341 }
342 }
343 if (thisFolder.isOpen()) {
344 log.debug("Closing folder: " + thisFolder.getFullName());
345 thisFolder.close(false);
346 }
347
348 return true;
349 }
350
351 /***
352 * Index one message.
353 */
354 private void indexMessage(final Document doc, final Message m) throws MessagingException, IOException {
355 if (stopRequested) {
356 log.info("Indexing stops, due to request");
357 return;
358 }
359 final long uid = ((UIDFolder) m.getFolder()).getUID(m);
360
361
362
363
364 String urlPrefix = "imap://" + user + "@" + host + ":143/fetch%3EUID%3E/";
365
366 final String url = urlPrefix + m.getFolder().getFullName() + "%3E" + uid;
367 doc.add(Field.Text("name", url));
368
369 final String subject = m.getSubject();
370 final Date recv = m.getReceivedDate();
371 final Date sent = m.getSentDate();
372 log.info("Folder: " + m.getFolder().getFullName() + ": Message received " + recv + ", subject: " + subject);
373
374
375
376 if (subject != null) {
377 doc.add(Field.Text(F_SUBJECT, m.getSubject()));
378 doc.add(Field.Text("title", m.getSubject()));
379 }
380
381 if (recv != null) {
382 doc.add(Field.Keyword(F_RECEIVED, DateTools.timeToString(recv.getTime(), DateTools.Resolution.SECOND)));
383 }
384
385 if (sent != null) {
386 doc.add(Field.Keyword(F_SENT, DateTools.timeToString(sent.getTime(), DateTools.Resolution.SECOND)));
387
388 DateFormat df = new SimpleDateFormat("yyyyMMdd");
389 String dfString = df.format(new Date(sent.getTime()));
390 doc.add(Field.Keyword("modified", dfString));
391 }
392
393 doc.add(Field.Keyword(F_URL, url));
394
395 Address[] addrs = m.getAllRecipients();
396 if (addrs != null) {
397 for (int j = 0; j < addrs.length; j++) {
398 doc.add(Field.Keyword(F_TO, "" + addrs[j]));
399 }
400 }
401
402 addrs = m.getFrom();
403 if (addrs != null) {
404 for (int j = 0; j < addrs.length; j++) {
405 doc.add(Field.Keyword(F_FROM, "" + addrs[j]));
406 doc.add(Field.Keyword("author", "" + addrs[j]));
407 }
408 }
409 addrs = m.getReplyTo();
410 if (addrs != null) {
411 for (int j = 0; j < addrs.length; j++) {
412 doc.add(Field.Keyword(F_REPLY_TO, "" + addrs[j]));
413 }
414 }
415
416 doc.add(Field.Keyword(F_UID, "" + uid));
417
418
419 for (int j = 0; j < FLAGS.length; j++) {
420 boolean val = m.isSet(FLAGS[j]);
421 doc.add(Field.Keyword(SFLAGS[j], (val ? "true" : "false")));
422 }
423
424
425 if (m instanceof MimeMessage) {
426
427 MimeMessage mm = (MimeMessage) m;
428 log.debug("index, adding MimeMessage " + m.getFileName());
429 indexMimeMessage(doc, mm);
430
431 } else {
432
433
434 final DataHandler dh = m.getDataHandler();
435 log.debug("index, adding (non-MIME) Content " + m.getFileName());
436 doc.add(Field.Text(F_CONTENTS, new InputStreamReader(dh.getInputStream())));
437 }
438 }
439
440 /***
441 * Index a MIME message, which seems to be all of them.
442 */
443 private void indexMimeMessage(final Document doc, final MimeMessage mm) throws MessagingException, IOException {
444
445
446 long size = mm.getSize();
447 int lines = mm.getLineCount();
448 doc.add(Field.Keyword("hash", mm.getMessageID()));
449
450 if (size > 0) {
451 doc.add(Field.UnIndexed(F_SIZE, "" + size));
452 } else {
453 doc.add(Field.UnIndexed(F_SIZE, "" + 0));
454 }
455 indexPart(doc, mm);
456 }
457
458 /***
459 * Index a part.
460 */
461 private void indexPart(final Document doc, final Part p) throws MessagingException, IOException {
462 int size = p.getSize();
463 String ct = p.getContentType();
464 String cd = p.getDescription();
465 log.debug("IndexContent, type: " + ct + ", description: " + cd);
466 Object content = null;
467
468 if (ct != null) {
469 doc.add(Field.Keyword(F_CT, ct));
470 }
471 doc.add(Field.Keyword("type", "MAIL"));
472
473 if (cd != null) {
474 doc.add(Field.Keyword(F_CD, cd));
475 }
476
477 if (ct != null && ct.toLowerCase().startsWith("image/")) {
478
479 return;
480 }
481
482 try {
483
484 content = p.getContent();
485 }
486 catch (IOException ioe) {
487 log.warn("OUCH decoding attachment, p=" + p, ioe);
488 doc.add(Field.Text(F_CONTENTS, new InputStreamReader(p.getInputStream())));
489 return;
490 }
491
492 if (content instanceof MimeMultipart) {
493 int n = ((MimeMultipart) content).getCount();
494 for (int i = 0; i < n; i++) {
495 BodyPart bp = ((MimeMultipart) content).getBodyPart(i);
496
497 indexPart(doc, bp);
498 }
499 } else if (content instanceof MimePart) {
500 indexPart(doc, (MimePart) content);
501 } else if (content instanceof Part) {
502 indexPart(doc, (Part) content);
503 } else if (content instanceof String) {
504 indexString(doc, (String) content, ct);
505 } else if (content instanceof InputStream) {
506 indexStream(doc, (InputStream) content, ct);
507 } else {
508 log.error("***** Strange content: " + content + "/" + content.getClass() + " ct=" + ct + " cd=" + cd);
509 }
510 }
511
512 /***
513 * Index a Stream.
514 */
515 private void indexStream(final Document doc, final InputStream content, final String type) throws MessagingException,
516 IOException {
517 log.debug("indexStream for type: " + type);
518 ExtractorFactory ef = new ExtractorFactory();
519 Extractor ex = ef.createExtractor(type);
520 if (ex != null) {
521 String parsedContent = ex.getContent(content);
522 log.info("Adding content");
523 doc.add(Field.Text(F_CONTENTS, new StringReader(parsedContent)));
524 } else {
525 log.warn("indexStream: Unknown mimetype: " + type);
526 }
527 }
528
529 /***
530 * Index a String.
531 */
532 private void indexString(final Document doc, final String content, final String type) throws MessagingException, IOException {
533 log.debug("indexString for type: " + type);
534 if (type.toLowerCase().startsWith("text/plain")) {
535 log.info("Adding TEXT: ");
536 doc.add(Field.Text(F_CONTENTS, new StringReader(content)));
537 } else if (type.toLowerCase().startsWith("text/html")) {
538 HTMLExtractor he = new HTMLExtractor();
539 String parsedContent = he.getContent(content);
540 log.info("Adding HTML: ");
541 doc.add(Field.Text(F_CONTENTS, new StringReader(parsedContent)));
542 } else {
543 log.warn("indexString: Unknown mimetype: " + type);
544 }
545 }
546
547 /***
548 * Sets existsOnDisk based on whether the collection (contentDir) actually (now) sits on disk.
549 *
550 * @todo the whole existsOnDisk construction is a little funny, refactor some time
551 */
552 protected void setExistsOnDisk() {
553 existsOnDisk = false;
554 Store store = null;
555 try {
556
557 log.debug("Connecting to IMAP server: " + host);
558 Properties props = System.getProperties();
559 Session session = Session.getDefaultInstance(props, null);
560 store = session.getStore("imap");
561 log.debug("Connecting to " + host + " as " + user);
562 store.connect(host, user, password);
563 log.debug("Connected");
564
565 Folder topFolder = null;
566 if (StringUtils.hasText(folder)) {
567 topFolder = store.getFolder(folder);
568 } else {
569 topFolder = store.getDefaultFolder();
570 }
571 existsOnDisk = (topFolder != null);
572 }
573 catch (NoSuchProviderException e) {
574 log.warn("Can't connect to " + host, e);
575 }
576 catch (MessagingException e) {
577 log.warn("Error while accessing IMAP server " + host, e);
578 }
579 finally {
580 if (store != null) {
581 try {
582 store.close();
583 }
584 catch (MessagingException e1) {
585 log.error("Error closing IMAP server " + host, e1);
586 }
587 }
588 }
589 }
590
591 public void setFolder(String thisFolder) {
592 this.folder = thisFolder;
593 }
594
595 /***
596 * @param thisHost The host to set.
597 */
598 public void setHost(String thisHost) {
599 this.host = thisHost;
600 }
601
602 /***
603 * @param thisPassword The password to set.
604 */
605 public void setPassword(String thisPassword) {
606 this.password = thisPassword;
607 }
608
609 /***
610 * @param thisUser The user to set.
611 */
612 public void setUser(String thisUser) {
613 this.user = thisUser;
614 }
615
616 /***
617 * Prints Collection as String for logging.
618 *
619 * @return pretty formatted information about the collection
620 */
621 public final String toString() {
622 return "Collection(" + id + "), with name: " + name + ",\n\t\tdescription: " + description + ",\n\t\tcontentDir: "
623 + contentDir + ",\n\t\turl: " + url + ",\n\t\texistsOnDisk: " + existsOnDisk + ",\n\t\tindexDir: " + indexDir
624 + ",\n\t\tcacheDir: " + cacheDir + ",\n\t\tcacheUrl: " + cacheUrl + ",\n\t\tanalyzer: " + analyzer
625 + ",\n\t\tkeepCache: " + keepCache + ",\n\t\tisKeepCacheSet: " + isKeepCacheSet + ",\n\t\tnumberOfDocs: "
626 + numberOfDocs + ",\n\t\tmanager: " + manager + ",\n\t\tlastIndexed: " + lastIndexed;
627
628
629
630
631 }
632 }