View Javadoc

1   /*
2    * Copyright 2003-2004 Michael Franken, Zilverline.
3    *
4    * The contents of this file, or the files included with this file, are subject to
5    * the current version of ZILVERLINE Collaborative Source License for the
6    * Zilverline Search Engine (the "License"); You may not use this file except in
7    * compliance with the License.
8    *
9    * You may obtain a copy of the License at
10   *
11   *     http://www.zilverline.org.
12   *
13   * See the License for the rights, obligations and
14   * limitations governing use of the contents of the file.
15   *
16   * The Original and Upgraded Code is the Zilverline Search Engine. The developer of
17   * the Original and Upgraded Code is Michael Franken. Michael Franken owns the
18   * copyrights in the portions it created. All Rights Reserved.
19   *
20   */
21  
22  package org.zilverline.core;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.InputStream;
27  import java.io.InputStreamReader;
28  import java.io.StringReader;
29  import java.text.DateFormat;
30  import java.text.SimpleDateFormat;
31  import java.util.Date;
32  import java.util.Properties;
33  
34  import javax.activation.DataHandler;
35  import javax.mail.Address;
36  import javax.mail.BodyPart;
37  import javax.mail.FetchProfile;
38  import javax.mail.Flags;
39  import javax.mail.Folder;
40  import javax.mail.Message;
41  import javax.mail.MessagingException;
42  import javax.mail.NoSuchProviderException;
43  import javax.mail.Part;
44  import javax.mail.Session;
45  import javax.mail.Store;
46  import javax.mail.UIDFolder;
47  import javax.mail.internet.MimeMessage;
48  import javax.mail.internet.MimeMultipart;
49  import javax.mail.internet.MimePart;
50  
51  import org.apache.commons.logging.Log;
52  import org.apache.commons.logging.LogFactory;
53  
54  import org.springframework.util.StringUtils;
55  
56  import org.apache.lucene.document.DateTools;
57  import org.apache.lucene.document.Document;
58  import org.apache.lucene.document.Field;
59  import org.apache.lucene.index.IndexWriter;
60  
61  import org.zilverline.extractors.HTMLExtractor;
62  import org.zilverline.util.StopWatch;
63  
/**
 * An IMAPCollection is a collection of e-mail messages that are read from one folder (or from all folders) of an
 * IMAP mailbox and that are indexed together.
 * 
 * @author Michael Franken
 * @version $Revision: 1.15 $
 */
70  public class IMAPCollection extends AbstractCollection {
71      /*** logger for Commons logging. */
72      private static Log log = LogFactory.getLog(IMAPCollection.class);
73  
    // Names of the Lucene document fields that this class fills while indexing messages and their parts.

    // description of a part, as returned by Part.getDescription()
    private static final String F_CD = "content-description";

    // extracted text of a message or part
    private static final String F_CONTENTS = "contents";

    // content type of a part, as returned by Part.getContentType()
    private static final String F_CT = "content-type";

    // full name of the folder the message was found in
    private static final String F_FOLDER = "folder";

    // sender address(es)
    private static final String F_FROM = "from";

    // received date, encoded with DateTools at second resolution
    private static final String F_RECEIVED = "received";

    // reply-to address(es)
    private static final String F_REPLY_TO = "reply-to";

    // sent date, encoded with DateTools at second resolution
    private static final String F_SENT = "sent";

    // message size as reported by the mail API; stored but not indexed
    private static final String F_SIZE = "size";

    // message subject
    private static final String F_SUBJECT = "subject";

    // recipient address(es)
    private static final String F_TO = "to";

    // UID of the message within its folder
    private static final String F_UID = "uid";

    // imap:// URL constructed for the message
    private static final String F_URL = "url";
100 
    // Message flags that are recorded for every message. FLAGS and SFLAGS are parallel arrays: SFLAGS[i] is the
    // field name under which the state of FLAGS[i] is stored, so both must keep the same length and order.
    private static final Flags.Flag[] FLAGS = new Flags.Flag[] { Flags.Flag.ANSWERED, Flags.Flag.DELETED, Flags.Flag.DRAFT,
        Flags.Flag.FLAGGED, Flags.Flag.RECENT, Flags.Flag.SEEN };

    // Flags.Flag has no toString(), hence the hand-written names
    private static final String[] SFLAGS = new String[] { "answered", "deleted", "draft", "flagged", "recent", "seen" };
107 
    // Fetch profile handed to Folder.fetch() for all messages of a folder before they are indexed one by one.
    private static final FetchProfile PROFILE = new FetchProfile();

    static {
        PROFILE.add(FetchProfile.Item.ENVELOPE); // standard headers
        PROFILE.add(FetchProfile.Item.CONTENT_INFO); // content type, description, size, ...
        PROFILE.add(UIDFolder.FetchProfileItem.UID); // UIDs are used for the message URL and the uid field
        PROFILE.add(com.sun.mail.imap.IMAPFolder.FetchProfileItem.HEADERS);
        PROFILE.add(com.sun.mail.imap.IMAPFolder.FetchProfileItem.SIZE);
    }
117 
    // Name of the folder where indexing starts; when it has no text the store's default folder is used.
    private String folder;

    // Host name of the IMAP server to connect to.
    private String host;

    // Password used to log in to the IMAP server.
    private String password;

    // User name used to log in to the IMAP server; also part of the generated message URLs.
    private String user;
125 
126     /***
127      * Default Constructor setting all fields to non null defaults.
128      */
129     public IMAPCollection() {
130         super();
131         name = "";
132         url = "";
133         description = "";
134         numberOfDocs = 0;
135         version = 0;
136         lastIndexed = null;
137         existsOnDisk = false;
138         keepCache = false;
139         isKeepCacheSet = false;
140     }
141 
142     /***
143      * Gets the origin from where this collection's documents can be retrieved.
144      * 
145      * @return location such as e:/docs or InBox
146      */
147     public final String getRoot() {
148         if (!StringUtils.hasText(folder)) {
149             return "all folders";
150         }
151         return getFolder();
152     }
153 
154     public String getFolder() {
155         return folder;
156     }
157 
158     /***
159      * @return Returns the host.
160      */
161     public String getHost() {
162         return host;
163     }
164 
165     /***
166      * @return Returns the password.
167      */
168     public String getPassword() {
169         return password;
170     }
171 
172     /***
173      * @return Returns the user.
174      */
175     public String getUser() {
176         return user;
177     }
178 
179     /***
180      * Index the given Collection.
181      * 
182      * @param fullIndex indicates whether a full or incremental index should be created
183      * @throws IndexException if the Collections can not be indexed
184      */
185     public final void index(final boolean fullIndex) throws IndexException {
186         resetCache(fullIndex);
187         doIndex(fullIndex);
188     }
189 
190     /***
191      * Index the given Collection.
192      * 
193      * @param fullIndex indicates whether a full or incremental index should be created
194      * @throws IndexException if the Collections can not be indexed
195      * @return true if succesfull
196      */
197     private final boolean doIndex(boolean fullIndex) throws IndexException {
198         IndexWriter writer = null;
199         Store store = null;
200         try {
201             // record start time
202             StopWatch watch = new StopWatch();
203 
204             watch.start();
205 
206             // make sure the index exists
207             File indexDirectory = this.getIndexDirWithManagerDefaults();
208 
209             // reindex if the index is not there or invalid
210             boolean mustReindex = fullIndex;
211             if (!this.isIndexValid()) {
212                 mustReindex = true;
213                 indexDirectory.mkdirs();
214             }
215 
216             // create an index(writer)
217             writer = new IndexWriter(indexDirectory, this.createAnalyzer(), mustReindex);
218             // see whether there are specific indexing settings in manager
219             if (manager.getMergeFactor() != null) {
220                 writer.setMergeFactor(manager.getMergeFactor().intValue());
221             }
222             if (manager.getMinMergeDocs() != null) {
223                 writer.setMaxBufferedDocs(manager.getMinMergeDocs().intValue());
224             }
225 
226             if (manager.getMaxMergeDocs() != null) {
227                 writer.setMaxMergeDocs(manager.getMaxMergeDocs().intValue());
228             }
229 
230             resetCache(fullIndex);
231             // connect to IMAP
232             log.debug("Connecting to IMAP server: " + host);
233             Properties props = System.getProperties();
234             Session session = Session.getDefaultInstance(props, null);
235             store = session.getStore("imap");
236             log.debug("Connecting to " + host + " as " + user);
237             store.connect(host, user, password);
238             log.debug("Connected");
239             // start at the proper folder
240             Folder topFolder = null;
241             if (StringUtils.hasText(folder)) {
242                 topFolder = store.getFolder(folder);
243             } else {
244                 topFolder = store.getDefaultFolder();
245             }
246             indexFolder(writer, topFolder);
247             // record end time and report duration of indexing
248             watch.stop();
249             log.info("Indexed " + writer.docCount() + " documents in " + watch.elapsedTime());
250             return true;
251         }
252         catch (NoSuchProviderException e) {
253             throw new IndexException("Can't connect to " + host, e);
254         }
255         catch (MessagingException e) {
256             throw new IndexException("Error while accessing IMAP server " + host, e);
257         }
258         catch (IOException e) {
259             throw new IndexException("Error indexing '" + this.getName() + "'. Possibly unable to remove old index", e);
260         }
261         catch (Exception e) {
262             throw new IndexException("Error indexing '" + this.getName() + "'", e);
263         }
264         finally {
265             if (writer != null) {
266                 try {
267                     writer.optimize();
268                     log.debug("Optimizing index for " + name);
269                     writer.close();
270                     log.debug("Closing index for " + name);
271                 }
272                 catch (IOException e1) {
273                     log.error("Error closing Index for " + name, e1);
274                 }
275             }
276             if (store != null) {
277                 try {
278                     store.close();
279                 }
280                 catch (MessagingException e1) {
281                     log.error("Error closing IMAP server " + host, e1);
282                 }
283             }
284             init();
285         }
286     }
287 
288     private final boolean indexFolder(IndexWriter writer, Folder thisFolder) throws MessagingException {
289         if (stopRequested) {
290             log.info("Indexing stops, due to request");
291             return false;
292         }
293         if ((thisFolder.getType() & Folder.HOLDS_MESSAGES) != 0) {
294             thisFolder.open(Folder.READ_ONLY);
295             Message[] messages = thisFolder.getMessages(); // get refs to all msgs
296             if (messages == null) {
297                 // dummy
298                 messages = new Message[0];
299             }
300 
301             thisFolder.fetch(messages, PROFILE); // fetch headers
302 
303             log.debug("FOLDER: " + thisFolder.getFullName() + " messages=" + messages.length);
304 
305             for (int i = 0; i < messages.length; i++) {
306                 try {
307                     String msgID = null;
308                     if (messages[i] instanceof MimeMessage) {
309                         MimeMessage mm = (MimeMessage) messages[i];
310                         msgID = mm.getMessageID();
311                     }
312                     if (!md5DocumentCache.contains(msgID)) {
313                         log.debug("new message added for message: " + msgID);
314                         final Document doc = new Document();
315                         doc.add(Field.Keyword(F_FOLDER, thisFolder.getFullName()));
316                         doc.add(Field.Keyword("collection", name));
317                         // index this message
318                         indexMessage(doc, messages[i]);
319                         // add it
320                         writer.addDocument(doc);
321                         md5DocumentCache.add(msgID);
322                     } else {
323                         log.debug("existing message skipped for message: " + msgID);
324                     }
325                 }
326                 catch (Exception ioe) {
327                     // can be side effect of hosed up mail headers
328                     log.warn("Bad Message: " + messages[i], ioe);
329                     continue;
330                 }
331             }
332 
333         }
334         // recurse if possible
335         if ((thisFolder.getType() & Folder.HOLDS_FOLDERS) != 0) {
336             Folder[] far = thisFolder.list();
337             if (far != null) {
338                 for (int i = 0; i < far.length; i++) {
339                     indexFolder(writer, far[i]);
340                 }
341             }
342         }
343         if (thisFolder.isOpen()) {
344             log.debug("Closing folder: " + thisFolder.getFullName());
345             thisFolder.close(false); // false => do not expunge
346         }
347 
348         return true;
349     }
350 
351     /***
352      * Index one message.
353      */
354     private void indexMessage(final Document doc, final Message m) throws MessagingException, IOException {
355         if (stopRequested) {
356             log.info("Indexing stops, due to request");
357             return;
358         }
359         final long uid = ((UIDFolder) m.getFolder()).getUID(m);
360 
361         // form a URL that mozilla seems to accept. Couldn't get it to accept
362         // what I thought was the standard
363 
364         String urlPrefix = "imap://" + user + "@" + host + ":143/fetch%3EUID%3E/";
365 
366         final String url = urlPrefix + m.getFolder().getFullName() + "%3E" + uid;
367         doc.add(Field.Text("name", url));
368 
369         final String subject = m.getSubject();
370         final Date recv = m.getReceivedDate();
371         final Date sent = m.getSentDate();
372         log.info("Folder: " + m.getFolder().getFullName() + ": Message received " + recv + ", subject: " + subject);
373         // -------------------------------------------------------
374         // data gathered, now add to doc
375 
376         if (subject != null) {
377             doc.add(Field.Text(F_SUBJECT, m.getSubject()));
378             doc.add(Field.Text("title", m.getSubject()));
379         }
380 
381         if (recv != null) {
382             doc.add(Field.Keyword(F_RECEIVED, DateTools.timeToString(recv.getTime(), DateTools.Resolution.SECOND)));
383         }
384 
385         if (sent != null) {
386             doc.add(Field.Keyword(F_SENT, DateTools.timeToString(sent.getTime(), DateTools.Resolution.SECOND)));
387             // store date as yyyyMMdd
388             DateFormat df = new SimpleDateFormat("yyyyMMdd");
389             String dfString = df.format(new Date(sent.getTime()));
390             doc.add(Field.Keyword("modified", dfString));
391         }
392 
393         doc.add(Field.Keyword(F_URL, url));
394 
395         Address[] addrs = m.getAllRecipients();
396         if (addrs != null) {
397             for (int j = 0; j < addrs.length; j++) {
398                 doc.add(Field.Keyword(F_TO, "" + addrs[j]));
399             }
400         }
401 
402         addrs = m.getFrom();
403         if (addrs != null) {
404             for (int j = 0; j < addrs.length; j++) {
405                 doc.add(Field.Keyword(F_FROM, "" + addrs[j]));
406                 doc.add(Field.Keyword("author", "" + addrs[j]));
407             }
408         }
409         addrs = m.getReplyTo();
410         if (addrs != null) {
411             for (int j = 0; j < addrs.length; j++) {
412                 doc.add(Field.Keyword(F_REPLY_TO, "" + addrs[j]));
413             }
414         }
415 
416         doc.add(Field.Keyword(F_UID, "" + uid));
417 
418         // could ignore docs that have the deleted flag set
419         for (int j = 0; j < FLAGS.length; j++) {
420             boolean val = m.isSet(FLAGS[j]);
421             doc.add(Field.Keyword(SFLAGS[j], (val ? "true" : "false")));
422         }
423 
424         // now special case for mime
425         if (m instanceof MimeMessage) {
426             // mime++;
427             MimeMessage mm = (MimeMessage) m;
428             log.debug("index, adding MimeMessage " + m.getFileName());
429             indexMimeMessage(doc, mm);
430 
431         } else {
432             // nmime++;
433 
434             final DataHandler dh = m.getDataHandler();
435             log.debug("index, adding (non-MIME) Content " + m.getFileName());
436             doc.add(Field.Text(F_CONTENTS, new InputStreamReader(dh.getInputStream())));
437         }
438     }
439 
440     /***
441      * Index a MIME message, which seems to be all of them.
442      */
443     private void indexMimeMessage(final Document doc, final MimeMessage mm) throws MessagingException, IOException {
444         // o.println( "\n\n[index mm]: " + mm.getSubject());
445 
446         long size = mm.getSize();
447         int lines = mm.getLineCount();
448         doc.add(Field.Keyword("hash", mm.getMessageID()));
449 
450         if (size > 0) {
451             doc.add(Field.UnIndexed(F_SIZE, "" + size));
452         } else {
453             doc.add(Field.UnIndexed(F_SIZE, "" + 0));
454         }
455         indexPart(doc, mm);
456     }
457 
458     /***
459      * Index a part.
460      */
461     private void indexPart(final Document doc, final Part p) throws MessagingException, IOException {
462         int size = p.getSize();
463         String ct = p.getContentType();
464         String cd = p.getDescription();
465         log.debug("IndexContent, type: " + ct + ", description: " + cd);
466         Object content = null;
467 
468         if (ct != null) {
469             doc.add(Field.Keyword(F_CT, ct));
470         }
471         doc.add(Field.Keyword("type", "MAIL"));
472 
473         if (cd != null) {
474             doc.add(Field.Keyword(F_CD, cd));
475         }
476 
477         if (ct != null && ct.toLowerCase().startsWith("image/")) {
478             // no point for now but maybe in the future we see if any forms such as jpegs have some strings
479             return;
480         }
481 
482         try {
483             // get content object, indirectly calls into JAF which decodes based on MIME type and char
484             content = p.getContent();
485         }
486         catch (IOException ioe) {
487             log.warn("OUCH decoding attachment, p=" + p, ioe);
488             doc.add(Field.Text(F_CONTENTS, new InputStreamReader(p.getInputStream())));
489             return;
490         }
491 
492         if (content instanceof MimeMultipart) {
493             int n = ((MimeMultipart) content).getCount();
494             for (int i = 0; i < n; i++) {
495                 BodyPart bp = ((MimeMultipart) content).getBodyPart(i);
496                 // same thing ends up happening regardless, if/else left it to show structure
497                 indexPart(doc, bp);
498             }
499         } else if (content instanceof MimePart) {
500             indexPart(doc, (MimePart) content);
501         } else if (content instanceof Part) {
502             indexPart(doc, (Part) content);
503         } else if (content instanceof String) {
504             indexString(doc, (String) content, ct);
505         } else if (content instanceof InputStream) {
506             indexStream(doc, (InputStream) content, ct);
507         } else {
508             log.error("***** Strange content: " + content + "/" + content.getClass() + " ct=" + ct + " cd=" + cd);
509         }
510     }
511 
512     /***
513      * Index a Stream.
514      */
515     private void indexStream(final Document doc, final InputStream content, final String type) throws MessagingException,
516         IOException {
517         log.debug("indexStream for type: " + type);
518         ExtractorFactory ef = new ExtractorFactory();
519         Extractor ex = ef.createExtractor(type);
520         if (ex != null) {
521             String parsedContent = ex.getContent(content);
522             log.info("Adding content");
523             doc.add(Field.Text(F_CONTENTS, new StringReader(parsedContent)));
524         } else {
525             log.warn("indexStream: Unknown mimetype: " + type);
526         }
527     }
528 
529     /***
530      * Index a String.
531      */
532     private void indexString(final Document doc, final String content, final String type) throws MessagingException, IOException {
533         log.debug("indexString for type: " + type);
534         if (type.toLowerCase().startsWith("text/plain")) {
535             log.info("Adding TEXT: ");
536             doc.add(Field.Text(F_CONTENTS, new StringReader(content)));
537         } else if (type.toLowerCase().startsWith("text/html")) {
538             HTMLExtractor he = new HTMLExtractor();
539             String parsedContent = he.getContent(content);
540             log.info("Adding HTML: ");
541             doc.add(Field.Text(F_CONTENTS, new StringReader(parsedContent)));
542         } else {
543             log.warn("indexString: Unknown mimetype: " + type);
544         }
545     }
546 
547     /***
548      * Sets existsOnDisk based on whether the collection (contentDir) actually (now) sits on disk.
549      * 
550      * @todo the whole existsOnDisk construction is a little funny, refactor some time
551      */
552     protected void setExistsOnDisk() {
553         existsOnDisk = false;
554         Store store = null;
555         try {
556             // try to connect to server and find folder
557             log.debug("Connecting to IMAP server: " + host);
558             Properties props = System.getProperties();
559             Session session = Session.getDefaultInstance(props, null);
560             store = session.getStore("imap");
561             log.debug("Connecting to " + host + " as " + user);
562             store.connect(host, user, password);
563             log.debug("Connected");
564             // start at the proper folder
565             Folder topFolder = null;
566             if (StringUtils.hasText(folder)) {
567                 topFolder = store.getFolder(folder);
568             } else {
569                 topFolder = store.getDefaultFolder();
570             }
571             existsOnDisk = (topFolder != null);
572         }
573         catch (NoSuchProviderException e) {
574             log.warn("Can't connect to " + host, e);
575         }
576         catch (MessagingException e) {
577             log.warn("Error while accessing IMAP server " + host, e);
578         }
579         finally {
580             if (store != null) {
581                 try {
582                     store.close();
583                 }
584                 catch (MessagingException e1) {
585                     log.error("Error closing IMAP server " + host, e1);
586                 }
587             }
588         }
589     }
590 
591     public void setFolder(String thisFolder) {
592         this.folder = thisFolder;
593     }
594 
595     /***
596      * @param thisHost The host to set.
597      */
598     public void setHost(String thisHost) {
599         this.host = thisHost;
600     }
601 
602     /***
603      * @param thisPassword The password to set.
604      */
605     public void setPassword(String thisPassword) {
606         this.password = thisPassword;
607     }
608 
609     /***
610      * @param thisUser The user to set.
611      */
612     public void setUser(String thisUser) {
613         this.user = thisUser;
614     }
615 
616     /***
617      * Prints Collection as String for logging.
618      * 
619      * @return pretty formatted information about the collection
620      */
621     public final String toString() {
622         return "Collection(" + id + "), with name: " + name + ",\n\t\tdescription: " + description + ",\n\t\tcontentDir: "
623             + contentDir + ",\n\t\turl: " + url + ",\n\t\texistsOnDisk: " + existsOnDisk + ",\n\t\tindexDir: " + indexDir
624             + ",\n\t\tcacheDir: " + cacheDir + ",\n\t\tcacheUrl: " + cacheUrl + ",\n\t\tanalyzer: " + analyzer
625             + ",\n\t\tkeepCache: " + keepCache + ",\n\t\tisKeepCacheSet: " + isKeepCacheSet + ",\n\t\tnumberOfDocs: "
626             + numberOfDocs + ",\n\t\tmanager: " + manager + ",\n\t\tlastIndexed: " + lastIndexed;
627         // +
628         // ",\n\t\tmd5DocumentCache:
629         // " + md5DocumentCache +
630         // "\n\n";
631     }
632 }