1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package org.zilverline.core;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.util.Date;
27 import java.util.HashSet;
28 import java.util.Set;
29
30 import org.apache.commons.logging.Log;
31 import org.apache.commons.logging.LogFactory;
32
33 import org.springframework.util.StringUtils;
34
35 import org.apache.lucene.analysis.Analyzer;
36 import org.apache.lucene.document.Document;
37 import org.apache.lucene.index.IndexReader;
38
39 import org.zilverline.service.CollectionManager;
40 import org.zilverline.util.FileUtils;
41
42 /***
43 * AbstractCollection provides common implementation for all
44 * DocumentCollections.
45 *
46 * @author Michael Franken
47 * @version $Revision: 1.12 $
48 */
49 public abstract class AbstractCollection implements DocumentCollection {
50 /*** logger for Commons logging. */
51 private static Log log = LogFactory.getLog(AbstractCollection.class);
52
53 /***
54 * String representation of Analyzer.
55 */
56 protected String analyzer;
57
58 /*** The Analyzer to be used in indexing and searching. */
59 protected transient Analyzer analyzerObject = null;
60
61 /***
62 * The archive cache is used to store the keys of archives that are
63 * extracted 'on-the-fly'.
64 */
65 protected transient Set archiveCache;
66
67 /***
68 * The cacheDir is the directory this collection's cache is stored at.
69 *
70 * <p>
71 * The cache is used to (temporarily) store expanded content, such as zip
72 * files.
73 * </p>
74 */
75 protected File cacheDir;
76
77 /***
78 * The cacheUrl is the location this collection's cache (if any) is mapped
79 * to as a result of a search. e.g. d:\temp\cache\books\java could be mapped
80 * to https://server/cache/path
81 */
82 protected String cacheUrl;
83
84 /***
85 * The contentDir is the directory this collection is stored at.
86 * <p>
87 * e.g. d:\books\java
88 * </p>
89 *
90 * <p>
91 * The <code>contentDir</code> needs to point to an existing directory in
92 * order to be indexed.
93 * </p>
94 */
95 protected File contentDir;
96
97 /*** Description of collection. */
98 protected String description;
99
100 /*** Indicates whether a collection actually esists on disk. */
101 protected transient boolean existsOnDisk;
102
103 /*** id indicates identity. Used for persistency and presentation. */
104 protected Long id;
105
106 /***
107 * The indexDir is the directory where the index is stored.
108 *
109 * <p>
110 * e.g. d:\temp\zilverline\index
111 * </p>
112 */
113 protected File indexDir;
114
115 /***
116 * The thread used to index this collection.
117 *
118 */
119 protected transient Thread indexingThread;
120
121 /***
122 * Attribute used to find out whether <code>keepCache</code> has been set
123 * externally (using setter).
124 */
125 protected transient boolean isKeepCacheSet;
126
127 /***
128 * Indicated whether the cache should be removed after indexing.
129 *
130 * <p>
131 * If not, search results can return files in for instance zip files.
132 * </p>
133 */
134 protected boolean keepCache;
135
136 /*** The date of the last index of this collection. */
137 protected transient Date lastIndexed;
138
139 /*** Reference back to the collectionManager. */
140 protected transient CollectionManager manager;
141
142 /*** This cache is used to store the MD5 keys of all indexed documents. */
143 protected transient Set md5DocumentCache;
144
145 /*** Name of collection, also used as part of the name of index. */
146 protected String name;
147
148 /***
149 * Variable used to possibly stop the indexing thread.
150 */
151 protected transient boolean stopRequested;
152
153 /***
154 * Number of Documents in this collection. Can only be set by actually
155 * consulting the corresponding index
156 */
157 protected transient int numberOfDocs;
158
159 /***
160 * The url is the location this collection is mapped to as a result of a
161 * search. e.g. d:\books\java could be mapped to https://server/path/java/
162 */
163 protected String url;
164
165 /*** The version of the index of this collection. */
166 protected transient long version;
167
168 /***
169 * Returns an Analyzer for this collection based on configuration.
170 *
171 * @return the Analyzer used to index and search this collection
172 * @todo the analyzer setting and creation is a bit funny, refactor some
173 * time
174 * @see Analyzer
175 */
176 public final Analyzer createAnalyzer() {
177 if (analyzerObject != null) {
178 return analyzerObject;
179 } else {
180 return manager.createAnalyzer();
181 }
182 }
183
184 /***
185 * Determine whether the collection (contentDir) actually (now) exists on
186 * disk.
187 *
188 * @return true if the collection exists
189 */
190 public final boolean existsOnDisk() {
191 setExistsOnDisk();
192 return existsOnDisk;
193 }
194
195 /***
196 * Get the Analyzer.
197 *
198 * @return the Analyzer as String
199 * @see org.apache.lucene.analysis.Analyzer
200 */
201 public final String getAnalyzer() {
202 return analyzer;
203 }
204
205 /***
206 * Gets the archive cache for this Collection.
207 *
208 * <p>
209 * The archive cache is used to store the keys of archives that are
210 * extracted 'on-the-fly'
211 * </p>
212 *
213 * @return HashSet containing archives that have been cached (so they have
214 * been extracted)
215 */
216 public final Set getArchiveCache() {
217 return archiveCache;
218 }
219
220 /***
221 * Get the location where this collection's cache is kept on disk.
222 *
223 * @return Returns the cacheDir.
224 */
225 public final File getCacheDir() {
226 return cacheDir;
227 }
228
229 /***
230 * Gets the directory where this collection's cache is stored. If the
231 * cacheDir is not set for this Collection, the name of this collection is
232 * used, possibly prepended with the (default) retrieved from the manager.
233 * The cache is used to (temporarily) store expanded content, such as zip
234 * files.
235 *
236 * @return The directory where the cache of this collection is stored on
237 * disk.
238 */
239 public final File getCacheDirWithManagerDefaults() {
240 if ((cacheDir == null) || "".equals(cacheDir.toString())) {
241 if (manager != null) {
242
243
244 return new File(manager.getCacheBaseDir(), name);
245 } else {
246 log.warn("Manager for " + name + " should not be null");
247 return new File(name, "cache");
248 }
249 }
250
251 return cacheDir;
252 }
253
254 /***
255 * Gets the URL where this collection's cached documents can be retrieved.
256 *
257 * @return Returns the cacheUrl.
258 */
259 public final String getCacheUrl() {
260 return cacheUrl;
261 }
262
263 /***
264 * The URL maps the cacheDir to another location.
265 *
266 * <p>
267 * e.g. A document 'ldap.pdf' in cacheDir 'e:\collection\cache\books\' with
268 * an cacheURL of 'http://search.company.com/cachedBooks/' will be returned
269 * in a search result as
270 * <code>http://search.company.com/cachedBooks/ldap.pdf</code>
271 * </p>
272 *
273 * @return the cacheUrl of the collection, or the cacheDir as URL if url is
274 * null or empty.
275 */
276 public final String getCacheUrlWithManagerDefaults() {
277 if (StringUtils.hasLength(cacheUrl)) {
278 if (cacheUrl.endsWith("/")) {
279 return cacheUrl;
280 } else {
281 return cacheUrl + "/";
282 }
283 } else {
284 return "file://"
285 + getCacheDirWithManagerDefaults().toURI().getPath();
286 }
287 }
288
289 /***
290 * Gets the location where this collection's documents can be retrieved.
291 *
292 * @return contentDir directory of collection
293 */
294 public final File getContentDir() {
295 return contentDir;
296 }
297
298 /***
299 * Gets the origin from where this collection's documents can be retrieved.
300 *
301 * @return location such as e:/docs or InBox
302 */
303 public abstract String getRoot();
304
305 /***
306 * Get the description of the collection.
307 *
308 * @return description for the collection
309 */
310 public final String getDescription() {
311 return description;
312 }
313
314 /***
315 * Get the id of the collection.
316 *
317 * @return unique id, can be null
318 */
319 public final Long getId() {
320 return id;
321 }
322
323 /***
324 * Get the location where this collection's index is kept on disk.
325 *
326 * @return the indexDir, possibly null.
327 */
328 public final File getIndexDir() {
329 return indexDir;
330 }
331
332 /***
333 * 'Calculates' the directory where the index of this collection is stored
334 * on disk. If the indexDir is not set for this Collection, the name of this
335 * collection is used, possibly prepended with the baseDir retrieved from
336 * the manager.
337 *
338 * @return The directory where the index of this collection is stored on
339 * disk, never null
340 */
341 public final File getIndexDirWithManagerDefaults() {
342 if ((indexDir == null) || "".equals(indexDir.toString())) {
343 if (manager != null) {
344
345
346 return new File(manager.getIndexBaseDir(), name);
347 } else {
348 log.warn("Manager for " + name + " should not be null");
349 return new File(name, "index");
350 }
351 }
352
353 return indexDir;
354 }
355
356 /***
357 * Return the date of the last Index.
358 *
359 * @return date of last Index, may return null
360 */
361 public final Date getLastIndexed() {
362 return lastIndexed;
363 }
364
365 /***
366 * Get the collection's manager.
367 *
368 * @todo remove this dependency to service layer.
369 *
370 * @return Reference to the CollectionManager holding this Collection.
371 */
372 public final CollectionManager getManager() {
373 return manager;
374 }
375
376 /***
377 * Gets the cache of MD5 hashes of all documents (previously) indexed.
378 *
379 * @return HashSet containing hashes of all documents (previously) indexed
380 */
381 public final Set getMd5DocumentCache() {
382 return md5DocumentCache;
383 }
384
385 /***
386 * Get the name of this collection.
387 *
388 * @return name of collection
389 */
390 public final String getName() {
391 return name;
392 }
393
394 /***
395 * Get the number of documents in this collection. The number is not
396 * calculated, but stored after indexing process, so it is a cheap
397 * operation.
398 *
399 * @return number of documents in collection
400 */
401 public final int getNumberOfDocs() {
402 if (isIndexingInProgress()) {
403 IndexReader index = null;
404 try {
405 File thisIndex = getIndexDirWithManagerDefaults();
406 index = IndexReader.open(thisIndex);
407
408 if (index != null) {
409 return index.numDocs();
410 }
411 } catch (IOException e) {
412 log
413 .warn("Error getting index for collection '" + name
414 + "'", e);
415 } finally {
416 if (index != null) {
417 try {
418 index.close();
419 } catch (IOException e1) {
420 log.error("Error closing index for collection " + name,
421 e1);
422 }
423 }
424 }
425 }
426 return numberOfDocs;
427 }
428
429 /***
430 * Gets the URL where this collection's documents can be retrieved.
431 *
432 * @return the url
433 */
434 public final String getUrl() {
435 return url;
436 }
437
438 /***
439 * Determines the URL of the collection.
440 * <p>
441 * The URL maps the contentDir to another location. e.g. A document
442 * 'ldap.pdf' in contentDir 'e:\collection\books\' with an URL of
443 * 'http://search.company.com/books/' will be returned in a search result as
444 * <code>http://search.company.com/books/ldap.pdf</code>
445 * </p>
446 *
447 * @return the URL of the collection as a String, possibly null in the
448 * exeptional case where there is no contentDir
449 */
450 public final String getUrlDefault() {
451 if (StringUtils.hasLength(url)) {
452 if (url.endsWith("/")) {
453 return url;
454 } else {
455 return url + "/";
456 }
457 } else {
458 if (contentDir != null) {
459 return "file://" + contentDir.toURI().getPath();
460 } else {
461 log.warn("Collection " + name + " does not have a contentDir.");
462 return null;
463 }
464 }
465 }
466
467 /***
468 * Return the version of the Index.
469 *
470 * @return version of last Index
471 */
472 public final long getVersion() {
473 return version;
474 }
475
476 /***
477 * Index the given Collection.
478 *
479 * @param fullIndex
480 * indicated whether a full or incremental index should be
481 * created
482 * @throws IndexException
483 * if the Collections can not be indexed
484 */
485 public abstract void index(final boolean fullIndex) throws IndexException;
486
487 /***
488 * Index the given Collection in a background thread. Stops the indexing if
489 * already running.
490 *
491 * @param fullIndex
492 * indicated whether a full or incremental index should be
493 * created
494 * @throws IndexException
495 * if the Collections can not be indexed
496 */
497 public final void indexInThread(final boolean fullIndex)
498 throws IndexException {
499 if (isIndexingInProgress()) {
500 log.warn("Collection " + name
501 + " is already being indexed, now stopping");
502 stopRequest();
503 return;
504 }
505 stopRequested = false;
506 Runnable r = new Runnable() {
507 public void run() {
508 try {
509 index(fullIndex);
510 } catch (Exception x) {
511
512 log
513 .error(
514 "Can't succesfully finish background indexing process",
515 x);
516 }
517 }
518 };
519 if (fullIndex) {
520
521 numberOfDocs = 0;
522 lastIndexed = new Date();
523 }
524 indexingThread = new Thread(r);
525 indexingThread.setName(name + "IndexingThread");
526 if (manager.getPriority() != null) {
527 indexingThread.setPriority(manager.getPriority().intValue());
528 } else {
529 indexingThread.setPriority(Thread.NORM_PRIORITY);
530 }
531 indexingThread.start();
532 }
533
534 /***
535 * Initialize this collection by getting its index. It retrieves the number
536 * of documents and the MD5 hash of all documents in the collection.
537 *
538 * If the index does not exist (this is a new Collection) just return.
539 *
540 * @throws IndexException
541 * when existing index of Collection can not be succesfully
542 * opened.
543 */
544 public final void init() throws IndexException {
545 log.debug("Initializing collection " + name);
546 IndexReader index = null;
547
548 setExistsOnDisk();
549
550
551 if (md5DocumentCache == null) {
552 md5DocumentCache = new HashSet();
553 }
554
555
556 if (archiveCache == null) {
557 archiveCache = new HashSet();
558 }
559 if (!isIndexValid()) {
560 log.info("Index does not exist (yet) for collection '" + name
561 + "'. Possibly new collection.");
562 numberOfDocs = 0;
563 return;
564 }
565
566
567 File thisIndex = getIndexDirWithManagerDefaults();
568 try {
569 index = IndexReader.open(thisIndex);
570
571 if (index != null) {
572 numberOfDocs = index.numDocs();
573
574 md5DocumentCache.clear();
575 for (int i = 0; i < numberOfDocs; i++) {
576 Document d = index.document(i);
577 String hashValue = d.get("hash");
578 md5DocumentCache.add(hashValue);
579 }
580
581 version = IndexReader.getCurrentVersion(thisIndex);
582
583 lastIndexed = new Date(IndexReader.lastModified(thisIndex));
584 log.debug("Collection " + name + " has " + numberOfDocs
585 + " documents, index created at: " + lastIndexed);
586 } else {
587 log
588 .error("Index could not be retrieved for collection "
589 + name);
590 }
591 } catch (IOException e) {
592 throw new IndexException("Error initializing collection '" + name
593 + "'", e);
594 } finally {
595 if (index != null) {
596 try {
597 index.close();
598 } catch (IOException e1) {
599 log.error("Error closing index for collection " + name, e1);
600 }
601 } else {
602 numberOfDocs = 0;
603 version = 0;
604 lastIndexed = null;
605 }
606 }
607 }
608
609 /***
610 * Returns whether the collection exists on disk. It does not actually
611 * determine that.
612 *
613 * @return true if existsOnDisk
614 */
615 public final boolean isExistsOnDisk() {
616 return existsOnDisk;
617 }
618
619 /***
620 * Indicates whether any indexing is going on.
621 *
622 * @return true if so.
623 */
624 public final boolean isIndexingInProgress() {
625 if (indexingThread == null) {
626 return false;
627 }
628 return indexingThread.isAlive();
629 }
630
631 /***
632 * Check whether the index of this collection is valid. An index is valid
633 * when the directory exists and there is an index in it.
634 *
635 * @return true if the index is valid, otherwise false.
636 *
637 * @throws IndexException
638 * when existing index of Collection can not be succesfully
639 * opened.
640 */
641 public final boolean isIndexValid() throws IndexException {
642 File file = this.getIndexDirWithManagerDefaults();
643
644 if ((file == null) || !IndexReader.indexExists(file)) {
645 log.warn("Index '" + file + "' not valid for collection '" + name
646 + "'.");
647
648 return false;
649 }
650
651 return true;
652 }
653
654 /***
655 * Returns whether the cache containing archive's contents should be kept
656 * after being indexed.
657 *
658 * @return true if so.
659 */
660 public final boolean isKeepCache() {
661 return keepCache;
662 }
663
664 /***
665 * Determines whether the cache containing archive's contents should be kept
666 * after being indexed. It does so by retrieving the defaults from the
667 * manager if needed.
668 *
669 * @return true if so.
670 */
671 public final boolean isKeepCacheWithManagerDefaults() {
672 if (isKeepCacheSet) {
673 return keepCache;
674 } else {
675 if (manager != null) {
676
677 return manager.isKeepCache();
678 }
679 }
680 return false;
681 }
682
683 /***
684 * Indicates whether collection is just instantiated and has no id yet.
685 *
686 * @return true if has no id yet
687 */
688 public final boolean isNew() {
689 return id == null;
690 }
691
692 /***
693 * Sets analyzer and creates an Analyzer object as specified by the given
694 * String.
695 *
696 * @param analyzerClassName
697 * the name of the class. The actual class needs to be available
698 * on the classpath.
699 */
700 public final void setAnalyzer(final String analyzerClassName) {
701 try {
702 if (StringUtils.hasText(analyzerClassName)) {
703 analyzer = analyzerClassName;
704
705 Class c = Class.forName(analyzerClassName);
706
707 if (c != null) {
708 log.debug("Returning Analyzer: '" + analyzerClassName + "'");
709 analyzerObject = (Analyzer) c.newInstance();
710 }
711 }
712 } catch (InstantiationException e1) {
713 log.warn("Can not initiate Analyzer '" + analyzerClassName, e1);
714 } catch (IllegalAccessException e1) {
715 log.warn("Can not access Analyzer " + analyzerClassName, e1);
716 } catch (ClassNotFoundException e) {
717 log.warn("Class not found: " + analyzerClassName, e);
718 }
719 }
720
721 /***
722 * Sets the cacheDir of the collection.
723 *
724 * The cache is used to (temporarily) store expanded content, such as zip
725 * files.
726 *
727 * @param thisCacheDir
728 * The directory where the cache of this collection is stored on
729 * disk.
730 */
731 public final void setCacheDir(final File thisCacheDir) {
732 cacheDir = thisCacheDir;
733 }
734
735 /***
736 * Sete the cacheUrl of the collection.
737 *
738 * The URL is the prefix that corresponds to the cacheDir. e.g. A document
739 * 'ldap.pdf' in cacheDir 'd:\temp\books\cache\' can be returned in a search
740 * result as http://search.company.com/books/cache/ldap.pdf The URL in this
741 * case is the prefix 'http://search.company.com/books/cache/'
742 *
743 * @param theCacheURL
744 * the URL of this collection's cache.
745 */
746 public final void setCacheUrl(final String theCacheURL) {
747 cacheUrl = theCacheURL;
748 }
749
750 /***
751 * Sets the content directory of the collection, and checks whether the
752 * contentDir actually exists by setting existsOnDisk.
753 *
754 * @param theContentDir
755 * directory where collection sits on disk.
756 */
757 public final void setContentDir(final File theContentDir) {
758 contentDir = theContentDir;
759
760
761 setExistsOnDisk();
762
763 if (!isExistsOnDisk()) {
764 log.warn("Set contentDir for collection '" + name
765 + "' at non-existing: " + contentDir);
766 } else {
767 log.info("Set contentDir for collection '" + name + "' at: "
768 + contentDir);
769 }
770 }
771
772 /***
773 * Set the description of the collection.
774 *
775 * @param theDescription
776 * description for collection
777 */
778 public final void setDescription(final String theDescription) {
779 description = theDescription;
780 }
781
782 /***
783 * Sets existsOnDisk based on whether the collection (contentDir) actually
784 * (now) sits on disk.
785 *
786 * @todo the whole existsOnDisk construction is a little funny, refactor
787 * some time
788 */
789 protected abstract void setExistsOnDisk();
790
791 /***
792 * Set the id of the collection. The id is used by the collectionManager to
793 * add and retrieve collections.
794 *
795 * @param theId
796 * the Id
797 */
798 public final void setId(final Long theId) {
799 id = theId;
800 }
801
802 /***
803 * Set the indexDir of this collection. The indexDir is the directory where
804 * the index is stored.
805 *
806 * <p>
807 * e.g. d:\temp\zilverline\index
808 * </p>
809 *
810 * @param theIndexDir
811 * path to the index
812 */
813 public final void setIndexDir(final File theIndexDir) {
814 indexDir = theIndexDir;
815 }
816
817 /***
818 * Set whether the cache should be deleted after indexing this collection.
819 * In the meantime mark this attribute as being set externally, to
820 * distinguish from the default value of a boolean.
821 *
822 * @param b
823 * true or false
824 */
825 public final void setKeepCache(final boolean b) {
826 keepCache = b;
827 isKeepCacheSet = true;
828 }
829
830 /***
831 * Set the collectionManager.
832 *
833 * @param thisManager
834 * The CollectionManager holding this collection.
835 */
836 public final void setManager(final CollectionManager thisManager) {
837 this.manager = thisManager;
838 }
839
840 /***
841 * Set the name of the Collection.
842 *
843 * @param theName
844 * name for collection
845 */
846 public final void setName(final String theName) {
847 name = theName;
848 }
849
850 /***
851 * Set the URL of this Collection. The URL is the prefix that corresponds to
852 * the contentDir. e.g. A document 'ldap.pdf' in contentDir
853 * 'e:\collection\books\' can be returned in a search result as
854 * http://search.company.com/books/ldap.pdf The URL in this case is the
855 * prefix 'http://search.company.com/books/'
856 *
857 * @param theURL
858 * the URL of this collection.
859 */
860 public final void setUrl(final String theURL) {
861 url = theURL;
862 }
863
864 /***
865 * Stop the indexing thread.
866 */
867 public final void stopRequest() {
868 stopRequested = true;
869 indexingThread.interrupt();
870 }
871
872 /***
873 * Resets the cache by deleting all keys, and removing documents form cache.
874 *
875 * @param fullIndex
876 * @throws IndexException
877 */
878 protected void resetCache(final boolean fullIndex) throws IndexException {
879
880
881 if (this.getMd5DocumentCache() == null) {
882 this.init();
883 }
884 if (fullIndex) {
885 this.getMd5DocumentCache().clear();
886
887
888 if ((this.getCacheDirWithManagerDefaults() != null)
889 && this.getCacheDirWithManagerDefaults().exists()) {
890 log.debug("Removing cache of " + this.getName());
891
892 boolean success = FileUtils.removeDir(this
893 .getCacheDirWithManagerDefaults());
894
895 if (!success) {
896 log
897 .warn("Could not entirely delete cache prior to indexing '"
898 + this.getContentDir()
899 + "'. \n\tThis may result in an inconsistent index, as some cache may not be inaccessible, "
900 + "\n\tor old parts of cache included in new index.");
901 }
902 }
903 }
904 }
905
906 /***
907 * Collections are equal if their id are.
908 *
909 * @see java.lang.Object#equals(java.lang.Object)
910 */
911 public boolean equals(Object obj) {
912 if (obj instanceof AbstractCollection) {
913 AbstractCollection thatCollection = ((AbstractCollection) obj);
914 if (this.getId() == null) {
915 return thatCollection.getId() == null;
916 }
917 return thatCollection.getId().equals(this.getId());
918 }
919 return false;
920 }
921
922 public int hashCode() {
923
924 return this.getId().hashCode();
925 }
926
927 }