View Javadoc

1   /*
2    * Copyright 2003-2005 Michael Franken, Zilverline.
3    *
4    * The contents of this file, or the files included with this file, are subject to
5    * the current version of ZILVERLINE Collaborative Source License for the
6    * Zilverline Search Engine (the "License"); You may not use this file except in
7    * compliance with the License.
8    *
9    * You may obtain a copy of the License at
10   *
11   *     http://www.zilverline.org.
12   *
13   * See the License for the rights, obligations and
14   * limitations governing use of the contents of the file.
15   *
16   * The Original and Upgraded Code is the Zilverline Search Engine. The developer of
17   * the Original and Upgraded Code is Michael Franken. Michael Franken owns the
18   * copyrights in the portions it created. All Rights Reserved.
19   *
20   */
21  
22  package org.zilverline.core;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.util.Date;
27  import java.util.HashSet;
28  import java.util.Set;
29  
30  import org.apache.commons.logging.Log;
31  import org.apache.commons.logging.LogFactory;
32  
33  import org.springframework.util.StringUtils;
34  
35  import org.apache.lucene.analysis.Analyzer;
36  import org.apache.lucene.document.Document;
37  import org.apache.lucene.index.IndexReader;
38  
39  import org.zilverline.service.CollectionManager;
40  import org.zilverline.util.FileUtils;
41  
42  /***
43   * AbstractCollection provides common implementation for all
44   * DocumentCollections.
45   * 
46   * @author Michael Franken
47   * @version $Revision: 1.12 $
48   */
49  public abstract class AbstractCollection implements DocumentCollection {
50  	/*** logger for Commons logging. */
51  	private static Log log = LogFactory.getLog(AbstractCollection.class);
52  
53  	/***
54  	 * String representation of Analyzer.
55  	 */
56  	protected String analyzer;
57  
58  	/*** The Analyzer to be used in indexing and searching. */
59  	protected transient Analyzer analyzerObject = null;
60  
61  	/***
62  	 * The archive cache is used to store the keys of archives that are
63  	 * extracted 'on-the-fly'.
64  	 */
65  	protected transient Set archiveCache;
66  
67  	/***
68  	 * The cacheDir is the directory this collection's cache is stored at.
69  	 * 
70  	 * <p>
71  	 * The cache is used to (temporarily) store expanded content, such as zip
72  	 * files.
73  	 * </p>
74  	 */
75  	protected File cacheDir;
76  
77  	/***
78  	 * The cacheUrl is the location this collection's cache (if any) is mapped
79  	 * to as a result of a search. e.g. d:\temp\cache\books\java could be mapped
80  	 * to https://server/cache/path
81  	 */
82  	protected String cacheUrl;
83  
84  	/***
85  	 * The contentDir is the directory this collection is stored at.
86  	 * <p>
87  	 * e.g. d:\books\java
88  	 * </p>
89  	 * 
90  	 * <p>
91  	 * The <code>contentDir</code> needs to point to an existing directory in
92  	 * order to be indexed.
93  	 * </p>
94  	 */
95  	protected File contentDir;
96  
97  	/*** Description of collection. */
98  	protected String description;
99  
100 	/*** Indicates whether a collection actually esists on disk. */
101 	protected transient boolean existsOnDisk;
102 
103 	/*** id indicates identity. Used for persistency and presentation. */
104 	protected Long id;
105 
106 	/***
107 	 * The indexDir is the directory where the index is stored.
108 	 * 
109 	 * <p>
110 	 * e.g. d:\temp\zilverline\index
111 	 * </p>
112 	 */
113 	protected File indexDir;
114 
115 	/***
116 	 * The thread used to index this collection.
117 	 * 
118 	 */
119 	protected transient Thread indexingThread;
120 
121 	/***
122 	 * Attribute used to find out whether <code>keepCache</code> has been set
123 	 * externally (using setter).
124 	 */
125 	protected transient boolean isKeepCacheSet;
126 
127 	/***
128 	 * Indicated whether the cache should be removed after indexing.
129 	 * 
130 	 * <p>
131 	 * If not, search results can return files in for instance zip files.
132 	 * </p>
133 	 */
134 	protected boolean keepCache;
135 
136 	/*** The date of the last index of this collection. */
137 	protected transient Date lastIndexed;
138 
139 	/*** Reference back to the collectionManager. */
140 	protected transient CollectionManager manager;
141 
142 	/*** This cache is used to store the MD5 keys of all indexed documents. */
143 	protected transient Set md5DocumentCache;
144 
145 	/*** Name of collection, also used as part of the name of index. */
146 	protected String name;
147 
148 	/***
149 	 * Variable used to possibly stop the indexing thread.
150 	 */
151 	protected transient boolean stopRequested;
152 
153 	/***
154 	 * Number of Documents in this collection. Can only be set by actually
155 	 * consulting the corresponding index
156 	 */
157 	protected transient int numberOfDocs;
158 
159 	/***
160 	 * The url is the location this collection is mapped to as a result of a
161 	 * search. e.g. d:\books\java could be mapped to https://server/path/java/
162 	 */
163 	protected String url;
164 
165 	/*** The version of the index of this collection. */
166 	protected transient long version;
167 
168 	/***
169 	 * Returns an Analyzer for this collection based on configuration.
170 	 * 
171 	 * @return the Analyzer used to index and search this collection
172 	 * @todo the analyzer setting and creation is a bit funny, refactor some
173 	 *       time
174 	 * @see Analyzer
175 	 */
176 	public final Analyzer createAnalyzer() {
177 		if (analyzerObject != null) {
178 			return analyzerObject;
179 		} else {
180 			return manager.createAnalyzer();
181 		}
182 	}
183 
184 	/***
185 	 * Determine whether the collection (contentDir) actually (now) exists on
186 	 * disk.
187 	 * 
188 	 * @return true if the collection exists
189 	 */
190 	public final boolean existsOnDisk() {
191 		setExistsOnDisk();
192 		return existsOnDisk;
193 	}
194 
195 	/***
196 	 * Get the Analyzer.
197 	 * 
198 	 * @return the Analyzer as String
199 	 * @see org.apache.lucene.analysis.Analyzer
200 	 */
201 	public final String getAnalyzer() {
202 		return analyzer;
203 	}
204 
205 	/***
206 	 * Gets the archive cache for this Collection.
207 	 * 
208 	 * <p>
209 	 * The archive cache is used to store the keys of archives that are
210 	 * extracted 'on-the-fly'
211 	 * </p>
212 	 * 
213 	 * @return HashSet containing archives that have been cached (so they have
214 	 *         been extracted)
215 	 */
216 	public final Set getArchiveCache() {
217 		return archiveCache;
218 	}
219 
220 	/***
221 	 * Get the location where this collection's cache is kept on disk.
222 	 * 
223 	 * @return Returns the cacheDir.
224 	 */
225 	public final File getCacheDir() {
226 		return cacheDir;
227 	}
228 
229 	/***
230 	 * Gets the directory where this collection's cache is stored. If the
231 	 * cacheDir is not set for this Collection, the name of this collection is
232 	 * used, possibly prepended with the (default) retrieved from the manager.
233 	 * The cache is used to (temporarily) store expanded content, such as zip
234 	 * files.
235 	 * 
236 	 * @return The directory where the cache of this collection is stored on
237 	 *         disk.
238 	 */
239 	public final File getCacheDirWithManagerDefaults() {
240 		if ((cacheDir == null) || "".equals(cacheDir.toString())) {
241 			if (manager != null) {
242 				// create a filename from the default index location and the
243 				// name of this collection
244 				return new File(manager.getCacheBaseDir(), name);
245 			} else {
246 				log.warn("Manager for " + name + " should not be null");
247 				return new File(name, "cache");
248 			}
249 		}
250 
251 		return cacheDir;
252 	}
253 
254 	/***
255 	 * Gets the URL where this collection's cached documents can be retrieved.
256 	 * 
257 	 * @return Returns the cacheUrl.
258 	 */
259 	public final String getCacheUrl() {
260 		return cacheUrl;
261 	}
262 
263 	/***
264 	 * The URL maps the cacheDir to another location.
265 	 * 
266 	 * <p>
267 	 * e.g. A document 'ldap.pdf' in cacheDir 'e:\collection\cache\books\' with
268 	 * an cacheURL of 'http://search.company.com/cachedBooks/' will be returned
269 	 * in a search result as
270 	 * <code>http://search.company.com/cachedBooks/ldap.pdf</code>
271 	 * </p>
272 	 * 
273 	 * @return the cacheUrl of the collection, or the cacheDir as URL if url is
274 	 *         null or empty.
275 	 */
276 	public final String getCacheUrlWithManagerDefaults() {
277 		if (StringUtils.hasLength(cacheUrl)) {
278 			if (cacheUrl.endsWith("/")) {
279 				return cacheUrl;
280 			} else {
281 				return cacheUrl + "/";
282 			}
283 		} else {
284 			return "file://"
285 					+ getCacheDirWithManagerDefaults().toURI().getPath();
286 		}
287 	}
288 
289 	/***
290 	 * Gets the location where this collection's documents can be retrieved.
291 	 * 
292 	 * @return contentDir directory of collection
293 	 */
294 	public final File getContentDir() {
295 		return contentDir;
296 	}
297 
	/**
	 * Gets the origin from where this collection's documents can be retrieved.
	 * To be supplied by the concrete collection type.
	 * 
	 * @return location such as e:/docs or InBox
	 */
	public abstract String getRoot();
304 
305 	/***
306 	 * Get the description of the collection.
307 	 * 
308 	 * @return description for the collection
309 	 */
310 	public final String getDescription() {
311 		return description;
312 	}
313 
314 	/***
315 	 * Get the id of the collection.
316 	 * 
317 	 * @return unique id, can be null
318 	 */
319 	public final Long getId() {
320 		return id;
321 	}
322 
323 	/***
324 	 * Get the location where this collection's index is kept on disk.
325 	 * 
326 	 * @return the indexDir, possibly null.
327 	 */
328 	public final File getIndexDir() {
329 		return indexDir;
330 	}
331 
332 	/***
333 	 * 'Calculates' the directory where the index of this collection is stored
334 	 * on disk. If the indexDir is not set for this Collection, the name of this
335 	 * collection is used, possibly prepended with the baseDir retrieved from
336 	 * the manager.
337 	 * 
338 	 * @return The directory where the index of this collection is stored on
339 	 *         disk, never null
340 	 */
341 	public final File getIndexDirWithManagerDefaults() {
342 		if ((indexDir == null) || "".equals(indexDir.toString())) {
343 			if (manager != null) {
344 				// create a filename from the default index location and the
345 				// name of this collection
346 				return new File(manager.getIndexBaseDir(), name);
347 			} else {
348 				log.warn("Manager for " + name + " should not be null");
349 				return new File(name, "index");
350 			}
351 		}
352 
353 		return indexDir;
354 	}
355 
356 	/***
357 	 * Return the date of the last Index.
358 	 * 
359 	 * @return date of last Index, may return null
360 	 */
361 	public final Date getLastIndexed() {
362 		return lastIndexed;
363 	}
364 
365 	/***
366 	 * Get the collection's manager.
367 	 * 
368 	 * @todo remove this dependency to service layer.
369 	 * 
370 	 * @return Reference to the CollectionManager holding this Collection.
371 	 */
372 	public final CollectionManager getManager() {
373 		return manager;
374 	}
375 
376 	/***
377 	 * Gets the cache of MD5 hashes of all documents (previously) indexed.
378 	 * 
379 	 * @return HashSet containing hashes of all documents (previously) indexed
380 	 */
381 	public final Set getMd5DocumentCache() {
382 		return md5DocumentCache;
383 	}
384 
385 	/***
386 	 * Get the name of this collection.
387 	 * 
388 	 * @return name of collection
389 	 */
390 	public final String getName() {
391 		return name;
392 	}
393 
394 	/***
395 	 * Get the number of documents in this collection. The number is not
396 	 * calculated, but stored after indexing process, so it is a cheap
397 	 * operation.
398 	 * 
399 	 * @return number of documents in collection
400 	 */
401 	public final int getNumberOfDocs() {
402 		if (isIndexingInProgress()) {
403 			IndexReader index = null;
404 			try {
405 				File thisIndex = getIndexDirWithManagerDefaults();
406 				index = IndexReader.open(thisIndex);
407 
408 				if (index != null) {
409 					return index.numDocs();
410 				}
411 			} catch (IOException e) {
412 				log
413 						.warn("Error getting index for collection '" + name
414 								+ "'", e);
415 			} finally {
416 				if (index != null) {
417 					try {
418 						index.close();
419 					} catch (IOException e1) {
420 						log.error("Error closing index for collection " + name,
421 								e1);
422 					}
423 				}
424 			}
425 		}
426 		return numberOfDocs;
427 	}
428 
429 	/***
430 	 * Gets the URL where this collection's documents can be retrieved.
431 	 * 
432 	 * @return the url
433 	 */
434 	public final String getUrl() {
435 		return url;
436 	}
437 
438 	/***
439 	 * Determines the URL of the collection.
440 	 * <p>
441 	 * The URL maps the contentDir to another location. e.g. A document
442 	 * 'ldap.pdf' in contentDir 'e:\collection\books\' with an URL of
443 	 * 'http://search.company.com/books/' will be returned in a search result as
444 	 * <code>http://search.company.com/books/ldap.pdf</code>
445 	 * </p>
446 	 * 
447 	 * @return the URL of the collection as a String, possibly null in the
448 	 *         exeptional case where there is no contentDir
449 	 */
450 	public final String getUrlDefault() {
451 		if (StringUtils.hasLength(url)) {
452 			if (url.endsWith("/")) {
453 				return url;
454 			} else {
455 				return url + "/";
456 			}
457 		} else {
458 			if (contentDir != null) {
459 				return "file://" + contentDir.toURI().getPath();
460 			} else {
461 				log.warn("Collection " + name + " does not have a contentDir.");
462 				return null;
463 			}
464 		}
465 	}
466 
467 	/***
468 	 * Return the version of the Index.
469 	 * 
470 	 * @return version of last Index
471 	 */
472 	public final long getVersion() {
473 		return version;
474 	}
475 
	/**
	 * Index this Collection. To be supplied by the concrete collection type;
	 * <code>indexInThread</code> runs this method on a background thread.
	 * 
	 * @param fullIndex
	 *            indicates whether a full or incremental index should be
	 *            created
	 * @throws IndexException
	 *             if the Collection can not be indexed
	 */
	public abstract void index(final boolean fullIndex) throws IndexException;
486 
487 	/***
488 	 * Index the given Collection in a background thread. Stops the indexing if
489 	 * already running.
490 	 * 
491 	 * @param fullIndex
492 	 *            indicated whether a full or incremental index should be
493 	 *            created
494 	 * @throws IndexException
495 	 *             if the Collections can not be indexed
496 	 */
497 	public final void indexInThread(final boolean fullIndex)
498 			throws IndexException {
499 		if (isIndexingInProgress()) {
500 			log.warn("Collection " + name
501 					+ " is already being indexed, now stopping");
502 			stopRequest();
503 			return;
504 		}
505 		stopRequested = false;
506 		Runnable r = new Runnable() {
507 			public void run() {
508 				try {
509 					index(fullIndex);
510 				} catch (Exception x) {
511 					// in case ANY exception slips through
512 					log
513 							.error(
514 									"Can't succesfully finish background indexing process",
515 									x);
516 				}
517 			}
518 		};
519 		if (fullIndex) {
520 			// update the info now so it is shown in user interface
521 			numberOfDocs = 0;
522 			lastIndexed = new Date();
523 		}
524 		indexingThread = new Thread(r);
525 		indexingThread.setName(name + "IndexingThread");
526 		if (manager.getPriority() != null) {
527 			indexingThread.setPriority(manager.getPriority().intValue());
528 		} else {
529 			indexingThread.setPriority(Thread.NORM_PRIORITY);
530 		}
531 		indexingThread.start();
532 	}
533 
534 	/***
535 	 * Initialize this collection by getting its index. It retrieves the number
536 	 * of documents and the MD5 hash of all documents in the collection.
537 	 * 
538 	 * If the index does not exist (this is a new Collection) just return.
539 	 * 
540 	 * @throws IndexException
541 	 *             when existing index of Collection can not be succesfully
542 	 *             opened.
543 	 */
544 	public final void init() throws IndexException {
545 		log.debug("Initializing collection " + name);
546 		IndexReader index = null;
547 		// Determine whether the collection exists on disk
548 		setExistsOnDisk();
549 		// check whether this collection has a cache for the MD5 hashes of
550 		// documents
551 		if (md5DocumentCache == null) {
552 			md5DocumentCache = new HashSet();
553 		}
554 		// check whether this collection has a cache for the MD5 hashes of
555 		// indexed archives
556 		if (archiveCache == null) {
557 			archiveCache = new HashSet();
558 		}
559 		if (!isIndexValid()) {
560 			log.info("Index does not exist (yet) for collection '" + name
561 					+ "'. Possibly new collection.");
562 			numberOfDocs = 0;
563 			return;
564 		}
565 
566 		// Get the index
567 		File thisIndex = getIndexDirWithManagerDefaults();
568 		try {
569 			index = IndexReader.open(thisIndex);
570 
571 			if (index != null) {
572 				numberOfDocs = index.numDocs();
573 				// retrieve all hashes of Documents from the cache
574 				md5DocumentCache.clear();
575 				for (int i = 0; i < numberOfDocs; i++) {
576 					Document d = index.document(i);
577 					String hashValue = d.get("hash");
578 					md5DocumentCache.add(hashValue);
579 				}
580 				// get some relevant information from the index
581 				version = IndexReader.getCurrentVersion(thisIndex);
582 				// deprecated, but needed
583 				lastIndexed = new Date(IndexReader.lastModified(thisIndex));
584 				log.debug("Collection " + name + " has " + numberOfDocs
585 						+ " documents, index created at: " + lastIndexed);
586 			} else {
587 				log
588 						.error("Index could not be retrieved for collection "
589 								+ name);
590 			}
591 		} catch (IOException e) {
592 			throw new IndexException("Error initializing collection '" + name
593 					+ "'", e);
594 		} finally {
595 			if (index != null) {
596 				try {
597 					index.close();
598 				} catch (IOException e1) {
599 					log.error("Error closing index for collection " + name, e1);
600 				}
601 			} else {
602 				numberOfDocs = 0;
603 				version = 0;
604 				lastIndexed = null;
605 			}
606 		}
607 	}
608 
609 	/***
610 	 * Returns whether the collection exists on disk. It does not actually
611 	 * determine that.
612 	 * 
613 	 * @return true if existsOnDisk
614 	 */
615 	public final boolean isExistsOnDisk() {
616 		return existsOnDisk;
617 	}
618 
619 	/***
620 	 * Indicates whether any indexing is going on.
621 	 * 
622 	 * @return true if so.
623 	 */
624 	public final boolean isIndexingInProgress() {
625 		if (indexingThread == null) {
626 			return false;
627 		}
628 		return indexingThread.isAlive();
629 	}
630 
631 	/***
632 	 * Check whether the index of this collection is valid. An index is valid
633 	 * when the directory exists and there is an index in it.
634 	 * 
635 	 * @return true if the index is valid, otherwise false.
636 	 * 
637 	 * @throws IndexException
638 	 *             when existing index of Collection can not be succesfully
639 	 *             opened.
640 	 */
641 	public final boolean isIndexValid() throws IndexException {
642 		File file = this.getIndexDirWithManagerDefaults();
643 
644 		if ((file == null) || !IndexReader.indexExists(file)) {
645 			log.warn("Index '" + file + "' not valid for collection '" + name
646 					+ "'.");
647 
648 			return false;
649 		}
650 
651 		return true;
652 	}
653 
654 	/***
655 	 * Returns whether the cache containing archive's contents should be kept
656 	 * after being indexed.
657 	 * 
658 	 * @return true if so.
659 	 */
660 	public final boolean isKeepCache() {
661 		return keepCache;
662 	}
663 
664 	/***
665 	 * Determines whether the cache containing archive's contents should be kept
666 	 * after being indexed. It does so by retrieving the defaults from the
667 	 * manager if needed.
668 	 * 
669 	 * @return true if so.
670 	 */
671 	public final boolean isKeepCacheWithManagerDefaults() {
672 		if (isKeepCacheSet) {
673 			return keepCache;
674 		} else {
675 			if (manager != null) {
676 				// the value was not set here, get it from manager.
677 				return manager.isKeepCache();
678 			}
679 		}
680 		return false;
681 	}
682 
683 	/***
684 	 * Indicates whether collection is just instantiated and has no id yet.
685 	 * 
686 	 * @return true if has no id yet
687 	 */
688 	public final boolean isNew() {
689 		return id == null;
690 	}
691 
692 	/***
693 	 * Sets analyzer and creates an Analyzer object as specified by the given
694 	 * String.
695 	 * 
696 	 * @param analyzerClassName
697 	 *            the name of the class. The actual class needs to be available
698 	 *            on the classpath.
699 	 */
700 	public final void setAnalyzer(final String analyzerClassName) {
701 		try {
702 			if (StringUtils.hasText(analyzerClassName)) {
703 				analyzer = analyzerClassName;
704 
705 				Class c = Class.forName(analyzerClassName);
706 
707 				if (c != null) {
708 					log.debug("Returning Analyzer: '" + analyzerClassName + "'");
709 					analyzerObject = (Analyzer) c.newInstance();
710 				}
711 			}
712 		} catch (InstantiationException e1) {
713 			log.warn("Can not initiate Analyzer '" + analyzerClassName, e1);
714 		} catch (IllegalAccessException e1) {
715 			log.warn("Can not access Analyzer " + analyzerClassName, e1);
716 		} catch (ClassNotFoundException e) {
717 			log.warn("Class not found: " + analyzerClassName, e);
718 		}
719 	}
720 
721 	/***
722 	 * Sets the cacheDir of the collection.
723 	 * 
724 	 * The cache is used to (temporarily) store expanded content, such as zip
725 	 * files.
726 	 * 
727 	 * @param thisCacheDir
728 	 *            The directory where the cache of this collection is stored on
729 	 *            disk.
730 	 */
731 	public final void setCacheDir(final File thisCacheDir) {
732 		cacheDir = thisCacheDir;
733 	}
734 
735 	/***
736 	 * Sete the cacheUrl of the collection.
737 	 * 
738 	 * The URL is the prefix that corresponds to the cacheDir. e.g. A document
739 	 * 'ldap.pdf' in cacheDir 'd:\temp\books\cache\' can be returned in a search
740 	 * result as http://search.company.com/books/cache/ldap.pdf The URL in this
741 	 * case is the prefix 'http://search.company.com/books/cache/'
742 	 * 
743 	 * @param theCacheURL
744 	 *            the URL of this collection's cache.
745 	 */
746 	public final void setCacheUrl(final String theCacheURL) {
747 		cacheUrl = theCacheURL;
748 	}
749 
750 	/***
751 	 * Sets the content directory of the collection, and checks whether the
752 	 * contentDir actually exists by setting existsOnDisk.
753 	 * 
754 	 * @param theContentDir
755 	 *            directory where collection sits on disk.
756 	 */
757 	public final void setContentDir(final File theContentDir) {
758 		contentDir = theContentDir;
759 
760 		// check whether contentDir is really a directory
761 		setExistsOnDisk();
762 
763 		if (!isExistsOnDisk()) {
764 			log.warn("Set contentDir for collection '" + name
765 					+ "' at non-existing: " + contentDir);
766 		} else {
767 			log.info("Set contentDir for collection '" + name + "' at: "
768 					+ contentDir);
769 		}
770 	}
771 
772 	/***
773 	 * Set the description of the collection.
774 	 * 
775 	 * @param theDescription
776 	 *            description for collection
777 	 */
778 	public final void setDescription(final String theDescription) {
779 		description = theDescription;
780 	}
781 
	/**
	 * Sets existsOnDisk based on whether the collection (contentDir) actually
	 * (now) sits on disk. To be supplied by the concrete collection type; it
	 * is called from <code>existsOnDisk()</code>, <code>init()</code> and
	 * <code>setContentDir()</code>.
	 * 
	 * @todo the whole existsOnDisk construction is a little funny, refactor
	 *       some time
	 */
	protected abstract void setExistsOnDisk();
790 
791 	/***
792 	 * Set the id of the collection. The id is used by the collectionManager to
793 	 * add and retrieve collections.
794 	 * 
795 	 * @param theId
796 	 *            the Id
797 	 */
798 	public final void setId(final Long theId) {
799 		id = theId;
800 	}
801 
802 	/***
803 	 * Set the indexDir of this collection. The indexDir is the directory where
804 	 * the index is stored.
805 	 * 
806 	 * <p>
807 	 * e.g. d:\temp\zilverline\index
808 	 * </p>
809 	 * 
810 	 * @param theIndexDir
811 	 *            path to the index
812 	 */
813 	public final void setIndexDir(final File theIndexDir) {
814 		indexDir = theIndexDir;
815 	}
816 
817 	/***
818 	 * Set whether the cache should be deleted after indexing this collection.
819 	 * In the meantime mark this attribute as being set externally, to
820 	 * distinguish from the default value of a boolean.
821 	 * 
822 	 * @param b
823 	 *            true or false
824 	 */
825 	public final void setKeepCache(final boolean b) {
826 		keepCache = b;
827 		isKeepCacheSet = true;
828 	}
829 
830 	/***
831 	 * Set the collectionManager.
832 	 * 
833 	 * @param thisManager
834 	 *            The CollectionManager holding this collection.
835 	 */
836 	public final void setManager(final CollectionManager thisManager) {
837 		this.manager = thisManager;
838 	}
839 
840 	/***
841 	 * Set the name of the Collection.
842 	 * 
843 	 * @param theName
844 	 *            name for collection
845 	 */
846 	public final void setName(final String theName) {
847 		name = theName;
848 	}
849 
850 	/***
851 	 * Set the URL of this Collection. The URL is the prefix that corresponds to
852 	 * the contentDir. e.g. A document 'ldap.pdf' in contentDir
853 	 * 'e:\collection\books\' can be returned in a search result as
854 	 * http://search.company.com/books/ldap.pdf The URL in this case is the
855 	 * prefix 'http://search.company.com/books/'
856 	 * 
857 	 * @param theURL
858 	 *            the URL of this collection.
859 	 */
860 	public final void setUrl(final String theURL) {
861 		url = theURL;
862 	}
863 
864 	/***
865 	 * Stop the indexing thread.
866 	 */
867 	public final void stopRequest() {
868 		stopRequested = true;
869 		indexingThread.interrupt();
870 	}
871 
872 	/***
873 	 * Resets the cache by deleting all keys, and removing documents form cache.
874 	 * 
875 	 * @param fullIndex
876 	 * @throws IndexException
877 	 */
878 	protected void resetCache(final boolean fullIndex) throws IndexException {
879 		// create a cache for MD5 hashes if there's not one yet (new collection,
880 		// first indexed)
881 		if (this.getMd5DocumentCache() == null) {
882 			this.init();
883 		}
884 		if (fullIndex) {
885 			this.getMd5DocumentCache().clear();
886 
887 			// make sure the cache is flushed
888 			if ((this.getCacheDirWithManagerDefaults() != null)
889 					&& this.getCacheDirWithManagerDefaults().exists()) {
890 				log.debug("Removing cache of " + this.getName());
891 
892 				boolean success = FileUtils.removeDir(this
893 						.getCacheDirWithManagerDefaults());
894 
895 				if (!success) {
896 					log
897 							.warn("Could not entirely delete cache prior to indexing '"
898 									+ this.getContentDir()
899 									+ "'. \n\tThis may result in an inconsistent index, as some cache may not be inaccessible, "
900 									+ "\n\tor old parts of cache included in new index.");
901 				}
902 			}
903 		}
904 	}
905 
906 	/***
907 	 * Collections are equal if their id are.
908 	 * 
909 	 * @see java.lang.Object#equals(java.lang.Object)
910 	 */
911 	public boolean equals(Object obj) {
912 		if (obj instanceof AbstractCollection) {
913 			AbstractCollection thatCollection = ((AbstractCollection) obj);
914 			if (this.getId() == null) {
915 				return thatCollection.getId() == null;
916 			}
917 			return thatCollection.getId().equals(this.getId());
918 		}
919 		return false;
920 	}
921 
922 	public int hashCode() {
923 
924 		return this.getId().hashCode();
925 	}
926 
927 }