1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package org.zilverline.service;
23
24 import java.io.BufferedInputStream;
25 import java.io.BufferedOutputStream;
26 import java.io.File;
27 import java.io.FileOutputStream;
28 import java.io.IOException;
29 import java.util.ArrayList;
30 import java.util.Enumeration;
31 import java.util.Iterator;
32 import java.util.List;
33 import java.util.zip.ZipEntry;
34 import java.util.zip.ZipFile;
35
36 import org.apache.commons.logging.Log;
37 import org.apache.commons.logging.LogFactory;
38 import org.apache.lucene.analysis.Analyzer;
39 import org.apache.lucene.analysis.standard.StandardAnalyzer;
40 import org.springframework.util.StringUtils;
41 import org.zilverline.core.DocumentCollection;
42 import org.zilverline.core.ExtractorFactory;
43 import org.zilverline.core.FileSystemCollection;
44 import org.zilverline.core.Handler;
45 import org.zilverline.core.IndexException;
46 import org.zilverline.dao.CollectionManagerDAO;
47 import org.zilverline.dao.DAOException;
48 import org.zilverline.util.FileUtils;
49 import org.zilverline.util.SysUtils;
50
51 /***
52 * The CollectionManagerImpl holds all collections, and base values for them.
53 *
54 * <p>
55 * NB. This Bean gets instantiated in web-servlet.xml.
56 * </p>
57 *
58 * @author Michael Franken
59 * @version $Revision: 1.28 $
60 *
61 * @see org.zilverline.core.FileSystemCollection
62 */
63 public class CollectionManagerImpl implements CollectionManager {
64
65 /***
66 * @return Returns the analyzer.
67 */
68 public String getAnalyzer() {
69 return analyzer;
70 }
71
72 /***
73 * Gets a collection by id.
74 *
75 * @param theId The id of the collection
76 *
77 * @return Collection or null if not found
78 */
79 public DocumentCollection getCollection(final Long theId) {
80 Iterator li = collections.iterator();
81 while (li.hasNext()) {
82 DocumentCollection c = (DocumentCollection) li.next();
83 if (theId.equals(c.getId())) {
84 return c;
85 }
86 }
87 return null;
88 }
89
90 /***
91 * DAO object taking care of persistence for CollectionManager and its collections.
92 */
93 private transient CollectionManagerDAO dao;
94
95 /***
96 * MergeFactor for indexing process.
97 */
98 private Integer mergeFactor;
99
100 /***
101 * minMergeDocs for indexing process.
102 */
103 private Integer minMergeDocs;
104
105 /***
106 * maxMergeDocs for indexing process.
107 */
108 private Integer maxMergeDocs;
109
110 /***
111 * priority for indexing process.
112 */
113 private Integer priority = new Integer(2);
114
115 /***
116 * @return Returns the dao.
117 */
118 public CollectionManagerDAO getDao() {
119 return dao;
120 }
121
122 /***
123 * Set the DAO for this CollectionManager.
124 *
125 * @param thisDao The dao to set.
126 */
127 public void setDao(final CollectionManagerDAO thisDao) {
128 this.dao = thisDao;
129 }
130
131 /*** logger for Commons logging. */
132 private static Log log = LogFactory.getLog(CollectionManagerImpl.class);
133
134 /***
135 * String representation of Analyzer. Stored to present to user for selection.
136 */
137 private String analyzer = "org.apache.lucene.analysis.standard.StandardAnalyzer";
138
139 /*** Array containing all available Analyzers. */
140 private transient String[] allAnalyzers;
141
142 /***
143 * The Analyzer to be used in indexing and searching. StandardAnalyzer by default.
144 */
145 private transient Analyzer analyzerObject = new StandardAnalyzer();
146
147 /***
148 * The default cache base directory for all collections. The cache is the directory on disk where zipped content is unzipped for
149 * indexing. By default use WEB-INF/cache
150 */
151 private File cacheBaseDir = new File(new File(this.getClass().getResource("/").getFile()).getParentFile(), "cache");
152
153 /*** The set of collections this CollectionManagerImpl manages. */
154
155 private List collections = new ArrayList();
156
157 /***
158 * The default index base directory for all collections. The index is the directory on disk where a Lucene index is stored. By
159 * default use WEB-INF/index
160 *
161 * @see org.apache.lucene.index.IndexReader
162 */
163 private File indexBaseDir = new File(new File(this.getClass().getResource("/").getFile()).getParentFile(), "index");
164
165 /*** The default for all collections whether to keep cache dir after indexing. */
166 private boolean keepCache = false;
167
168 /***
169 * The handler of archives, contains mappings for file extension to unarchiving programs.
170 */
171 private Handler archiveHandler;
172
173 /***
174 * Factory containing Extractor mapppings.
175 */
176 private ExtractorFactory factory = new ExtractorFactory();
177
178 /***
179 * Array containing all Extractors by name.
180 */
181 private transient String[] allExtractors;
182
183 /***
184 * Add or updates collection to list of collections, and sets the manager.
185 *
186 * Update occurs when a collection with the same id is found in the collections. Assigns an ID to a new Collection.
187 *
188 * @param col Collection containing documents
189 */
190 public void addCollection(final DocumentCollection col) {
191 long maxId = 0L;
192
193 if (col.getId() != null) {
194
195 for (int i = 0; i < collections.size(); i++) {
196 DocumentCollection thisCollection = (DocumentCollection) collections.get(i);
197 maxId = Math.max(maxId, thisCollection.getId().longValue());
198 log.debug("max ID: " + maxId);
199 if (col.getId().equals(thisCollection.getId())) {
200 log.debug("Updating collection " + col.getId() + ", " + col.getName() + " to Manager at location " + i);
201 collections.set(i, col);
202 thisCollection.setManager(this);
203 return;
204 }
205 }
206 } else {
207 for (int i = 0; i < collections.size(); i++) {
208 DocumentCollection thisCollection = (DocumentCollection) collections.get(i);
209 maxId = Math.max(maxId, thisCollection.getId().longValue());
210 log.debug("max ID: " + maxId);
211 }
212 }
213
214 col.setId(new Long(maxId + 1L));
215 log.debug("Adding collection " + col.getId() + ", " + col.getName() + " to Manager at end");
216 collections.add(col);
217 col.setManager(this);
218 }
219
220 /***
221 * Deletes collection from list of collections.
222 *
223 * @param col Collection containing documents
224 */
225 public void deleteCollection(final DocumentCollection col) {
226 for (int i = 0; i < collections.size(); i++) {
227 DocumentCollection thisCollection = (DocumentCollection) collections.get(i);
228 if (col.getId().equals(thisCollection.getId())) {
229 collections.remove(i);
230 break;
231 }
232 }
233 }
234
235 /***
236 * Indicates whether any indexing is going on.
237 *
238 * @return true if so.
239 */
240 public boolean isIndexingInProgress() {
241 for (int i = 0; i < collections.size(); i++) {
242 DocumentCollection thisCollection = (DocumentCollection) collections.get(i);
243 if (thisCollection.isIndexingInProgress()) {
244 return true;
245 }
246 }
247 return false;
248 }
249
250 /***
251 * Store the CollectionManager to store.
252 *
253 * @throws IndexException when collectionManager can not be saved to underlying store
254 */
255 public void store() throws IndexException {
256 if (dao != null) {
257 try {
258 dao.store(this);
259 }
260 catch (DAOException e) {
261 throw new IndexException("Can not save IndexService", e);
262 }
263 } else {
264 log.error("No DAO set for IndexService");
265 }
266 }
267
268 /***
269 * Returns an Analyzer for this collection based on configuration.
270 *
271 * @return the Analyzer used to index and search this collection
272 */
273
274
275
276 public Analyzer createAnalyzer() {
277 return analyzerObject;
278 }
279
280 /***
281 * Get the cache base directory.
282 *
283 * @return String the directory where the cache sits
284 */
285 public File getCacheBaseDir() {
286 return cacheBaseDir;
287 }
288
289 /***
290 * Gets a collection by name.
291 *
292 * @param theName The name of the collection
293 *
294 * @return Collection or null if not found
295 */
296 public DocumentCollection getCollectionByName(final String theName) {
297 if (theName == null) {
298 return null;
299 }
300
301 Iterator li = collections.iterator();
302
303 while (li.hasNext()) {
304 DocumentCollection c = (DocumentCollection) li.next();
305
306 if (theName.equals(c.getName())) {
307 return c;
308 }
309 }
310
311 return null;
312 }
313
314 /***
315 * Get all collections.
316 *
317 * @return collections List of collections
318 */
319 public List getCollections() {
320 return collections;
321 }
322
323 /***
324 * Get the base directory for the index.
325 *
326 * @return the directory
327 */
328 public File getIndexBaseDir() {
329 return indexBaseDir;
330 }
331
332 /***
333 * Initializes all collections.
334 *
335 * @throws IndexException if one of the collections can not be initialized.
336 */
337 public void init() throws IndexException {
338 allExtractors = ExtractorFactory.findExtractorsOnClasspath();
339 allAnalyzers = Handler.findAnalyzersOnClasspath();
340
341 CollectionManager thatManager = dao.load();
342 if (thatManager != null) {
343 this.cacheBaseDir = thatManager.getCacheBaseDir();
344 this.indexBaseDir = thatManager.getIndexBaseDir();
345 this.keepCache = thatManager.isKeepCache();
346 this.analyzer = thatManager.getAnalyzer();
347 this.archiveHandler = thatManager.getArchiveHandler();
348 this.factory = thatManager.getFactory();
349 this.priority = thatManager.getPriority();
350 this.mergeFactor = thatManager.getMergeFactor();
351 this.maxMergeDocs = thatManager.getMaxMergeDocs();
352 this.minMergeDocs = thatManager.getMinMergeDocs();
353
354
355 collections.clear();
356 Iterator li = thatManager.getCollections().iterator();
357 try {
358 while (li.hasNext()) {
359 DocumentCollection c = (DocumentCollection) li.next();
360 log.debug("Adding collection to manager: " + c.getName());
361 this.addCollection(c);
362 c.init();
363 }
364 }
365 catch (IndexException e) {
366 throw new IndexException("Error initializing all indexes in CollectionManagerImpl", e);
367 }
368 } else {
369
370 setFactory(new ExtractorFactory());
371 setArchiveHandler(new Handler());
372
373 if (getCollections().isEmpty()) {
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401 }
402 }
403 }
404
405 /***
406 * The default for all collections whether to keep cache dir after indexing.
407 *
408 * @return whether to keep the cache or not.
409 */
410 public boolean isKeepCache() {
411 return keepCache;
412 }
413
414 /***
415 * Create an Analyzer as specified by the given String.
416 *
417 * @param analyzerClassName the name of the class. The class needs to be available on the classpath.
418 */
419 public void setAnalyzer(final String analyzerClassName) {
420 try {
421 if (analyzerClassName != null) {
422 analyzer = analyzerClassName;
423
424 Class c = Class.forName(analyzerClassName);
425
426 if (c != null) {
427 log.debug("Returning Analyzer: " + analyzerClassName);
428 analyzerObject = (Analyzer) c.newInstance();
429 }
430 }
431 }
432 catch (InstantiationException e1) {
433 log.debug("Can not initiate Analyzer '" + analyzerClassName, e1);
434 }
435 catch (IllegalAccessException e1) {
436 log.debug("Can not access Analyzer " + analyzerClassName, e1);
437 }
438 catch (ClassNotFoundException e) {
439 log.debug("Class not found: " + analyzerClassName, e);
440 }
441 }
442
443 /***
444 * The default cache base directory for all collections. The cache is the directory on disk where zipped content is unzipped for
445 * indexing.
446 *
447 * @param thisDir the directory on disk
448 */
449 public void setCacheBaseDir(final File thisDir) {
450 cacheBaseDir = thisDir;
451 }
452
453 /***
454 * The default index base directory for all collections. The index is the directory on disk where a Lucene index is stored.
455 *
456 * @param thisDir the directory on disk
457 *
458 * @see org.apache.lucene.index.IndexReader
459 */
460 public void setIndexBaseDir(final File thisDir) {
461 indexBaseDir = thisDir;
462 }
463
464 /***
465 * Indicates whether a Collection cache should be kept after indexing. The value of this CollectionManagerImpl functions as
466 * default for all Collections.
467 *
468 * @param b keep cache or not.
469 */
470 public void setKeepCache(final boolean b) {
471 keepCache = b;
472 }
473
474 /***
475 * @return Returns the allAnalyzers.
476 */
477 public String[] getAllAnalyzers() {
478 return allAnalyzers;
479 }
480
481 /***
482 * get the ArchiveHandler, which contains the mappings for unArchiving archives.
483 *
484 * @return object containing mappings for handling archives
485 */
486 public Handler getArchiveHandler() {
487 return archiveHandler;
488 }
489
490 /***
491 * @return Returns the factory.
492 */
493 public ExtractorFactory getFactory() {
494 return factory;
495 }
496
497 /***
498 * Set the ArchiveHandler.
499 *
500 * @param handler object containing mappings for handling archives
501 */
502 public void setArchiveHandler(final Handler handler) {
503 archiveHandler = handler;
504 }
505
506 /***
507 * @param thatFactory The factory to set.
508 */
509 public void setFactory(final ExtractorFactory thatFactory) {
510 this.factory = thatFactory;
511 }
512
513 /***
514 * Expands Archives to disk. This is used is 'on-the-fly' extraction from cache
515 *
516 * @param col the Collection to which cache this archive is extracted
517 * @param zip the archive or directory that might contain archives
518 *
519 * @return true if archive(s) could be extracted
520 *
521 * @throws IndexException on error
522 *
523 * @see org.zilverline.web.CacheController
524 */
525 public boolean expandArchive(final FileSystemCollection col, final File zip) throws IndexException {
526 log.debug("getFromCache: document " + zip + " from : " + col.getName());
527
528 if (!zip.exists()) {
529 log.warn(zip + " does not exist.");
530
531 return false;
532 }
533
534
535 if (zip.isDirectory()) {
536 File[] files = zip.listFiles();
537
538 for (int i = 0; i < files.length; i++) {
539 expandArchive(col, files[i]);
540 }
541 } else {
542 String extension = FileUtils.getExtension(zip);
543
544 if ((archiveHandler != null) && archiveHandler.canUnPack(extension)) {
545
546 log.debug(zip + " is an archive");
547
548 File dir = null;
549
550 if (StringUtils.hasText(archiveHandler.getUnArchiveCommand(extension))) {
551
552 log.debug(zip + " is a zip file");
553 dir = unZip(zip, col);
554 } else {
555 log.debug(zip + " is a external archive file");
556 dir = unPack(zip, col);
557 }
558
559
560 log.debug("Recurse into " + dir);
561 File[] files = dir.listFiles();
562
563 for (int i = 0; i < files.length; i++) {
564 expandArchive(col, files[i]);
565 }
566
567 return true;
568 } else {
569 if (archiveHandler == null || archiveHandler.getMappings() == null) {
570 log.warn("Can't extract this type, no archiveHandler");
571 } else {
572 log.warn("Can't extract this type, not a supported extension: " + extension);
573 }
574 }
575 }
576
577 return false;
578 }
579
580 /***
581 * 'unpacks' a given archive file into cache directory with derived name. e.g. c:\temp\file.chm wil be unpacked into
582 * [cacheDir]\file_chm\.
583 *
584 * @param sourceFile the Archive file to be unpacked
585 * @param thisCollection the collection whose cache and contenDir is used
586 *
587 * @return File (new) directory containing unpacked file, null if unknown Archive
588 */
589 public File unPack(final File sourceFile, final FileSystemCollection thisCollection) {
590 File unPackDestinationDirectory = null;
591
592
593
594 if (archiveHandler == null) {
595
596 log.warn("No archiveHandler found while trying to unPack " + sourceFile);
597 return null;
598 }
599
600 String extension = FileUtils.getExtension(sourceFile);
601
602 if (!archiveHandler.canUnPack(extension)) {
603
604 log.warn("No archiveHandler found for " + sourceFile);
605 return null;
606 }
607
608
609 unPackDestinationDirectory = file2CacheDir(sourceFile, thisCollection);
610 log.debug("unpacking " + sourceFile + " into " + unPackDestinationDirectory);
611
612
613 String unArchiveCommand = archiveHandler.getUnArchiveCommand(extension);
614
615 if (SysUtils.execute(unArchiveCommand, sourceFile, unPackDestinationDirectory)) {
616 log.info("Executed: " + unArchiveCommand + " " + sourceFile + " in " + unPackDestinationDirectory);
617 } else {
618 log.warn("Can not execute " + unArchiveCommand + " " + sourceFile + " in " + unPackDestinationDirectory);
619 }
620
621
622 if (FileUtils.isIn(sourceFile, thisCollection.getCacheDirWithManagerDefaults())) {
623 sourceFile.delete();
624 }
625
626 return unPackDestinationDirectory;
627 }
628
629 /***
630 * Takes a file and creates a directory with a derived name in the cacheDir. If the file was not already in cache, and sits in
631 * contentDir it is mapped to cache, otherwise it stays within cache.
632 *
633 * <p>
634 * e.g. given cachedir <code>c:\temp\</code> and contentdir <code>e:\docs\Projects\lucene\content\</code>,
635 * <code>e:\docs\Projects\lucene\content\books.zip</code> yields <code>c:\temp\books_zip\</code>
636 * </p>
637 *
638 * <p>
639 * <code>c:\temp\books.zip</code> yields <code>c:\temp\books_zip\</code>
640 * </p>
641 *
642 * @param sourceFile the file to be used as name for the directory
643 * @param thisCollection the collection (not null) whose cache and contenDir is used
644 *
645 * @return File the (newly created) directory
646 */
647 public static File file2CacheDir(final File sourceFile, final FileSystemCollection thisCollection) {
648 log.debug("Entering file2Dir, with " + sourceFile + ", for collection:" + thisCollection.getName());
649
650 File unZipDestinationDirectory = null;
651
652 try {
653 File cacheDir = thisCollection.getCacheDirWithManagerDefaults();
654
655 if (!cacheDir.isDirectory()) {
656 if (!cacheDir.mkdirs()) {
657 log.warn("Can't create cache directory " + cacheDir);
658 return null;
659 }
660 }
661
662 String destinationDirectory = sourceFile.getCanonicalPath();
663
664
665 int index = destinationDirectory.lastIndexOf('.');
666 String extension;
667
668 if (index != -1) {
669 extension = destinationDirectory.substring(index + 1);
670 destinationDirectory = destinationDirectory.substring(0, index) + '_' + extension;
671 }
672
673
674 String collectionPath = thisCollection.getContentDir().getCanonicalPath();
675
676 if (destinationDirectory.startsWith(collectionPath)) {
677
678 String relativePath = destinationDirectory.substring(collectionPath.length());
679
680 unZipDestinationDirectory = new File(thisCollection.getCacheDirWithManagerDefaults(), relativePath);
681 log.debug("Mapped " + relativePath + " to cache: " + thisCollection.getCacheDirWithManagerDefaults());
682 } else {
683 unZipDestinationDirectory = new File(destinationDirectory);
684 }
685
686
687 boolean canCreate = unZipDestinationDirectory.mkdirs();
688
689 if (!canCreate) {
690 log.warn("Could not create: " + unZipDestinationDirectory);
691 }
692
693 log.debug("Created: " + unZipDestinationDirectory + " from File: " + sourceFile);
694 }
695 catch (Exception e) {
696 log.error("error creating directory from file: " + sourceFile, e);
697 }
698
699 return unZipDestinationDirectory;
700 }
701
702 /***
703 * unZips a given zip file into cache directory with derived name. e.g. c:\temp\file.zip wil be unziiped into
704 * [cacheDir]\file_zip\.
705 *
706 * @param sourceZipFile the ZIP file to be unzipped
707 * @param thisCollection the collection whose cache and contenDir is used
708 *
709 * @return File (new) directory containing zip file
710 */
711 public static File unZip(final File sourceZipFile, final FileSystemCollection thisCollection) {
712
713 final int aBUFFER = 2048;
714 File unzipDestinationDirectory = null;
715 ZipFile zipFile = null;
716 FileOutputStream fos = null;
717 BufferedOutputStream dest = null;
718 BufferedInputStream bis = null;
719
720 try {
721
722 unzipDestinationDirectory = file2CacheDir(sourceZipFile, thisCollection);
723 log.info("unzipping " + sourceZipFile + " into " + unzipDestinationDirectory);
724
725 zipFile = new ZipFile(sourceZipFile, ZipFile.OPEN_READ);
726
727 Enumeration zipFileEntries = zipFile.entries();
728 while (zipFileEntries.hasMoreElements()) {
729
730 ZipEntry entry = (ZipEntry) zipFileEntries.nextElement();
731 String currentEntry = entry.getName();
732 log.debug("Extracting: " + entry);
733 File destFile = new File(unzipDestinationDirectory, currentEntry);
734
735 File destinationParent = destFile.getParentFile();
736
737 destinationParent.mkdirs();
738
739 if (!entry.isDirectory()) {
740 bis = new BufferedInputStream(zipFile.getInputStream(entry));
741 int currentByte;
742
743 byte[] data = new byte[aBUFFER];
744
745 fos = new FileOutputStream(destFile);
746 dest = new BufferedOutputStream(fos, aBUFFER);
747
748 while ((currentByte = bis.read(data, 0, aBUFFER)) != -1) {
749 dest.write(data, 0, currentByte);
750 }
751 dest.flush();
752 dest.close();
753 bis.close();
754 }
755 }
756 zipFile.close();
757
758
759 if (FileUtils.isIn(sourceZipFile, thisCollection.getCacheDirWithManagerDefaults())) {
760 sourceZipFile.delete();
761 }
762 }
763 catch (Exception e) {
764 log.error("Can't unzip: " + sourceZipFile, e);
765 }
766 finally {
767 try {
768 if (fos != null) {
769 fos.close();
770 }
771 if (dest != null) {
772 dest.close();
773 }
774 if (bis != null) {
775 bis.close();
776 }
777 }
778 catch (IOException e1) {
779 log.error("Error closing files", e1);
780 }
781 }
782
783 return unzipDestinationDirectory;
784 }
785
786 /***
787 * @return Returns the allExtractors.
788 */
789 public String[] getAllExtractors() {
790 return allExtractors;
791 }
792
793 /***
794 * @return Returns the mergeFactor.
795 */
796 public Integer getMergeFactor() {
797 return mergeFactor;
798 }
799
800 /***
801 * @param mergeFactor The mergeFactor to set.
802 */
803 public void setMergeFactor(Integer mergeFactor) {
804 this.mergeFactor = mergeFactor;
805 }
806
807 /***
808 * @return Returns the priority.
809 */
810 public Integer getPriority() {
811 return priority;
812 }
813
814 /***
815 * @param priority The priority to set.
816 */
817 public void setPriority(Integer priority) {
818 this.priority = priority;
819 }
820
821 /***
822 * @return Returns the maxMergeDocs.
823 */
824 public Integer getMaxMergeDocs() {
825 return maxMergeDocs;
826 }
827
828 /***
829 * @param maxMergeDocs The maxMergeDocs to set.
830 */
831 public void setMaxMergeDocs(Integer maxMergeDocs) {
832 this.maxMergeDocs = maxMergeDocs;
833 }
834
835 /***
836 * @return Returns the minMergeDocs.
837 */
838 public Integer getMinMergeDocs() {
839 return minMergeDocs;
840 }
841
842 /***
843 * @param minMergeDocs The minMergeDocs to set.
844 */
845 public void setMinMergeDocs(Integer minMergeDocs) {
846 this.minMergeDocs = minMergeDocs;
847 }
848 }