View Javadoc

1   /*
2    * Copyright 2003-2004 Michael Franken, Zilverline.
3    *
4    * The contents of this file, or the files included with this file, are subject to
5    * the current version of ZILVERLINE Collaborative Source License for the
6    * Zilverline Search Engine (the "License"); You may not use this file except in
7    * compliance with the License.
8    *
9    * You may obtain a copy of the License at
10   *
11   *     http://www.zilverline.org.
12   *
13   * See the License for the rights, obligations and
14   * limitations governing use of the contents of the file.
15   *
16   * The Original and Upgraded Code is the Zilverline Search Engine. The developer of
17   * the Original and Upgraded Code is Michael Franken. Michael Franken owns the
18   * copyrights in the portions it created. All Rights Reserved.
19   *
20   */
21  
22  package org.zilverline.extractors;
23  
24  import java.io.CharArrayReader;
25  import java.io.CharArrayWriter;
26  import java.io.File;
27  import java.io.FileInputStream;
28  import java.io.InputStream;
29  import java.io.Reader;
30  import java.util.Iterator;
31  
32  import org.apache.poi.hssf.usermodel.HSSFCell;
33  import org.apache.poi.hssf.usermodel.HSSFRow;
34  import org.apache.poi.hssf.usermodel.HSSFSheet;
35  import org.apache.poi.hssf.usermodel.HSSFWorkbook;
36  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
37  
38  /***
39   * This class extracts text from MS Excel files by using the POI library.
40   * 
41   * @author Michael Franken
42   * @version $Revision: 1.6 $
43   */
44  public class ExcelExtractor extends AbstractExtractor {
45      /***
46       * Extract the content from the given Excel file. As a side effect the type is set too.
47       * 
48       * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
49       */
50      public final Reader getContent(final File f) {
51          Reader reader = null;
52  
53          setType("EXCEL");
54  
55          try {
56              CharArrayWriter writer = new CharArrayWriter();
57  
58              POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(f));
59              HSSFWorkbook workbook = new HSSFWorkbook(fs);
60  
61              for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
62                  HSSFSheet sheet = workbook.getSheetAt(i);
63  
64                  Iterator rows = sheet.rowIterator();
65                  while (rows.hasNext()) {
66                      HSSFRow row = (HSSFRow) rows.next();
67  
68                      Iterator cells = row.cellIterator();
69                      while (cells.hasNext()) {
70                          HSSFCell cell = (HSSFCell) cells.next();
71                          switch (cell.getCellType()) {
72                          case HSSFCell.CELL_TYPE_NUMERIC:
73                              String num = Double.toString(cell.getNumericCellValue()).trim();
74                              if (num.length() > 0) {
75                                  writer.write(num + " ");
76                              }
77                              break;
78                          case HSSFCell.CELL_TYPE_STRING:
79                              String text = cell.getStringCellValue().trim();
80                              if (text.length() > 0) {
81                                  writer.write(text + " ");
82                              }
83                              break;
84                          default: // skip
85                          }
86                      }
87                  }
88              }
89              setSummary(getSummaryFromContent(writer.toString()));
90  
91              return new CharArrayReader(writer.toCharArray());
92          }
93          catch (Exception e) {
94              log.warn("Can't extract contents for: " + f.getName(), e);
95          }
96  
97          return reader;
98      }
99  
100     /***
101      * Extract the content from the given Excel file. As a side effect the type is set too.
102      * 
103      * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
104      */
105     public final String getContent(final InputStream is) {
106         try {
107             CharArrayWriter writer = new CharArrayWriter();
108 
109             POIFSFileSystem fs = new POIFSFileSystem(is);
110             HSSFWorkbook workbook = new HSSFWorkbook(fs);
111 
112             for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
113                 HSSFSheet sheet = workbook.getSheetAt(i);
114 
115                 Iterator rows = sheet.rowIterator();
116                 while (rows.hasNext()) {
117                     HSSFRow row = (HSSFRow) rows.next();
118 
119                     Iterator cells = row.cellIterator();
120                     while (cells.hasNext()) {
121                         HSSFCell cell = (HSSFCell) cells.next();
122                         switch (cell.getCellType()) {
123                         case HSSFCell.CELL_TYPE_NUMERIC:
124                             String num = Double.toString(cell.getNumericCellValue()).trim();
125                             if (num.length() > 0) {
126                                 writer.write(num + " ");
127                             }
128                             break;
129                         case HSSFCell.CELL_TYPE_STRING:
130                             String text = cell.getStringCellValue().trim();
131                             if (text.length() > 0) {
132                                 writer.write(text + " ");
133                             }
134                             break;
135                         default: // skip
136                         }
137                     }
138                 }
139             }
140 
141             return new String(writer.toCharArray());
142         }
143         catch (Exception e) {
144             log.warn("Can't extract contents", e);
145         }
146 
147         return "";
148     }
149 }