1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package org.zilverline.extractors;
23
import java.io.CharArrayReader;
import java.io.CharArrayWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.Iterator;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
37
38 /***
39 * This class extracts text from MS Excel files by using the POI library.
40 *
41 * @author Michael Franken
42 * @version $Revision: 1.6 $
43 */
44 public class ExcelExtractor extends AbstractExtractor {
45 /***
46 * Extract the content from the given Excel file. As a side effect the type is set too.
47 *
48 * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
49 */
50 public final Reader getContent(final File f) {
51 Reader reader = null;
52
53 setType("EXCEL");
54
55 try {
56 CharArrayWriter writer = new CharArrayWriter();
57
58 POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(f));
59 HSSFWorkbook workbook = new HSSFWorkbook(fs);
60
61 for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
62 HSSFSheet sheet = workbook.getSheetAt(i);
63
64 Iterator rows = sheet.rowIterator();
65 while (rows.hasNext()) {
66 HSSFRow row = (HSSFRow) rows.next();
67
68 Iterator cells = row.cellIterator();
69 while (cells.hasNext()) {
70 HSSFCell cell = (HSSFCell) cells.next();
71 switch (cell.getCellType()) {
72 case HSSFCell.CELL_TYPE_NUMERIC:
73 String num = Double.toString(cell.getNumericCellValue()).trim();
74 if (num.length() > 0) {
75 writer.write(num + " ");
76 }
77 break;
78 case HSSFCell.CELL_TYPE_STRING:
79 String text = cell.getStringCellValue().trim();
80 if (text.length() > 0) {
81 writer.write(text + " ");
82 }
83 break;
84 default:
85 }
86 }
87 }
88 }
89 setSummary(getSummaryFromContent(writer.toString()));
90
91 return new CharArrayReader(writer.toCharArray());
92 }
93 catch (Exception e) {
94 log.warn("Can't extract contents for: " + f.getName(), e);
95 }
96
97 return reader;
98 }
99
100 /***
101 * Extract the content from the given Excel file. As a side effect the type is set too.
102 *
103 * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
104 */
105 public final String getContent(final InputStream is) {
106 try {
107 CharArrayWriter writer = new CharArrayWriter();
108
109 POIFSFileSystem fs = new POIFSFileSystem(is);
110 HSSFWorkbook workbook = new HSSFWorkbook(fs);
111
112 for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
113 HSSFSheet sheet = workbook.getSheetAt(i);
114
115 Iterator rows = sheet.rowIterator();
116 while (rows.hasNext()) {
117 HSSFRow row = (HSSFRow) rows.next();
118
119 Iterator cells = row.cellIterator();
120 while (cells.hasNext()) {
121 HSSFCell cell = (HSSFCell) cells.next();
122 switch (cell.getCellType()) {
123 case HSSFCell.CELL_TYPE_NUMERIC:
124 String num = Double.toString(cell.getNumericCellValue()).trim();
125 if (num.length() > 0) {
126 writer.write(num + " ");
127 }
128 break;
129 case HSSFCell.CELL_TYPE_STRING:
130 String text = cell.getStringCellValue().trim();
131 if (text.length() > 0) {
132 writer.write(text + " ");
133 }
134 break;
135 default:
136 }
137 }
138 }
139 }
140
141 return new String(writer.toCharArray());
142 }
143 catch (Exception e) {
144 log.warn("Can't extract contents", e);
145 }
146
147 return "";
148 }
149 }