1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package org.zilverline.extractors;
23
24 import java.io.ByteArrayInputStream;
25 import java.io.ByteArrayOutputStream;
26 import java.io.File;
27 import java.io.FileInputStream;
28 import java.io.InputStream;
29 import java.io.InputStreamReader;
30 import java.io.Reader;
31
32 import org.apache.poi.poifs.eventfilesystem.POIFSReader;
33 import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
34 import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
35 import org.apache.poi.poifs.filesystem.DocumentInputStream;
36 import org.apache.poi.util.LittleEndian;
37
38 /***
39 * This class extracts text from MS Powerpoint files by using the POI library.
40 *
41 * @author Michael Franken
42 * @version $Revision: 1.7 $
43 */
44 public class PowerPointExtractor extends AbstractExtractor implements POIFSReaderListener {
45
46 /*** Writer to store parsed content. */
47 private ByteArrayOutputStream writer = new ByteArrayOutputStream();
48
49 /***
50 * Extract the content from the given Powerpoint file. As a side effect the type is set too.
51 *
52 * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
53 */
54 public final Reader getContent(final File f) {
55
56 setType("POWERPOINT");
57
58 try {
59 POIFSReader reader = new POIFSReader();
60 reader.registerListener(this);
61 reader.read(new FileInputStream(f));
62 setSummary(getSummaryFromContent(writer.toString()));
63
64 return new InputStreamReader(new ByteArrayInputStream(writer.toByteArray()));
65 }
66 catch (Exception e) {
67 log.warn("Can't extract contents for: " + f.getName(), e);
68 }
69
70 return null;
71 }
72
73 /***
74 * Extract the content from the given Powerpoint file. As a side effect the type is set too.
75 *
76 * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
77 */
78 public final String getContent(final InputStream is) {
79 try {
80 POIFSReader reader = new POIFSReader();
81 reader.registerListener(this);
82 reader.read(is);
83 return new String(writer.toByteArray());
84 }
85 catch (Exception e) {
86 log.warn("Can't extract contents", e);
87 }
88
89 return "";
90 }
91
92 /***
93 * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
94 */
95 public void processPOIFSReaderEvent(POIFSReaderEvent event) {
96 try {
97 if (!event.getName().equalsIgnoreCase("PowerPoint Document")) {
98 return;
99 }
100 DocumentInputStream input = event.getStream();
101 byte[] buffer = new byte[input.available()];
102 input.read(buffer, 0, input.available());
103 for (int i = 0; i < buffer.length - 20; i++) {
104 long type = LittleEndian.getUShort(buffer, i + 2);
105 long size = LittleEndian.getUInt(buffer, i + 4);
106 if (type == 4008) {
107 writer.write(' ');
108 writer.write(buffer, i + 4 + 4, (int) size);
109 i = i + 4 + 4 + (int) size - 1;
110 }
111 }
112 }
113 catch (Exception e) {
114 log.warn("Error parsing powerpoint", e);
115 }
116 }
117 }