View Javadoc

1   /*
2    * Copyright 2003-2004 Michael Franken, Zilverline.
3    *
4    * The contents of this file, or the files included with this file, are subject to
5    * the current version of ZILVERLINE Collaborative Source License for the
6    * Zilverline Search Engine (the "License"); You may not use this file except in
7    * compliance with the License.
8    *
9    * You may obtain a copy of the License at
10   *
11   *     http://www.zilverline.org.
12   *
13   * See the License for the rights, obligations and
14   * limitations governing use of the contents of the file.
15   *
16   * The Original and Upgraded Code is the Zilverline Search Engine. The developer of
17   * the Original and Upgraded Code is Michael Franken. Michael Franken owns the
18   * copyrights in the portions it created. All Rights Reserved.
19   *
20   */
21  
22  package org.zilverline.extractors;
23  
24  import java.io.ByteArrayInputStream;
25  import java.io.ByteArrayOutputStream;
26  import java.io.File;
27  import java.io.FileInputStream;
28  import java.io.InputStream;
29  import java.io.InputStreamReader;
30  import java.io.Reader;
31  
32  import org.apache.poi.poifs.eventfilesystem.POIFSReader;
33  import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
34  import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
35  import org.apache.poi.poifs.filesystem.DocumentInputStream;
36  import org.apache.poi.util.LittleEndian;
37  
38  /***
39   * This class extracts text from MS Powerpoint files by using the POI library.
40   * 
41   * @author Michael Franken
42   * @version $Revision: 1.7 $
43   */
44  public class PowerPointExtractor extends AbstractExtractor implements POIFSReaderListener {
45  
46      /*** Writer to store parsed content. */
47      private ByteArrayOutputStream writer = new ByteArrayOutputStream();
48  
49      /***
50       * Extract the content from the given Powerpoint file. As a side effect the type is set too.
51       * 
52       * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
53       */
54      public final Reader getContent(final File f) {
55  
56          setType("POWERPOINT");
57  
58          try {
59              POIFSReader reader = new POIFSReader();
60              reader.registerListener(this);
61              reader.read(new FileInputStream(f));
62              setSummary(getSummaryFromContent(writer.toString()));
63  
64              return new InputStreamReader(new ByteArrayInputStream(writer.toByteArray()));
65          }
66          catch (Exception e) {
67              log.warn("Can't extract contents for: " + f.getName(), e);
68          }
69  
70          return null;
71      }
72  
73      /***
74       * Extract the content from the given Powerpoint file. As a side effect the type is set too.
75       * 
76       * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
77       */
78      public final String getContent(final InputStream is) {
79          try {
80              POIFSReader reader = new POIFSReader();
81              reader.registerListener(this);
82              reader.read(is);
83              return new String(writer.toByteArray());
84          }
85          catch (Exception e) {
86              log.warn("Can't extract contents", e);
87          }
88  
89          return "";
90      }
91  
92      /***
93       * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
94       */
95      public void processPOIFSReaderEvent(POIFSReaderEvent event) {
96          try {
97              if (!event.getName().equalsIgnoreCase("PowerPoint Document")) {
98                  return;
99              }
100             DocumentInputStream input = event.getStream();
101             byte[] buffer = new byte[input.available()];
102             input.read(buffer, 0, input.available());
103             for (int i = 0; i < buffer.length - 20; i++) {
104                 long type = LittleEndian.getUShort(buffer, i + 2);
105                 long size = LittleEndian.getUInt(buffer, i + 4);
106                 if (type == 4008) {
107                     writer.write(' ');
108                     writer.write(buffer, i + 4 + 4, (int) size);
109                     i = i + 4 + 4 + (int) size - 1;
110                 }
111             }
112         }
113         catch (Exception e) {
114             log.warn("Error parsing powerpoint", e);
115         }
116     }
117 }