View Javadoc

1   /*
2    * Copyright 2003-2004 Michael Franken, Zilverline.
3    *
4    * The contents of this file, or the files included with this file, are subject to
5    * the current version of ZILVERLINE Collaborative Source License for the
6    * Zilverline Search Engine (the "License"); You may not use this file except in
7    * compliance with the License.
8    *
9    * You may obtain a copy of the License at
10   *
11   *     http://www.zilverline.org.
12   *
13   * See the License for the rights, obligations and
14   * limitations governing use of the contents of the file.
15   *
16   * The Original and Upgraded Code is the Zilverline Search Engine. The developer of
17   * the Original and Upgraded Code is Michael Franken. Michael Franken owns the
18   * copyrights in the portions it created. All Rights Reserved.
19   *
20   */
21  
22  package org.zilverline.extractors;
23  
24  import java.io.File;
25  import java.io.FileInputStream;
26  import java.io.FileNotFoundException;
27  import java.io.IOException;
28  import java.io.InputStream;
29  import java.io.InputStreamReader;
30  import java.io.Reader;
31  import java.io.StringReader;
32  
33  import org.cyberneko.html.parsers.DOMFragmentParser;
34  import org.w3c.dom.DocumentFragment;
35  import org.w3c.dom.Node;
36  import org.w3c.dom.NodeList;
37  import org.xml.sax.InputSource;
38  import org.xml.sax.SAXException;
39  
40  import org.apache.html.dom.HTMLDocumentImpl;
41  
42  /***
43   * Extracts contents from an HTML file using the NekoHTML library, based on Lucene in Action Book.
44   * 
45   * @author Michael Franken
46   * @version $Revision: 1.17 $
47   */
48  public final class HTMLExtractor extends AbstractExtractor {
49      /***
50       * Extract the content from the given HTML file. As a side effect the type, title and summary are set too.
51       * 
52       * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
53       */
54      public Reader getContent(final File f) {
55          Reader reader = null;
56  
57          setType("HTML");
58          try {
59              DOMFragmentParser parser = new DOMFragmentParser();
60              DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
61              log.debug("start parsing: " + f.getName());
62              parser.parse(new InputSource(new FileInputStream(f)), node);
63              log.debug("finished parsing: " + f.getName());
64              StringBuffer sb = new StringBuffer();
65              // get the Title
66              getText(sb, node, "title");
67              setTitle(sb.toString());
68              // get the contents
69              sb.setLength(0);
70              getText(sb, node);
71              reader = new StringReader(sb.toString());
72              setSummary(getSummaryFromContent(sb.toString()));
73              setISBN(getISBNFromContent(sb.toString()));
74  
75              // setSummary(sb.toString().substring(0, Math.min(SUMMARY_SIZE, sb.length())));
76          }
77          catch (FileNotFoundException e) {
78              log.warn("Can't open file: " + f.getName(), e);
79          }
80          catch (IOException e) {
81              log.warn("Can't extract contents for: " + f.getName(), e);
82          }
83          catch (SAXException e) {
84              log.warn("Can't parse contents for: " + f.getName(), e);
85          }
86  
87          return reader;
88      }
89  
90      /***
91       * Extract the content from the given HTML InputStream.
92       * 
93       * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
94       */
95      public String getContent(final InputStream is) {
96          try {
97              DOMFragmentParser parser = new DOMFragmentParser();
98              DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
99              parser.parse(new InputSource(new InputStreamReader(is)), node);
100             StringBuffer sb = new StringBuffer();
101             // get the contents
102             getText(sb, node);
103             return sb.toString();
104         }
105         catch (IOException e) {
106             log.warn("Can't extract contents for: " + is, e);
107         }
108         catch (SAXException e) {
109             log.warn("Can't parse contents for: " + is, e);
110         }
111 
112         return "";
113     }
114 
115     /***
116      * Extract the content from the given HTML String.
117      * 
118      * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
119      */
120     public String getContent(final String s) {
121         try {
122             DOMFragmentParser parser = new DOMFragmentParser();
123             DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
124             parser.parse(new InputSource(new StringReader(s)), node);
125             StringBuffer sb = new StringBuffer();
126             // get the contents
127             getText(sb, node);
128             return sb.toString();
129         }
130         catch (IOException e) {
131             log.warn("Can't extract contents for: " + s, e);
132         }
133         catch (SAXException e) {
134             log.warn("Can't parse contents for: " + s, e);
135         }
136 
137         return "";
138     }
139 
140     /***
141      * Get all text from the HTML document.
142      * 
143      * @param sb the buffer to add the contents to.
144      * @param node the starting node.
145      */
146     private void getText(StringBuffer sb, final Node node) {
147         if (node.getNodeType() == Node.TEXT_NODE) {
148             sb.append(node.getNodeValue());
149         }
150         NodeList children = node.getChildNodes();
151         if (children != null) {
152             int len = children.getLength();
153             for (int i = 0; i < len; i++) {
154                 getText(sb, children.item(i));
155             }
156         }
157     }
158 
159     /***
160      * Get all text from a specific element in the HTML document.
161      * 
162      * @param sb the buffer to add the contents to.
163      * @param node the starting node.
164      * @param element the element, such as 'title'.
165      * 
166      * @return true if anything was added
167      */
168     private boolean getText(final StringBuffer sb, final Node node, final String element) {
169         if (node.getNodeType() == Node.ELEMENT_NODE) {
170             if (element.equalsIgnoreCase(node.getNodeName())) {
171                 getText(sb, node);
172                 return true;
173             }
174         }
175         NodeList children = node.getChildNodes();
176         if (children != null) {
177             int len = children.getLength();
178             for (int i = 0; i < len; i++) {
179                 if (getText(sb, children.item(i), element)) {
180                     return true;
181                 }
182             }
183         }
184         return false;
185     }
186 }