1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package org.zilverline.extractors;
23
24 import java.io.File;
25 import java.io.FileInputStream;
26 import java.io.FileNotFoundException;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.io.InputStreamReader;
30 import java.io.Reader;
31 import java.io.StringReader;
32
33 import org.cyberneko.html.parsers.DOMFragmentParser;
34 import org.w3c.dom.DocumentFragment;
35 import org.w3c.dom.Node;
36 import org.w3c.dom.NodeList;
37 import org.xml.sax.InputSource;
38 import org.xml.sax.SAXException;
39
40 import org.apache.html.dom.HTMLDocumentImpl;
41
42 /***
43 * Extracts contents from an HTML file using the NekoHTML library, based on Lucene in Action Book.
44 *
45 * @author Michael Franken
46 * @version $Revision: 1.17 $
47 */
48 public final class HTMLExtractor extends AbstractExtractor {
49 /***
50 * Extract the content from the given HTML file. As a side effect the type, title and summary are set too.
51 *
52 * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
53 */
54 public Reader getContent(final File f) {
55 Reader reader = null;
56
57 setType("HTML");
58 try {
59 DOMFragmentParser parser = new DOMFragmentParser();
60 DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
61 log.debug("start parsing: " + f.getName());
62 parser.parse(new InputSource(new FileInputStream(f)), node);
63 log.debug("finished parsing: " + f.getName());
64 StringBuffer sb = new StringBuffer();
65
66 getText(sb, node, "title");
67 setTitle(sb.toString());
68
69 sb.setLength(0);
70 getText(sb, node);
71 reader = new StringReader(sb.toString());
72 setSummary(getSummaryFromContent(sb.toString()));
73 setISBN(getISBNFromContent(sb.toString()));
74
75
76 }
77 catch (FileNotFoundException e) {
78 log.warn("Can't open file: " + f.getName(), e);
79 }
80 catch (IOException e) {
81 log.warn("Can't extract contents for: " + f.getName(), e);
82 }
83 catch (SAXException e) {
84 log.warn("Can't parse contents for: " + f.getName(), e);
85 }
86
87 return reader;
88 }
89
90 /***
91 * Extract the content from the given HTML InputStream.
92 *
93 * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
94 */
95 public String getContent(final InputStream is) {
96 try {
97 DOMFragmentParser parser = new DOMFragmentParser();
98 DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
99 parser.parse(new InputSource(new InputStreamReader(is)), node);
100 StringBuffer sb = new StringBuffer();
101
102 getText(sb, node);
103 return sb.toString();
104 }
105 catch (IOException e) {
106 log.warn("Can't extract contents for: " + is, e);
107 }
108 catch (SAXException e) {
109 log.warn("Can't parse contents for: " + is, e);
110 }
111
112 return "";
113 }
114
115 /***
116 * Extract the content from the given HTML String.
117 *
118 * @see org.zilverline.extractors.AbstractExtractor#getContent(java.io.File)
119 */
120 public String getContent(final String s) {
121 try {
122 DOMFragmentParser parser = new DOMFragmentParser();
123 DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
124 parser.parse(new InputSource(new StringReader(s)), node);
125 StringBuffer sb = new StringBuffer();
126
127 getText(sb, node);
128 return sb.toString();
129 }
130 catch (IOException e) {
131 log.warn("Can't extract contents for: " + s, e);
132 }
133 catch (SAXException e) {
134 log.warn("Can't parse contents for: " + s, e);
135 }
136
137 return "";
138 }
139
140 /***
141 * Get all text from the HTML document.
142 *
143 * @param sb the buffer to add the contents to.
144 * @param node the starting node.
145 */
146 private void getText(StringBuffer sb, final Node node) {
147 if (node.getNodeType() == Node.TEXT_NODE) {
148 sb.append(node.getNodeValue());
149 }
150 NodeList children = node.getChildNodes();
151 if (children != null) {
152 int len = children.getLength();
153 for (int i = 0; i < len; i++) {
154 getText(sb, children.item(i));
155 }
156 }
157 }
158
159 /***
160 * Get all text from a specific element in the HTML document.
161 *
162 * @param sb the buffer to add the contents to.
163 * @param node the starting node.
164 * @param element the element, such as 'title'.
165 *
166 * @return true if anything was added
167 */
168 private boolean getText(final StringBuffer sb, final Node node, final String element) {
169 if (node.getNodeType() == Node.ELEMENT_NODE) {
170 if (element.equalsIgnoreCase(node.getNodeName())) {
171 getText(sb, node);
172 return true;
173 }
174 }
175 NodeList children = node.getChildNodes();
176 if (children != null) {
177 int len = children.getLength();
178 for (int i = 0; i < len; i++) {
179 if (getText(sb, children.item(i), element)) {
180 return true;
181 }
182 }
183 }
184 return false;
185 }
186 }