日期:2014-05-17  浏览次数:20714 次

HTML标签补缺器
需要导入 NekoHTML 解析器的相关包

import java.nio.charset.StandardCharsets;

import org.cyberneko.html.parsers.DOMParser;


// Repairs an HTML fragment with unclosed tags: NekoHTML parses the broken
// markup into a DOM, and an identity Transformer serializes that DOM back to
// text, which is printed to standard output.
DOMParser parser = new DOMParser();

// The fragment below is missing its closing </p> and </pre> tags.
String brokenHtml = "<pre><pre>Product Name:lady slipper  </pre><pre>Model Number: 816</pre><pre>Size:36-41#</pre><p>Color: all colors available </p><pre>Place of Origin: China</pre><pre> </pre><pre>Feature:</pre><pre>1)UPPER: pvc</pre><pre>2)OUTSOLE:pvc</pre><pre>3)Suitable age:women</pre><p>Packing:polybag or according to your requests";

// Encode with an explicit charset instead of the platform default, and tell
// the parser which charset was used, so decoding does not depend on the
// machine the code runs on or on the parser's own fallback encoding.
InputStream in = new ByteArrayInputStream(brokenHtml.getBytes(StandardCharsets.UTF_8));
InputSource source = new InputSource(in);
source.setEncoding(StandardCharsets.UTF_8.name());

// Parse the source.
parser.parse(source);
// Fetch the DOM document produced by the parse.
Document doc = parser.getDocument();

// newTransformer() with no stylesheet yields an identity transform, i.e. it
// copies the DOM tree to the result unchanged.
TransformerFactory tfactory = TransformerFactory.newInstance();
Transformer transformer = tfactory.newTransformer();
// Wrap the DOM document as the transformation input.
DOMSource dsource = new DOMSource(doc);
// Collect the serialized markup in memory rather than in a file.
StringWriter write = new StringWriter();
StreamResult result = new StreamResult(write);
// Serialize the DOM into the writer.
transformer.transform(dsource, result);
System.out.println(write.toString());