日期:2014-05-17  浏览次数:20628 次

解析html获取imag路径
package com.web.test;

import java.io.*;
import java.util.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;

/**
 * 过滤image
 * @author chenxiaobing
 *
 */
public class HTMLParser extends ParserCallback
{ // 继承ParserCallback,解析结果驱动这些回调方法
	protected String base;
	protected boolean isImg = false;
	protected boolean isParagraph = false;
	protected static Vector<String> element = new Vector<String>();
	protected static String paragraphText = new String();

	public HTMLParser()
	{

	}

	public static String getParagraphText()
	{
		return paragraphText;
	}

	public void handleComment(char[] data, int pos)
	{
	}

	public void handleEndTag(HTML.Tag t, int pos)
	{
		if (t == HTML.Tag.P)
		{
			if (isParagraph)
			{
				isParagraph = false;
			}
		}
		else if (t == HTML.Tag.IMG)
		{
			if (isImg)
			{
				isImg = false;
			}

		}
	}

	public void handleError(String errorMsg, int pos)
	{
	}

	public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
	{
		handleStartTag(t, a, pos);
	}

	public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
	{
		if (t == HTML.Tag.P)
		{
			isParagraph = true;
		}
		else if ((t == HTML.Tag.IMG))
		{
			String src = (String) a.getAttribute(HTML.Attribute.SRC);
			if (src != null)
			{

				element.addElement(src);
				isImg = true;
			}
		}
	}

	public void handleText(char[] data, int pos)
	{

		if (isParagraph)
		{
			String tempParagraphText = new String(data);
			if (paragraphText != null)
			{
				element.addElement(tempParagraphText);
				;

			}
		}
	}

	private static void startParse(String sHtml)
	{
		try
		{
			ParserDelegator ps = new ParserDelegator();// 负责每次在调用其 parse
														// 方法时启动一个新的
														// DocumentParser
			HTMLEditorKit.ParserCallback parser = new HTMLParser();// 解析结果驱动这些回调方法。
			ps.parse(new StringReader(sHtml), parser, true);// 解析给定的流并通过解析的结果驱动给定的回调。

			// System.out.println(getParagraphText());
			Vector link = element;
			for (int i = 0; i < link.size(); i++)
			{
				System.out.println("----haha-----");
				System.out.println(link.get(i));
			}

		}
		catch (Exception e)
		{
			e.printStackTrace();
		}
	}

	public static void main(String args[])
	{

		try
		{
			String filename = "E:/HTML/html002.html";
			BufferedReader brd = new BufferedReader(new FileReader(filename));
			char[] str = new char[50000];
			brd.read(str);
			String sHtml = new String(str);

			startParse(sHtml);
		}
		catch (Exception e)
		{
			e.printStackTrace();
		}
	}

}