日期:2014-05-17  浏览次数:20745 次

perl 解析html, 自己写的几个比较常用方法

perl解析html, 比较常用的模块是 HTML::TreeBuilder,该模块将html字符串转化为dom树,方便操作

一个dom元素对应一个HTML::Element对象,dom的属性和方法都定义在该类中,以下是代码:

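# The most important attributes and methods of a DOM element (HTML::Element):
# attr: $e->attr('id')                   returns the value of the given attribute
# text: $e->as_text                      returns the text inside the tag, e.g. "click me" for <a>click me</a>
# html: $e->as_HTML                      returns the element's HTML source
# tagname: $e->tag()                     returns the tag name (e.g. a, div) in lower case
# parent node: $e->parent                returns the parent node
# children node: $e->content_list()      returns all child nodes (direct children only)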

use strict;
use warnings;
use HTML::TreeBuilder;

# Read the sample document that follows __DATA__ into one string.
my $html = join '', <DATA>;

# Other ways to call the helpers defined below:
# print $_->as_HTML, "\n" for get_elements_by_attr_regex($html, 'id', qr/^\d+$/);
# my @elist = get_elements($html, 'table', 'id', qr/\d+/);
# print $elist[0]->as_HTML;


# Demo: look up the table by its id and print the tag of each direct child.
my $table = get_element_by_id($html, 'table1');
for my $child ($table->content_list()) {
	# content_list() yields text children as plain strings rather than
	# HTML::Element objects, so only element nodes can answer ->tag().
	next unless ref $child;
	print "child tag:", $child->tag(), "\n";
}





#function defined........................

# Collect every element whose attribute $attr matches $attr_regex.
#
# $html:       HTML content as a string; it is parsed afresh on every call
# $attr:       attribute name to test, e.g. 'id'
# $attr_regex: pattern for the attribute value (e.g. qr/^\d+$/); it is handed
#              to look_down() as the value criterion
#
# Returns the matching elements as a list (the match count in scalar context).
sub get_elements_by_attr_regex{
	my ($html, $attr, $attr_regex) = @_;
	# Direct class-method call; the indirect-object form "new CLASS" is
	# discouraged by perlobj.
	my $tree = HTML::TreeBuilder->new;
	$tree->parse_content($html);
	my @list = $tree->look_down($attr, $attr_regex);
	return @list;
}



# Find the single element whose id attribute equals $idvalue.
#
# $html:    HTML content as a string; it is parsed afresh on every call
# $idvalue: id value to search for (used as the 'id' criterion of look_down())
#
# Returns the one matching element.
# Dies when no element carries that id, and also when more than one does.
sub get_element_by_id{
	my ($html, $idvalue) = @_;
	# Direct class-method call; the indirect-object form "new CLASS" is
	# discouraged by perlobj.
	my $tree = HTML::TreeBuilder->new;
	$tree->parse_content($html);
	my @list = $tree->look_down('id', $idvalue);
	# Report "missing" and "ambiguous" separately: an empty result is not a
	# uniqueness problem.
	die "no element found by id:$idvalue" if !@list;
	die "not unique element by id:$idvalue" if @list > 1;
	return $list[0];
}

# Find elements by tag name.
#
# $html:    HTML content as a string; it is parsed afresh on every call
# $tagname: tag name to look for
#
# The result of find_by_tag_name() is returned as-is, so the caller's context
# is passed through to that method (per the HTML::Element documentation: all
# matches in list context, the first match in scalar context).
sub get_elements_by_tag_name{
	my ($html, $tagname) = @_;
	# Direct class-method call; the indirect-object form "new CLASS" is
	# discouraged by perlobj.
	my $tree = HTML::TreeBuilder->new;
	$tree->parse_content($html);
	return $tree->find_by_tag_name($tagname);
}

# Select elements by tag name and attribute pattern together.
#
# $html:       HTML content as a string
# $tag:        tag name; compared in lower case
# $attr:       attribute name
# $attr_regex: pattern for the attribute value
#
# Returns those elements found by get_elements_by_attr_regex() whose tag()
# equals the lower-cased $tag.
sub get_elements{
	my ($html, $tag, $attr, $attr_regex) = @_;
	my $wanted = lc $tag;
	my @matched;
	for my $element (get_elements_by_attr_regex($html, $attr, $attr_regex)) {
		push @matched, $element if $element->tag() eq $wanted;
	}
	return @matched;
}


__DATA__
<table	id="table1" border="1" cellspacing="0" cellpadding="6">
  <tr><td><a href="x">x text</a></td><td><a href="y">y</a></td></tr>
  <tr><td id='1s'>1</td><td >2</td></tr>
</table>