日期:2014-05-17  浏览次数:20599 次

使用Winista.Text.HtmlParser采集南京价格信息网

数据库:MySQL

平台:.NET Framework 2.0 (C#)

组件:Winista.Text.HtmlParser

演示页面:2013年3月7日南京市各区县农贸市场主副食品价格对比表  http://www.njprice.com/col71/col464/articleinfo.php?infoid=44181

2013年2月28日南京市各区县农贸市场主副食品价格对比表   http://www.njprice.com/col71/col464/articleinfo.php?infoid=44079

以及所有其他日期发布的《南京市各区县农贸市场主副食品价格对比表》

using System;
using System.Collections.Generic;
using System.Text;
using nanjing_price.WebUtility;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Tags;
using Winista.Text.HtmlParser.Filters;
using org.nutlab;

namespace nanjing_price.Fuction
{
    /// <summary>
    /// Fetches one price-comparison page and stores every price cell of its
    /// innermost table as a <c>good_price</c> record.
    /// </summary>
    /// <remarks>
    /// All work is done by the constructor: the page is downloaded, parsed and
    /// written out through <c>good_price.Add()</c> before the constructor returns.
    /// The row/column positions below are hard-wired to the layout this scraper
    /// expects; a page with a different layout will fail with an index or cast
    /// exception.
    /// </remarks>
    class NanjingMain
    {
        // Row positions inside the price table.
        const int PublishTimeRowIndex = 2;
        const int DistrictRowIndex = 4;
        const int MarketRowIndex = 5;
        const int FirstDataRowIndex = 6;

        // Column positions inside a data row; every column from
        // FirstPriceColumnIndex onwards holds one price.
        const int NameColumnIndex = 0;
        const int StandardColumnIndex = 1;
        const int UnitColumnIndex = 2;
        const int FirstPriceColumnIndex = 3;

        // Number of leading characters dropped from the publish-time cell
        // before the remainder is parsed as a date.
        const int PublishTimePrefixLength = 3;

        const string PageEncoding = "gb2312";

        string content;
        string urlStr;

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and immediately imports
        /// its price table.
        /// </summary>
        /// <param name="url">Address of the page to scrape; also stored on every record as <c>get_url</c>.</param>
        public NanjingMain(string url)
        {
            this.urlStr = url;
            getContent();
            inputDB();
        }

        /// <summary>
        /// Downloads the page and keeps its text, passed through
        /// <c>Tools.filterScript</c>, in <c>content</c>.
        /// </summary>
        void getContent()
        {
            webUtility web = new webUtility();
            content = Tools.filterScript(web.getContent(urlStr));
        }

        /// <summary>
        /// Parses the downloaded page and calls <c>good_price.Add()</c> once per
        /// price cell found in the data rows of the price table.
        /// </summary>
        void inputDB()
        {
            good_price price = new good_price();
            TableTag priceTable = findPriceTable();

            // All <tr> rows of the price table.
            Winista.Text.HtmlParser.Tags.TableRow[] rows = priceTable.Rows;

            // NOTE(review): Convert.ToDateTime parses with the current thread
            // culture, so the result depends on the machine's regional
            // settings - confirm this matches the date format on the page.
            string publishCell = cellText(rows[PublishTimeRowIndex].Columns[0]);
            DateTime publishTime = Convert.ToDateTime(publishCell.Substring(PublishTimePrefixLength));
            Console.WriteLine(publishTime);

            if (rows.Length <= FirstDataRowIndex)
            {
                // No data rows, so the header rows are never needed.
                return;
            }

            // The header rows are the same for every data row: read them once.
            TableColumn[] districtCells = rows[DistrictRowIndex].Columns;
            TableColumn[] marketCells = rows[MarketRowIndex].Columns;
            int headerCellCount = Math.Min(districtCells.Length, marketCells.Length);

            for (int rowIndex = FirstDataRowIndex; rowIndex < rows.Length; rowIndex++)
            {
                TableColumn[] cells = rows[rowIndex].Columns;

                // A price cell is only usable when both header rows have a cell
                // in the same column. Bounding the loop here avoids an
                // IndexOutOfRangeException half-way through the import when a
                // data row is wider than a header row.
                int usableCellCount = Math.Min(cells.Length, headerCellCount);
                if (usableCellCount <= FirstPriceColumnIndex)
                {
                    continue;
                }

                // These three values are shared by every price in the row.
                string name = cellText(cells[NameColumnIndex]);
                string standard = cellText(cells[StandardColumnIndex]);
                string unit = cellText(cells[UnitColumnIndex]);

                for (int columnIndex = FirstPriceColumnIndex; columnIndex < usableCellCount; columnIndex++)
                {
                    price.name = name;
                    price.standard = standard;
                    price.unit = unit;
                    price.district_name = cellText(districtCells[columnIndex]);
                    price.market_name = cellText(marketCells[columnIndex]);
                    price.amount = cellText(cells[columnIndex]);
                    price.publish_time = publishTime;
                    price.get_time = System.DateTime.Now;
                    price.get_url = urlStr;
                    price.Add();
                }
            }
        }

        /// <summary>
        /// Drills down through the nested tables of the page to the one that
        /// holds the prices, writes its HTML to the console and returns it.
        /// </summary>
        /// <returns>The price table as a <c>TableTag</c>.</returns>
        TableTag findPriceTable()
        {
            NodeFilter tableFilter = new TagNameFilter("table");
            string html = Tools.filterTableAttribute(content.Replace(System.Environment.NewLine, ""));

            // Each step re-parses the HTML of the node found so far and picks
            // one table from the result by its fixed position.
            Parser parser = Parser.CreateParser(html, PageEncoding);
            INode node = parser.Parse(tableFilter)[2].Children[3];

            parser = Parser.CreateParser(node.ToHtml(), PageEncoding);
            node = parser.Parse(tableFilter)[3];

            parser = Parser.CreateParser(node.ToHtml(), PageEncoding);
            node = parser.Parse(tableFilter)[2];

            Console.WriteLine(node.ToHtml());

            // The filter only matches <table> tags, so the node is expected to be a TableTag.
            return (TableTag)node;
        }

        /// <summary>Returns the plain text of a table cell with surrounding whitespace removed.</summary>
        static string cellText(TableColumn cell)
        {
            return cell.ToPlainTextString().Trim();
        }

    }
}

源代码下载:点击下载

SVN: Google Code