日期:2014-05-17  浏览次数:20780 次

C#采集CSDN单个博客所有文章

原理:

通过HtmlAgilityPack解析html源码得到所需的数据。

1、首先通过 http://blog.csdn.net/gdjlc 页面底部的“xx条数据 共xx页”,获取总页数;

2、获取每一页的所有文章URL,每一页的URL如下所示: http://blog.csdn.net/gdjlc/article/list/当前页索引,从1一直循环到总页数,即可得到全部文章的URL;

3、获取单个文章的内容。

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Diagnostics;
using System.Collections;
using System.Threading;
using System.Threading.Tasks;
using System.Xml.Linq;
using System.Net;
using HtmlAgilityPack;

namespace Demo
{
    public partial class FrmCSDN : Form
    {       
        const string BLOGUSER = "gdjlc"; // blog user name
        const string BLOGURL = "http://blog.csdn.net"; // site root; combined with BLOGUSER and relative hrefs to build full URLs
        const string PAGECOUNTPATH = "//div[@id='papelist']/span[1]";// XPath of the pager text that carries the total page count
        const string ARTICLEURLPATH = "//span[@class='link_title']/a[1]"; // XPath of the article links on a list page
        const string ARTICLETITLEPATH = "//div[@class='article_title']/h3/span/a";// XPath of the article title
        const string POSTDATEPATH = "//span[@class='link_postdate']"; // XPath of the article creation date
        const string ARTICLECONTENTPATH = "//div[@id='article_content']"; // XPath of the article content
        List<string> articleUrlList = new List<string>(); // URLs of all articles
        private object moniter = new object(); // NOTE(review): not used in the visible code; presumably a lock object for the worker tasks - confirm
        Stopwatch stopwatch = new Stopwatch(); // restarted by btnCollect_Click when a collection run begins
        int cnt = 0; // reset to 0 at the start of each run; NOTE(review): presumably counts collected articles - confirm

        /// <summary>
        /// Creates the form and runs <c>InitializeComponent</c> to set it up.
        /// </summary>
        public FrmCSDN()
        {
            this.InitializeComponent();
        }

        /// <summary>
        /// Reads the total number of article-list pages from the pager text of a blog page.
        /// </summary>
        /// <param name="pageCountUrl">URL of the page whose pager text is inspected.</param>
        /// <returns>
        /// The number found between the last "共" and the last "页" of the pager text,
        /// or 0 when the page cannot be loaded or the pager text is missing or malformed.
        /// </returns>
        private int GetPageCount(string pageCountUrl)
        {
            HtmlNode rootNode = GetHtmlNodeByUrl(pageCountUrl, Encoding.UTF8);
            if (rootNode == null)
                return 0;

            // The pager text looks like "177条数据 共12页".
            string pageCountText = GetNodeInnerText(rootNode, PAGECOUNTPATH);
            if (string.IsNullOrEmpty(pageCountText))
                return 0;

            // When "共" is absent LastIndexOf yields -1, so the number is read from the start of the text.
            int firstIndex = pageCountText.LastIndexOf('共') + 1;
            int lastIndex = pageCountText.LastIndexOf('页');
            if (lastIndex < firstIndex)
                return 0; // no closing "页" after the number: Substring would throw

            string digits = pageCountText.Substring(firstIndex, lastIndex - firstIndex);

            // TryParse instead of Convert.ToInt32 so an unexpected pager format yields 0 rather than an exception.
            int pageCount;
            bool parsed = int.TryParse(
                digits,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out pageCount);
            return parsed ? pageCount : 0;
        }
        /// <summary>
        /// Click handler of the "start collecting" button.
        /// Restarts the stopwatch, then on a background task gathers the URL of every
        /// article from each list page of the blog; a continuation starts one
        /// <c>CollectArticle</c> task per URL and registers <c>TaskEnded</c> to run once
        /// all of them have finished.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnCollect_Click(object sender, EventArgs e)
        {
            stopwatch.Restart();
            Task.Factory.StartNew(() =>
            {
                cnt = 0;
                // Drop URLs left over from a previous click; otherwise every earlier
                // article would be collected again on each new run.
                articleUrlList.Clear();

                int pageCount = GetPageCount(BLOGURL + "/" + BLOGUSER);
                if (pageCount == 0)
                    return;

                // Gather the URL of every article, one list page at a time.
                for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
                {
                    string pageIndexUrl = BLOGURL + "/" + BLOGUSER + "/article/list/" + pageIndex.ToString();
                    HtmlNode rootNode = GetHtmlNodeByUrl(pageIndexUrl, Encoding.UTF8);
                    if (rootNode == null)
                        continue;

                    // SelectNodes returns null (not an empty collection) when nothing matches.
                    HtmlNodeCollection articleLinkNodes = rootNode.SelectNodes(ARTICLEURLPATH);
                    if (articleLinkNodes == null)
                        continue;

                    foreach (HtmlNode articleUrlNode in articleLinkNodes)
                    {
                        // Links without an href cannot be turned into an article URL; skip them.
                        string href = articleUrlNode.GetAttributeValue("href", string.Empty);
                        if (string.IsNullOrEmpty(href))
                            continue;

                        articleUrlList.Add(BLOGURL + href);
                    }
                }
            }).ContinueWith((x) =>
            {
                // ContinueWhenAll throws ArgumentException for an empty task array, so stop
                // here when no article URL was found. TaskEnded is not invoked in that case.
                if (articleUrlList.Count == 0)
                    return;

                TaskFactory taskFactory = new TaskFactory();
                Task[] tasks = new Task[articleUrlList.Count];
                for (int i = 0; i < articleUrlList.Count; i++)
                {
                    tasks[i] = new Task(CollectArticle, articleUrlList[i]);
                    tasks[i].Start();
                }
                taskFactory.ContinueWhenAll(tasks, TaskEnded, TaskContinuationOptions.None);
            });
        }