日期:2014-05-18  浏览次数:20997 次

获取网页的html
在做一个抓取网页信息的小软件,对方网页使用url重写,不能获得真实的url,我该怎么获取他的html呢?源码是
HTML code
<a name="EntrezSystem2.PEntrez.Pubmed.Pubmed_ResultsPanel.Entrez_Pager.Page" title="Next page of results" class="active page_link next" href="#" sid="3" page="2" id="EntrezSystem2.PEntrez.Pubmed.Pubmed_ResultsPanel.Entrez_Pager.Page">Next &gt;</a>

如何获取next的html,最好有代码,给个思路也行

------解决方案--------------------
DOM 有了解吗?它是用来解析和操作 HTML 内容的工具。
------解决方案--------------------
也可以看看这个,这是我之前写的代码:获取 html 的所有数据,并遍历到一个 treeview 上。可以自己学习一下。
 private void button1_Click(object sender, EventArgs e)
{
string test = Application.StartupPath + "WriteLines.html";
string textResult = Convert(test );
MessageBox.Show(textResult );
}
public static string Convert(string html)
{
if (string.IsNullOrEmpty(html.Trim()))
{
return string.Empty;
}
using (SgmlReader reader = new SgmlReader())
{
reader.DocType = "HTML";
reader.InputStream = new StringReader(html);
using (StringWriter stringWriter = new StringWriter())
{
using (XmlTextWriter writer = new XmlTextWriter(stringWriter))
{
reader.WhitespaceHandling = WhitespaceHandling.None;
writer.Formatting = Formatting.Indented;
XmlDocument doc = new XmlDocument();
doc.Load(reader);
doc.Save("c:\\txt.xml");
if (doc.DocumentElement == null)
{
return string.Empty;
}
else
{
doc.DocumentElement.WriteContentTo(writer);
}
writer.Close();
string xhtml = stringWriter.ToString();
return xhtml;
}
}
}
}

/// <summary>
/// Navigates the embedded browser control to the address typed into textBox1.
/// </summary>
private void button2_Click(object sender, EventArgs e)
{
    // Navigate takes its optional arguments by reference, so they have to
    // live in locals. The same empty-string object is passed for the last
    // three arguments.
    object navigateFlags = 0;
    object emptyArgument = "";

    string address = textBox1.Text;
    axWebBrowser1.Navigate(address, ref navigateFlags, ref emptyArgument, ref emptyArgument, ref emptyArgument);
}

/// <summary>
/// Runs when the browser control reports that a document has finished
/// loading. Lists the outer HTML of every link of the document in listBox1
/// and appends the same lines to WriteLines.html in the start-up directory.
/// </summary>
private void axWebBrowser1_DocumentComplete(object sender, AxSHDocVw.DWebBrowserEvents2_DocumentCompleteEvent e)
{
    IHTMLDocument2 HTMLDocument = (IHTMLDocument2)axWebBrowser1.Document;
    IHTMLElementCollection links = HTMLDocument.links;

    listBox1.Items.Clear();
    string uspath = Application.StartupPath + "\\WriteLines.html";

    // The writer is opened in append mode and closed exactly once by the
    // using block. Closing it inside the loop made the second WriteLine
    // throw ObjectDisposedException.
    using (StreamWriter sw = new StreamWriter(uspath, true))
    {
        // NOTE(review): the cast assumes every entry of the links collection
        // is an anchor element - confirm for pages that contain <area> links.
        foreach (HTMLAnchorElementClass el in links)
        {
            listBox1.Items.Add(el.outerHTML);
            sw.WriteLine(el.outerHTML);
        }
    }
}

/// <summary>
/// Reads WriteLines.html from the start-up directory line by line, runs each
/// line through <see cref="Convert"/>, and then shows every line that was
/// read in its own message box.
/// </summary>
private void button3_Click(object sender, EventArgs e)
{
    string uspath = Application.StartupPath + "\\WriteLines.html";
    ArrayList arlist = new ArrayList();

    // using guarantees the reader is closed even if Convert throws.
    using (StreamReader objreder = new StreamReader(uspath))
    {
        string sling;

        // ReadLine returns null at end of file; stop before handing that
        // null to Convert, which previously crashed on it.
        while ((sling = objreder.ReadLine()) != null)
        {
            // The return value is not used here; the call is kept so that
            // Convert's own side effects still happen for each line.
            Convert(sling);

            // Collect the raw line for display.
            arlist.Add(sling);
        }
    }

    foreach (string strout in arlist)
    {
        MessageBox.Show(strout);
    }
}

}
------解决方案--------------------
HtmlAgilityPack 这个玩意 也可以。
------解决方案--------------------
Fiddler2工具可以获取真实的url
------解决方案--------------------
探讨

引用:

也看这个 我之前代码,获得html的所有数据 遍历到一个treeview上。自己学习下
private void button1_Click(object sender, EventArgs e)