45fan.com - 路饭网

搜索: 您的位置主页 > 网络频道 > 阅读资讯:怎么样通过c#抓取网页分析?

怎么样通过c#抓取网页分析?

2016-09-02 14:05:14 来源:www.45fan.com 【

怎么样通过c#抓取网页分析?

c#抓取网页分析

目的:

抓取网页,分析网页内容,进行处理获取信息。

例子:

抓km169上的adsl用户的费用信息,分析存储到本地数据库。

步骤:1、抓取。2、分析。3、存储。

王暴徒 2006-2-13 05:48
1抓取

/// <summary>
/// Sends <paramref name="postData"/> to <paramref name="url"/> as a form-encoded POST
/// and returns the response body as text.
/// </summary>
/// <param name="url">Address to post to; it must resolve to an HTTP(S) request.</param>
/// <param name="postData">Request body, sent as application/x-www-form-urlencoded.</param>
/// <param name="err">Empty on success; otherwise the message describing the failure.</param>
/// <returns>The response body decoded with Encoding.Default, or an empty string on failure.</returns>
public string GetPage(string url, string postData, out string err)
{
    err = string.Empty;

    // The same encoding is used to encode the request body and to decode the response.
    Encoding encoding = Encoding.Default;

    try
    {
        // Encoding happens inside the try so that a null body is reported through err
        // like every other failure, instead of escaping as an exception.
        byte[] data = encoding.GetBytes(postData);

        HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
        if (request == null)
        {
            err = "The URL does not use the HTTP or HTTPS scheme.";
            return string.Empty;
        }

        request.CookieContainer = new CookieContainer();
        request.AllowAutoRedirect = true;
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = data.Length;

        // Write the body; the using block closes the request stream even if Write fails.
        using (Stream outstream = request.GetRequestStream())
        {
            outstream.Write(data, 0, data.Length);
        }

        // Response, stream and reader are all disposed on every path, so a failure
        // while reading can no longer leak the underlying connection.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream instream = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(instream, encoding))
        {
            // The resulting page (HTML) source
            return sr.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        err = ex.Message;
        return string.Empty;
    }
}

[[i] Last edited by 王暴徒 on 2006-2-13 at 13:49 [/i]]

王暴徒 2006-2-13 05:56
2、分析

/// <summary>
/// Fetches the page at KMADSLURL via GetPage and saves one Rec for every
/// table_det("...", "...", ...); call found in the returned text.
/// </summary>
/// <returns>Always null.</returns>
public string Get()
{
    string str = GetPage(KMADSLURL, strReq, out err);

    // One quoted argument: optional newlines/whitespace, then "..." captured without the quotes.
    const string FieldPattern = "\\n*\\s*\"([^\"]*)\"";

    // table_det( "f1", "f2", ... "f11" );  -- eleven quoted, comma-separated arguments.
    const int FieldCount = 11;
    string[] fields = new string[FieldCount];
    for (int i = 0; i < FieldCount; i++)
    {
        fields[i] = FieldPattern;
    }

    Regex rgx = new Regex("table_det\\(" + string.Join(",", fields) + "\\);", RegexOptions.Singleline);

    foreach (Match m in rgx.Matches(str))
    {
        Rec r = new Rec();

        // Only the first captured field is stored here.
        r.str1 = m.Groups[1].Value;
        r.Save();
    }

    return null;
}

此处的关键在于正则表达式,利用匹配关系获得一条条记录,再用%1~%9分组,得到每个字段的内容,最后生成相应的记录即可(拼sql也可),这里用了个持久化的咚咚,下次详细说。

正则技巧:用^(间隔符号)来划分字段,:)不大好解释,大家自己体会下吧。

[[i] Last edited by 王暴徒 on 2006-2-13 at 13:58 [/i]]

Timothy 2006-2-14 01:44

我以前写了个多线程批量下载歌曲的程序,当时程序考虑的是挂接百度,同时又预留了扩展性,比如通过配置也可以获取雅虎的歌曲,这就要考虑到各个网站网页的编码方式。和暴徒的一样,我也是用了HttpWebResponse类。通过对各种编码的网页在二进制下面的观察,发现前2个字节不同,所以转换成string的时候需要特殊处理,否则中文会有乱码。

以下是我对几种常用的编码进行的分析

// Choose the decoder from the byte-order mark (BOM) at the start of the buffer.
// Buffers too short to hold a BOM fall through to the default ANSI code page
// instead of throwing IndexOutOfRangeException.
// NOTE(review): the BOM bytes are passed to the decoder as well, so the decoded
// string presumably starts with U+FEFF - confirm whether callers expect that.

// UTF-16 little endian (BOM FF FE)
if (b.Length >= 2 && b[0] == 0xFF && b[1] == 0xFE)
{
    return System.Text.Encoding.Unicode.GetString(b, 0, b.Length);
}
// UTF-16 big endian (BOM FE FF)
else if (b.Length >= 2 && b[0] == 0xFE && b[1] == 0xFF)
{
    return System.Text.Encoding.BigEndianUnicode.GetString(b, 0, b.Length);
}
// UTF-8 (BOM EF BB BF) - all three bytes are checked, not just the first two
else if (b.Length >= 3 && b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF)
{
    return System.Text.Encoding.UTF8.GetString(b, 0, b.Length);
}
// No BOM: default ANSI code page
else
{
    return System.Text.Encoding.Default.GetString(b, 0, b.Length);
}

 

Timothy 2006-2-14 01:49
其中b是网页源代码的以二进制方式读取的数组

王暴徒 2006-2-14 03:39
小强~呵呵,

Timothy 2006-2-14 05:38
这样的也算小强阿,我一向都把你当我的偶像来的:)

王暴徒 2006-2-16 09:01
你是我偶像,把你那个程序详细分析下给大家看看嘛。

Timothy 2006-2-16 10:14
我开始是想把它完善,然后和我的网站绑定,然后通过在程序开始或者结束弹出广告

来赚钱,当我完成了一个模型的时候,就是能够从百度对它的几个分类(TOP500什么的)进行批量下载的时候,

我同事说已经有人作了,我下载了看了下挺好的,不过还不人性化,有些需求没有考虑到,

比如只能挂接BAIDU,没有灵活配置的接口,使可以挂接其他网站,比如古典音乐

网站什么的.但是很小,才300k,而我的.net作的东西太大了,有FRAMEWORK都快接近25M了,

别人说什么也不会用,所以就暂停了,等我有时间多用VC重新弄个人性化的来玩玩.

现在要弄微软路线的BI了,看看数据挖掘和报表服务,耽误了工作以后就麻烦了.

百度也过分,知道有人老是通过它来下载歌曲,源代码的结构经常改,有两个办法,一个是

你也经常去分析他的源代码,发现变了,你赶快修改你的配置文件,其中配置文件放在

你的网站上,下载歌曲的程序运行时通过后台线程去读取它.这样就是累.另外一个办法就是

通过人工智能分析,比如读取TOP500页面后,上面有几百首歌,让程序自动去分析那些是

歌曲的显示名称,那些是连接的URL,哪些是序号什么的,还有歌词URL,这样就对百度不变应万变了,

呵呵,需要一定技术

今天玩台球去了,明天贴点源代码,主要是太多了,除非画个UML图.

Timothy 2006-2-17 01:32
我开始也写了和你一样的单线程下载的类,不过后来通过增加一个委托,使它能够实现同步和异步的调用。后来我在网上下载了一个兄弟的源代码,他的是异步多线程调用,不过代码有BUG,不能多线程下载,但是架构很优美。我花了一个晚上时间把它修改了一下,为了错误重试我个人方便,我增加了个别方法,破坏了他的封装性。有兴趣看他的代码的朋友可以在下面看:

namespace Mp3Crazy

{

using System;

/// <summary>
/// Event data for an exception event: the exception itself, the download state
/// it belongs to, and an action value that can be read and set on the args.
/// </summary>
public class ExceptionEventArgs : System.EventArgs
{
    private System.Exception _error;
    private DownLoadState _state;
    private ExceptionActions _action;

    /// <summary>
    /// Stores the exception and the download state; the action keeps its default value.
    /// </summary>
    internal ExceptionEventArgs(System.Exception e, DownLoadState DownloadState)
    {
        _error = e;
        _state = DownloadState;
    }

    /// <summary>The download state passed to the constructor.</summary>
    public DownLoadState DownloadState
    {
        get { return _state; }
    }

    /// <summary>The exception passed to the constructor.</summary>
    public Exception Exception
    {
        get { return _error; }
    }

    /// <summary>Action value attached to this event; readable and writable.</summary>
    public ExceptionActions ExceptionAction
    {
        get { return _action; }
        set { _action = value; }
    }
}

/// <summary>
/// Event data for a download event; wraps the download state it was created with.
/// </summary>
public class DownLoadEventArgs : System.EventArgs
{
    private DownLoadState _state;

    /// <summary>Creates the event data around the given download state.</summary>
    public DownLoadEventArgs(DownLoadState DownloadState)
    {
        _state = DownloadState;
    }

    /// <summary>The download state passed to the constructor.</summary>
    public DownLoadState DownloadState
    {
        get { return _state; }
    }
}

/// <summary>
/// Event data for the thread-process event; carries the id string it was created with.
/// </summary>
public class ThreadProcessEventArgs : System.EventArgs
{
    private string _id;

    /// <summary>Creates the event data with the given id.</summary>
    /// <param name="id">Identifier to attach to the event.</param>
    public ThreadProcessEventArgs(string id)
    {
        this._id = id;
    }

    /// <summary>
    /// The id passed to the constructor. Previously the value was stored but
    /// could not be read back by event subscribers.
    /// </summary>
    public string Id
    {
        get { return _id; }
    }
}

}
Timothy 2006-2-17 01:33
namespace Mp3Crazy

{

using System;

/// <summary>
/// Describes one piece of a download: the URLs and file names involved, the byte
/// position and length of the piece, and optionally its data or the callback
/// that downloads it.
/// </summary>
public class DownLoadState
{
    private readonly string _FileName;
    private readonly string _AttachmentName;
    private readonly int _Position;
    private readonly string _RequestURL;
    private readonly string _ResponseURL;
    private readonly int _Length;
    private readonly byte[] _Data;

    // Callback run by StartDownloadFileChunk; null unless supplied to the constructor.
    private readonly ThreadCallbackHandler _ThreadCallback;

    // Owning client, notified after the callback returns; set through httpWebClient.
    private HttpWebClient _hwc;

    /// <summary>Local file name passed to the constructor.</summary>
    public string FileName
    {
        get { return _FileName; }
    }

    /// <summary>Byte position passed to the constructor.</summary>
    public int Position
    {
        get { return _Position; }
    }

    /// <summary>Length in bytes passed to the constructor.</summary>
    public int Length
    {
        get { return _Length; }
    }

    /// <summary>Attachment name passed to the constructor.</summary>
    public string AttachmentName
    {
        get { return _AttachmentName; }
    }

    /// <summary>Request URL passed to the constructor.</summary>
    public string RequestURL
    {
        get { return _RequestURL; }
    }

    /// <summary>Response URL passed to the constructor.</summary>
    public string ResponseURL
    {
        get { return _ResponseURL; }
    }

    /// <summary>Data buffer passed to the constructor, or null if none was given.</summary>
    public byte[] Data
    {
        get { return _Data; }
    }

    /// <summary>Client that is notified when StartDownloadFileChunk finishes.</summary>
    public HttpWebClient httpWebClient
    {
        get { return this._hwc; }
        set { this._hwc = value; }
    }

    /// <summary>Creates a state that also carries the downloaded bytes.</summary>
    internal DownLoadState(string RequestURL, string ResponseURL, string FileName, string AttachmentName, int Position, int Length, byte[] Data)
        : this(RequestURL, ResponseURL, FileName, AttachmentName, Position, Length)
    {
        this._Data = Data;
    }

    /// <summary>Creates a state that carries the callback used to download the chunk.</summary>
    internal DownLoadState(string RequestURL, string ResponseURL, string FileName, string AttachmentName, int Position, int Length, ThreadCallbackHandler tch)
        : this(RequestURL, ResponseURL, FileName, AttachmentName, Position, Length)
    {
        this._ThreadCallback = tch;
    }

    /// <summary>Creates a state holding only the descriptive values.</summary>
    internal DownLoadState(string RequestURL, string ResponseURL, string FileName, string AttachmentName, int Position, int Length)
    {
        this._RequestURL = RequestURL;
        this._ResponseURL = ResponseURL;
        this._FileName = FileName;
        this._AttachmentName = AttachmentName;
        this._Position = Position;
        this._Length = Length;
    }

    /// <summary>
    /// Runs the callback with this state's request URL, file name, position and
    /// length, then notifies the owning client. Does nothing when no callback was supplied.
    /// </summary>
    internal void StartDownloadFileChunk()
    {
        if (this._ThreadCallback == null)
        {
            return;
        }

        this._ThreadCallback(this._RequestURL, this._FileName, this._Position, this._Length);

        // httpWebClient is optional; without this guard a state whose client was never
        // set would throw NullReferenceException after the chunk had been processed.
        if (this._hwc != null)
        {
            this._hwc.OnThreadProcess("");
        }
    }
}

}
Timothy 2006-2-17 01:33
/* .Net/C#: 实现支持断点续传多线程下载的工具类

* Reflector 了一下 System.Net.WebClient ,改写或增加了若干:

* DownLoad、Upload 相关方法!

* 增加了 DataReceive、ExceptionOccurrs事件

*/

namespace Mp3Crazy

{

using System;

using System.IO;

using System.Net;

using System.Text;

using System.Security;

using System.Threading;

using System.Collections.Specialized;

/// <summary>
/// Signature of the method a worker thread runs through DownLoadState; it matches
/// HttpWebClient.DownloadFileChunk.
/// </summary>
/// <param name="S">Request URL (DownLoadState passes its request URL).</param>
/// <param name="s">Local file name (DownLoadState passes its file name).</param>
/// <param name="I">Byte position (DownLoadState passes its position).</param>
/// <param name="i">Length in bytes (DownLoadState passes its length).</param>
public delegate void ThreadCallbackHandler(
    string S,
    string s,
    int I,
    int i);

/// <summary>
/// Actions that can be selected through ExceptionEventArgs.ExceptionAction.
/// Throw is the zero value, so it is what an unset action holds.
/// </summary>
public enum ExceptionActions
{
    Throw = 0,
    CancelAll = 1,
    Ignore = 2,
    Retry = 3
}

/// <summary>

/// 支持断点续传多线程下载的类

/// </summary>

public class HttpWebClient

{

/// <summary>Handler signature for the ExceptionOccurrs event.</summary>
public delegate void ExceptionEventHandler(HttpWebClient Sender, ExceptionEventArgs e);

/// <summary>Raised when a download method catches an exception.</summary>
public event ExceptionEventHandler ExceptionOccurrs;

/// <summary>Handler signature for the ThreadProcessEnd event.</summary>
public delegate void ThreadProcessEventHandler(HttpWebClient Sender, ThreadProcessEventArgs e);

/// <summary>Raised by OnThreadProcess when a thread has finished its processing.</summary>
public event ThreadProcessEventHandler ThreadProcessEnd;

// Total size of the file being downloaded; assigned by DownloadFile.
private int _FileLength;

// Reset to 0 at the start of DownloadFile.
private int _getLength;

// Assigned to HttpWebRequest.Timeout by DownloadFile.
public int TimeOut = 20000;

public int SongID = 0;

public bool UrlParsed;

public string FileName;

public bool Free = true;

public int RetryTimes;

public int TBlocks = 1;

public int curBlock;

/// <summary>
/// Total file length recorded by DownloadFile.
/// </summary>
public int FileLength
{
    get { return this._FileLength; }
}

/// <summary>
/// Current value of the internal _getLength counter, which DownloadFile resets to 0.
/// </summary>
public int GetLength
{
    get { return this._getLength; }
}

[[i] Last edited by Timothy on 2006-2-17 at 09:48 [/i]]

Timothy 2006-2-17 01:34
/// <summary>
/// Downloads a file in chunks: the first chunk is read on the calling thread from
/// the initial response, every further chunk is downloaded on its own thread.
/// Failures are reported through the ExceptionOccurrs event rather than thrown.
/// </summary>
/// <param name="Address">URL of the file.</param>
/// <param name="FileName">Local path and file name to save to.</param>
/// <param name="ChunksCount">Requested number of chunks (one thread per chunk); chunks are at least 128 KB.</param>
public void DownloadFile(string Address, string FileName, int ChunksCount)
{
    const int MinChunkSize = 2 * 64 * 1024; // a chunk is at least 128 KB

    int p = 0; // start position of the next chunk
    int s = 0; // size of the current chunk
    _getLength = 0;
    string a = null; // attachment name
    HttpWebResponse hwrp = null;

    try
    {
        HttpWebRequest hwrq = (HttpWebRequest) WebRequest.Create(this.GetUri(Address));
        hwrq.Timeout = TimeOut;

        // Credentials only take effect if assigned before the request is sent.
        hwrq.Credentials = this.m_credentials;

        hwrp = (HttpWebResponse) hwrq.GetResponse();

        // An unknown (-1) or oversized content length is clamped to Int32.MaxValue.
        long L = hwrp.ContentLength;
        if (L == -1 || L > int.MaxValue)
        {
            L = int.MaxValue;
        }

        int l = (int) L;
        this._FileLength = l;

        // Attachment name from Content-Disposition; fall back to the local file name
        // when the header is missing or carries no "filename=" part.
        a = hwrp.Headers["Content-Disposition"];
        int nameIndex = (a == null) ? -1 : a.LastIndexOf("filename=", StringComparison.Ordinal);
        if (nameIndex >= 0)
        {
            a = a.Substring(nameIndex + 9);
        }
        else
        {
            a = FileName;
        }

        // The Accept-Ranges check is intentionally not performed: ranged chunking is always attempted.
        s = l / ChunksCount;
        if (s < MinChunkSize)
        {
            s = MinChunkSize;
        }

        // Size of the chunk handled on this thread. It equals the base chunk size unless
        // the first chunk is also the last one and therefore absorbs the remainder.
        int firstChunkSize = s;
        int i = 0;

        while (l >= s)
        {
            l -= s;

            // The last chunk absorbs whatever is left over.
            if (l < s)
            {
                s += l;
            }

            if (i++ > 0)
            {
                DownLoadState x = new DownLoadState(Address, hwrp.ResponseUri.AbsolutePath, FileName, a, p, s, new ThreadCallbackHandler(this.DownloadFileChunk));
                x.httpWebClient = this;

                // One thread per additional chunk
                Thread t = new Thread(new ThreadStart(x.StartDownloadFileChunk));
                t.Start();
            }
            else
            {
                firstChunkSize = s;
            }

            p += s;
        }

        // The first chunk is read from the response that is already open.
        s = firstChunkSize;
        this.ResponseAsBytes(Address, hwrp, s, FileName);
        this.OnThreadProcess("");
    }
    catch (Exception e)
    {
        // Local copy so the handler cannot become null between the check and the call.
        ExceptionEventHandler handler = this.ExceptionOccurrs;
        if (handler != null)
        {
            string path = "";
            if (hwrp != null)
            {
                path = hwrp.ResponseUri.AbsolutePath;
            }

            DownLoadState x = new DownLoadState(Address, path, FileName, a, p, s);
            ExceptionEventArgs eea = new ExceptionEventArgs(e, x);
            handler(this, eea);
        }
    }
    finally
    {
        // Release the connection of the initial response on every path.
        if (hwrp != null)
        {
            hwrp.Close();
        }
    }
}

/// <summary>
/// Raises ThreadProcessEnd with the given id, if anyone is subscribed.
/// </summary>
/// <param name="id">Identifier placed in the event data.</param>
internal void OnThreadProcess(string id)
{
    // This method is called from worker threads, so take a local copy: a subscriber
    // removing itself between the null check and the call would otherwise cause a
    // NullReferenceException.
    ThreadProcessEventHandler handler = ThreadProcessEnd;
    if (handler != null)
    {
        ThreadProcessEventArgs tpea = new ThreadProcessEventArgs(id);
        handler(this, tpea);
    }
}

/// <summary>
/// Downloads one chunk of a file; callers can use it to build their own
/// multi-threaded or resumable download. Failures are reported through the
/// ExceptionOccurrs event rather than thrown.
/// </summary>
/// <param name="Address">URL of the file.</param>
/// <param name="FileName">Local path and file name to save to.</param>
/// <param name="FromPosition">Byte offset at which the requested range starts.</param>
/// <param name="Length">Chunk size, passed on to ResponseAsBytes.</param>
public void DownloadFileChunk(string Address, string FileName, int FromPosition, int Length)
{
    HttpWebResponse hwrp = null;
    string a = null; // attachment name

    try
    {
        HttpWebRequest hwrq = (HttpWebRequest) WebRequest.Create(this.GetUri(Address));

        // NOTE(review): the range is open-ended (from FromPosition to the end of the file);
        // Length is only handed to ResponseAsBytes - confirm that is intended.
        hwrq.AddRange(FromPosition);

        hwrp = (HttpWebResponse) hwrq.GetResponse();

        // Attachment name from Content-Disposition; fall back to the local file name
        // when the header is missing or carries no "filename=" part.
        a = hwrp.Headers["Content-Disposition"];
        int nameIndex = (a == null) ? -1 : a.LastIndexOf("filename=", StringComparison.Ordinal);
        if (nameIndex >= 0)
        {
            a = a.Substring(nameIndex + 9);
        }
        else
        {
            a = FileName;
        }

        this.ResponseAsBytes(Address, hwrp, Length, FileName);
    }
    catch (Exception e)
    {
        // Local copy so the handler cannot become null between the check and the call.
        ExceptionEventHandler handler = this.ExceptionOccurrs;
        if (handler != null)
        {
            // hwrp is still null when the request itself failed; dereferencing it here
            // would replace the original error with a NullReferenceException.
            string path = "";
            if (hwrp != null)
            {
                path = hwrp.ResponseUri.AbsolutePath;
            }

            DownLoadState x = new DownLoadState(Address, path, FileName, a, FromPosition, Length);
            ExceptionEventArgs eea = new ExceptionEventArgs(e, x);
            handler(this, eea);
        }
    }
    finally
    {
        // Release the connection on every path.
        if (hwrp != null)
        {
            hwrp.Close();
        }
    }
}

internal void ResponseAsBytes(string RequestURL, WebResponse Response, long Length, string FileName)

{

string a = null; //AttachmentName

int P = 0; //整个文件的位置指针

int num2 = 0;

try

{

a = Response.Headers["Content-Disposition"]; //attachment

if (a != null)

{

a = a.Substring(a.LastIndexOf("filename=") + 9);

}

int p = 0; //本块的位置指针

int num1=(int)Length;

byte[] buffer1 = new byte[30000];

string s = Response.Headers["Content-Range"];

if (s != null)

{

s = s.Replace("bytes ", "");

s = s.Substring(0, s.IndexOf("-"));

P = Convert.ToInt32(s);

}

Stream S = Response.GetResponseStream();

System.IO.FileStream sw = new System.IO.FileStream(FileName, System.IO.FileMode.OpenOrCreate, System.IO.FileAccess.ReadWrite, System.IO.FileShare.ReadWrite);

//Console.WriteLine("P:{0}",P);

do

{

num2 = S.Read(buffer1, 0,30000);

if (num2 > 0)

{

sw.Position = P;

&nb

本文地址:http://www.45fan.com/a/question/71253.html
Tags: 网页 目的 抓取
编辑:路饭网
关于我们 | 联系我们 | 友情链接 | 网站地图 | Sitemap | App | 返回顶部