ACE之网页链接提取程序的方法
一个简单的、用于提取网页上链接的小程序
#include<cstring>
#include<fstream>
#include<iostream>
#include<string>
#include"boost/regex.hpp"
#include"ace/INET_Addr.h"
#include"ace/SOCK_Stream.h"
#include"ace/SOCK_Connector.h"
#include "ace/Time_Value.h"
#include"ace/Log_Msg.h"
using namespace std;
//getHtml得到网页的内容
int getHtml(const char* ipaddr, char* recvbuf, unsigned len,char *pathname)
{
ACE_INET_Addr servaddr(80,ipaddr);
ACE_SOCK_Connector connector;
ACE_SOCK_Stream peer;
ACE_Time_Value sendTime(0,100);
ACE_Time_Value recvTime(0,1000);
if(connector.connect(peer, servaddr) == -1)
ACE_ERROR_RETURN((LM_ERROR,ACE_TEXT("%p/n"),ACE_TEXT("connect")),1);
char buff[512];
servaddr.addr_to_string(buff, 512);
std::cout<<buff<<std::endl;
iovec iov[3];
//填写HTTP请求命令
iov[0].iov_base = "GET ";
iov[0].iov_len = 4;
iov[1].iov_base = pathname;
iov[1].iov_len = strlen(pathname);
iov[2].iov_base =" HTTP/1.0/r/n/r/n";
iov[2].iov_len = 13;
if(peer.sendv_n(iov,3,&sendTime)== -1)
ACE_ERROR_RETURN((LM_ERROR,ACE_TEXT("(%p|%t) error in sending"),
ACE_TEXT("query to status server/n")),1);
ACE_OS::sleep(1);
if(peer.recv(recvbuf,len, &recvTime) == -1)
ACE_ERROR_RETURN((LM_ERROR,ACE_TEXT("(%p|%t) error in recving"),
ACE_TEXT("query to status server/n")),1);
return 0;
}
int ACE_TMAIN(int argc, ACE_TCHAR ** argv)
{
if(argc < 2)
cout<<"参数个数不够"<<endl;
const unsigned int BUFF_SIZE = 1024*64;
char buff[BUFF_SIZE];
char *pathname;
if(argc >=3)
{
pathname = argv[2];
}
else
{
pathname = "/";
}
getHtml(argv[1],buff,BUFF_SIZE,pathname);
cout<<buff<<endl;
boost::smatch m;
boost::regex reg("(((href)|(src))=.*?)(>)",boost::regex::icase);
string str(buff);
//寻找匹配
boost::sregex_iterator it(str.begin(),str.end(),reg);
boost::sregex_iterator end;
string filename(argv[1]);
filename += ".txt";
ofstream out(filename.c_str());
//输出到文件
for(;it != end;++it)
{
string trim = it->str();
string path(string(argv[1]) + pathname);
cout<<*it<<endl;
int pos = trim.find(' ');
if(pos >= 0)
{
trim = trim.substr(0,pos + 1);
}
pos = trim.find("http:");
if(pos >= 0)
{
trim = trim.substr(pos);
trim.erase(trim.size() - 1);
}
else
{
pos = trim.find_first_of('=');
trim = trim.substr(pos + 2);
trim.erase(trim.size() - 1);
if(trim[0] == '/')
{
string temp(trim.begin() +1 ,trim.end());
trim = temp;
}
path = "http://" + path;
if(*(path.end() - 1) == ' ')
path.erase(path.end() - 1);
trim = path + trim;
}
if(trim[(trim.size() - 1)] == '/"'||trim[(trim.size() - 1)] == '/'')
trim.erase(trim.size() - 1);
out<<trim<<endl;
}
return 0;
}
输入 程序名 wjl.scu.edu.cn /soft/
得出这些链接
http://wjl.scu.edu.cn/soft/images/logotop1.gif
http://wjl.scu.edu.cn/soft/images/menuleft.gif
http://wjl.scu.edu.cn/soft/
http://wjl.scu.edu.cn/soft/images/menu_mid.gif
http://wjl.scu.edu.cn/soft/music/
http://wjl.scu.edu.cn/soft/images/menu_mid.gif
http://wjl.scu.edu.cn/soft/soft/
http://wjl.scu.edu.cn/soft/images/menu_mid.gif
http://wjl.scu.edu.cn/soft/news/
http://wjl.scu.edu.cn/soft/images/menu_mid.gif
http://wjl.scu.edu.cn/soft/movie/
http://wjl.scu.edu.cn/soft/images/menu_mid.gif
http://wjl.scu.edu.cn/soft/original/
http://wjl.scu.edu.cn/soft/images/spacer.gif
http://wjl.scu.edu.cn/soft/images/search.gif
http://wjl.scu.edu.cn/soft/images/topbg1.gif
http://wjl.scu.edu.cn/soft/categories.php
http://wjl.scu.edu.cn/soft/images/soft5_r2_c1.jpg
http://wjl.scu.edu.cn/soft/images/spacer.gif
http://wjl.scu.edu.cn/soft/images/sm/smallico_r1_c5.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=%B3%A3%D3%C3%B9%A4%BE%DF
http://wjl.scu.edu.cn/soft/images/sm/smallico_r4_c4.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=操作系统
http://wjl.scu.edu.cn/soft/images/sm/smallico_r6_c5.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=图形图像
http://wjl.scu.edu.cn/soft/images/sm/smallico_r8_c4.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=媒体工具
http://wjl.scu.edu.cn/soft/images/sm/smallico_r10_c5.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=硬件驱动
http://wjl.scu.edu.cn/soft/images/sm/smallico_r12_c6.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=网络工具
http://wjl.scu.edu.cn/soft/images/sm/smallico_r19_c5.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=书籍教程
http://wjl.scu.edu.cn/soft/images/sm/smallico_r16_c2.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=休闲娱乐
http://wjl.scu.edu.cn/soft/images/sm/smallico_r20_c1.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=电脑编程
http://wjl.scu.edu.cn/soft/images/sm/smallico_r22_c3.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=其它软件
http://wjl.scu.edu.cn/soft/images/sm/smallico_r22_c3.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=软件原码
http://wjl.scu.edu.cn/soft/images/sm/smallico_r22_c3.jpg
http://wjl.scu.edu.cn/soft/search.php?selecttype=原创开发者专栏
http://wjl.scu.edu.cn/soft/images/sm/smallico_r24_c4.jpg
本文地址:http://www.45fan.com/a/luyou/73855.html