45fan.com - 路饭网

搜索: 您的位置:主页 > 网络频道 > 阅读资讯:网络爬行者源代码介绍

网络爬行者源代码介绍

2016-08-31 06:01:28 来源:www.45fan.com

网络爬行者源代码介绍

import java.awt.*;

import java.awt.event.*;

import java.io.*;

import java.net.*;

import java.util.*;

import java.util.regex.*;

import javax.swing.*;

import javax.swing.table.*;

/**
 * A small Swing application that crawls the web from a start URL and lists
 * every page whose content contains all of the given search terms.
 *
 * <p>Only {@code http://} URLs are followed. Matching URLs are shown in a table
 * and appended to a log file. The crawl runs on a background thread; every
 * Swing component update is marshalled onto the event dispatch thread.
 *
 * <p>The code deliberately stays within pre-generics Java syntax, matching the
 * rest of the file. The simple name {@code List} is avoided because both
 * {@code java.awt.*} and {@code java.util.*} are imported on demand.
 */
public class SearchCrawler extends JFrame{

    /** Preset choices offered by the editable "max URLs" combo box. */
    private static final String[] MAX_URLS={"50","100","500","1000"};

    /** Value of {@code maxUrls} that means "crawl without a page limit". */
    private static final int NO_LIMIT=-1;

    /** Marker returned by {@link #parseMaxUrls(String)} for unusable input. */
    private static final int INVALID_MAX_URLS=Integer.MIN_VALUE;

    /** Finds anchor tags; group 1 captures the raw href value. */
    private static final Pattern LINK_PATTERN=Pattern.compile(
        "<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",Pattern.CASE_INSENSITIVE);

    /** Splits the search string into individual terms. */
    private static final Pattern WHITESPACE_PATTERN=Pattern.compile("[\\s]+");

    /** robots.txt field name that introduces a forbidden path prefix. */
    private static final String DISALLOW_FIELD="Disallow:";

    /** Cache of robots.txt disallow lists: lower-case host -> ArrayList of path prefixes. */
    private HashMap disallowListCache=new HashMap();

    // Search form controls.
    private JTextField startTextField;
    private JComboBox maxComboBox;
    private JCheckBox limitCheckBox;
    private JTextField logTextField;
    private JTextField searchTextField;
    private JCheckBox caseCheckBox;
    private JButton searchButton;

    // Crawl status controls.
    private JLabel crawlingLabel2;
    private JLabel crawledLabel2;
    private JLabel toCrawlLabel2;
    private JProgressBar progressBar;
    private JLabel matchesLabel2;

    /** Table listing the URLs that matched. */
    private JTable table;

    /**
     * True while a crawl is running. Written on the event dispatch thread and
     * polled by the crawl thread, hence volatile.
     */
    private volatile boolean crawling;

    /** Writer for the match log; opened per search, closed when it finishes. */
    private PrintWriter logFileWriter;

    /** Set once a log write failure has been reported, to avoid repeated dialogs. */
    private volatile boolean logWriteFailed;

    /** Builds the window: menu bar, search/status panel and the matches table. */
    public SearchCrawler(){
        setTitle("搜索爬行者");
        setSize(600,600);

        // Closing the window terminates the application.
        addWindowListener(new WindowAdapter(){
            public void windowClosing(WindowEvent e){
                actionExit();
            }
        });

        setJMenuBar(createMenuBar());
        getContentPane().setLayout(new BorderLayout());
        getContentPane().add(createSearchPanel(),BorderLayout.NORTH);
        getContentPane().add(createMatchesPanel(),BorderLayout.CENTER);
    }

    /** Creates the menu bar holding the File menu with its Exit item. */
    private JMenuBar createMenuBar(){
        JMenu fileMenu=new JMenu("文件");
        fileMenu.setMnemonic(KeyEvent.VK_F);

        JMenuItem fileExitMenuItem=new JMenuItem("退出",KeyEvent.VK_X);
        fileExitMenuItem.addActionListener(new ActionListener(){
            public void actionPerformed(ActionEvent e){
                actionExit();
            }
        });
        fileMenu.add(fileExitMenuItem);

        JMenuBar menuBar=new JMenuBar();
        menuBar.add(fileMenu);
        return menuBar;
    }

    /** Creates the search form and the status read-outs, laid out with GridBagLayout. */
    private JPanel createSearchPanel(){
        JPanel panel=new JPanel(new GridBagLayout());
        GridBagConstraints c;

        addCaption(panel,"开始URL:",0);
        startTextField=new JTextField();
        addRowEnd(panel,startTextField,0);

        addCaption(panel,"最大抓取URL数(0表示不限制):",0);
        maxComboBox=new JComboBox(MAX_URLS);
        maxComboBox.setEditable(true);
        c=new GridBagConstraints();
        c.insets=new Insets(5,5,0,0);
        panel.add(maxComboBox,c);

        limitCheckBox=new JCheckBox("限制抓取开始URL站点");
        c=new GridBagConstraints();
        c.anchor=GridBagConstraints.WEST;
        c.insets=new Insets(0,10,0,0);
        panel.add(limitCheckBox,c);

        // An empty label consumes the rest of the row.
        c=new GridBagConstraints();
        c.gridwidth=GridBagConstraints.REMAINDER;
        panel.add(new JLabel(),c);

        addCaption(panel,"匹配日志文件:",0);
        String file=System.getProperty("user.dir")
            +System.getProperty("file.separator")
            +"crawler.log";
        logTextField=new JTextField(file);
        addRowEnd(panel,logTextField,0);

        addCaption(panel,"搜索字符串:",0);
        searchTextField=new JTextField();
        c=new GridBagConstraints();
        c.fill=GridBagConstraints.HORIZONTAL;
        c.insets=new Insets(5,5,0,0);
        c.gridwidth=2;
        c.weightx=1.0d;
        panel.add(searchTextField,c);

        caseCheckBox=new JCheckBox("大小写敏感");
        c=new GridBagConstraints();
        c.insets=new Insets(5,5,0,5);
        c.gridwidth=GridBagConstraints.REMAINDER;
        panel.add(caseCheckBox,c);

        searchButton=new JButton("搜索");
        searchButton.addActionListener(new ActionListener(){
            public void actionPerformed(ActionEvent e){
                actionSearch();
            }
        });
        c=new GridBagConstraints();
        c.gridwidth=GridBagConstraints.REMAINDER;
        c.insets=new Insets(5,5,5,5);
        panel.add(searchButton,c);

        addRowEnd(panel,new JSeparator(),5);

        addCaption(panel,"爬行:",0);
        crawlingLabel2=createValueLabel();
        addRowEnd(panel,crawlingLabel2,0);

        addCaption(panel,"已抓取的URL数:",0);
        crawledLabel2=createValueLabel();
        addRowEnd(panel,crawledLabel2,0);

        addCaption(panel,"爬行的URL数",0);
        toCrawlLabel2=createValueLabel();
        addRowEnd(panel,toCrawlLabel2,0);

        addCaption(panel,"正在爬行进度:",0);
        progressBar=new JProgressBar();
        progressBar.setMinimum(0);
        progressBar.setStringPainted(true);
        // The bar must fill horizontally and end its row; HORIZONTAL is a fill
        // value, not a grid width.
        addRowEnd(panel,progressBar,0);

        addCaption(panel,"搜索匹配:",10);
        matchesLabel2=createValueLabel();
        addRowEnd(panel,matchesLabel2,10);

        return panel;
    }

    /** Creates the titled panel that holds the scrollable matches table. */
    private JPanel createMatchesPanel(){
        table=new JTable(createMatchesModel());

        JPanel matchesPanel=new JPanel();
        matchesPanel.setBorder(BorderFactory.createTitledBorder("匹配"));
        matchesPanel.setLayout(new BorderLayout());
        matchesPanel.add(new JScrollPane(table),BorderLayout.CENTER);
        return matchesPanel;
    }

    /** Returns a fresh, empty, read-only single-column ("URL") table model. */
    private static DefaultTableModel createMatchesModel(){
        return new DefaultTableModel(new Object[][]{},new String[]{"URL"}){
            public boolean isCellEditable(int row,int column){
                return false;
            }
        };
    }

    /** Adds a right-aligned caption label that starts a form row. */
    private static void addCaption(JPanel panel,String text,int bottomInset){
        GridBagConstraints c=new GridBagConstraints();
        c.anchor=GridBagConstraints.EAST;
        c.insets=new Insets(5,5,bottomInset,0);
        panel.add(new JLabel(text),c);
    }

    /** Adds a component that stretches horizontally and ends the current row. */
    private static void addRowEnd(JPanel panel,Component component,int bottomInset){
        GridBagConstraints c=new GridBagConstraints();
        c.fill=GridBagConstraints.HORIZONTAL;
        c.gridwidth=GridBagConstraints.REMAINDER;
        c.insets=new Insets(5,5,bottomInset,5);
        panel.add(component,c);
    }

    /** Creates an empty status label rendered in a plain (non-bold) font. */
    private static JLabel createValueLabel(){
        JLabel label=new JLabel();
        label.setFont(label.getFont().deriveFont(Font.PLAIN));
        return label;
    }

    /**
     * Handles a click on the Search/Stop button. While a crawl is running the
     * click only requests a stop; otherwise the form is validated and, if it
     * is valid, a new crawl is started.
     */
    private void actionSearch(){
        // Acting as "Stop": the crawl loop notices the cleared flag and ends.
        if(crawling){
            crawling=false;
            return;
        }

        ArrayList errorList=new ArrayList();

        String startUrl=startTextField.getText().trim();
        if(startUrl.length()<1){
            errorList.add("没有起始URL");
        }else if(verifyUrl(startUrl)==null){
            errorList.add("非法的起始URL");
        }

        Object selectedMax=maxComboBox.getSelectedItem();
        int maxUrls=parseMaxUrls(selectedMax==null?null:selectedMax.toString());
        if(maxUrls==INVALID_MAX_URLS){
            errorList.add("非法最大URL数值");
        }

        String logFile=logTextField.getText().trim();
        if(logFile.length()<1){
            errorList.add("未填写日志文件");
        }

        String searchString=searchTextField.getText().trim();
        if(searchString.length()<1){
            errorList.add("未填写搜索字符串");
        }

        // Report all problems in one dialog, one per line.
        if(errorList.size()>0){
            StringBuffer message=new StringBuffer();
            for(int i=0;i<errorList.size();i++){
                if(i>0){
                    message.append("\n");
                }
                message.append(errorList.get(i));
            }
            showError(message.toString());
            return;
        }

        search(logFile,removeWwwFromUrl(startUrl),maxUrls,searchString);
    }

    /**
     * Interprets the text of the "max URLs" field.
     *
     * @param text raw field text, may be null
     * @return {@link #NO_LIMIT} for blank input or "0" (the form label promises
     *         that 0 means unlimited), the parsed value when positive, or
     *         {@link #INVALID_MAX_URLS} for anything else
     */
    private static int parseMaxUrls(String text){
        String trimmed=(text==null)?"":text.trim();
        if(trimmed.length()==0){
            return NO_LIMIT;
        }
        int value;
        try{
            value=Integer.parseInt(trimmed);
        }catch(NumberFormatException e){
            return INVALID_MAX_URLS;
        }
        if(value==0){
            return NO_LIMIT;
        }
        return value>0?value:INVALID_MAX_URLS;
    }

    /**
     * Starts a crawl on a background thread. Must be called on the event
     * dispatch thread. The log file is opened before the UI is switched into
     * "crawling" mode, so a failure to open it leaves the form usable.
     */
    private void search(final String logFile,final String startUrl,
            final int maxUrls,final String searchString){
        try{
            logFileWriter=new PrintWriter(new FileWriter(logFile));
        }catch(Exception e){
            showError("不能打开匹配日志文件");
            return;
        }
        logWriteFailed=false;

        // Read the options here, on the EDT, rather than from the worker.
        final boolean limitHost=limitCheckBox.isSelected();
        final boolean caseSensitive=caseCheckBox.isSelected();

        setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));
        setSearchControlsEnabled(false);
        searchButton.setText("停止");
        table.setModel(createMatchesModel());
        updateStats(startUrl,0,0,maxUrls);

        // Raised before the thread starts so a second click is always "Stop".
        crawling=true;

        Thread thread=new Thread(new Runnable(){
            public void run(){
                try{
                    crawl(startUrl,maxUrls,limitHost,searchString,caseSensitive);
                }finally{
                    // Restore the UI even if the crawl failed unexpectedly.
                    SwingUtilities.invokeLater(new Runnable(){
                        public void run(){
                            finishSearch();
                        }
                    });
                }
            }
        });
        thread.start();
    }

    /** Closes the log and returns the UI to its idle state; runs on the EDT. */
    private void finishSearch(){
        crawling=false;

        logFileWriter.close();
        // PrintWriter never throws; failures are only visible via checkError().
        if(logFileWriter.checkError() && !logWriteFailed){
            showError("不能关闭匹配日志文件");
        }

        crawlingLabel2.setText("结束");
        setSearchControlsEnabled(true);
        searchButton.setText("搜索");
        setCursor(Cursor.getDefaultCursor());

        if(table.getRowCount()==0){
            JOptionPane.showMessageDialog(this,"你的搜索字符串未被发现,请尝试其它","搜索字符串未被发现",JOptionPane.WARNING_MESSAGE);
        }
    }

    /** Enables or disables every input control of the search form. */
    private void setSearchControlsEnabled(boolean enabled){
        startTextField.setEnabled(enabled);
        maxComboBox.setEnabled(enabled);
        limitCheckBox.setEnabled(enabled);
        logTextField.setEnabled(enabled);
        searchTextField.setEnabled(enabled);
        caseCheckBox.setEnabled(enabled);
    }

    /** Terminates the application. */
    private void actionExit(){
        System.exit(0);
    }

    /**
     * Validates a URL string.
     *
     * @param url candidate URL, may be null
     * @return the parsed URL, or null when it is null, does not start with
     *         {@code http://} (case-insensitive) or is malformed
     */
    private static URL verifyUrl(String url){
        if(url==null || !hasPrefixIgnoreCase(url,"http://")){
            return null;
        }
        try{
            return new URL(url);
        }catch(MalformedURLException e){
            return null;
        }
    }

    /** Tells whether {@code text} starts with {@code prefix}, ignoring case. */
    private static boolean hasPrefixIgnoreCase(String text,String prefix){
        return text.regionMatches(true,0,prefix,0,prefix.length());
    }

    /**
     * Records a matching URL in the table (on the EDT) and in the log file.
     * A log failure is reported once per search rather than once per match.
     */
    private void addMatch(final String url){
        runOnEdt(new Runnable(){
            public void run(){
                ((DefaultTableModel)table.getModel()).addRow(new Object[]{url});
            }
        });

        PrintWriter writer=logFileWriter;
        boolean failed=(writer==null);
        if(!failed){
            writer.println(url);
            // checkError() also flushes, so matches survive an abrupt exit.
            failed=writer.checkError();
        }
        if(failed && !logWriteFailed){
            logWriteFailed=true;
            showError("未成功的日志匹配");
        }
    }

    /**
     * Refreshes the status read-outs and the progress bar on the EDT.
     *
     * @param currentUrl URL currently being crawled
     * @param crawled    number of pages crawled so far
     * @param toCrawl    number of URLs still queued
     * @param maxUrls    page limit, or -1 for none (the bar then scales to
     *                   crawled + queued)
     */
    private void updateStats(final String currentUrl,final int crawled,
            final int toCrawl,final int maxUrls){
        runOnEdt(new Runnable(){
            public void run(){
                crawlingLabel2.setText(currentUrl);
                crawledLabel2.setText(""+crawled);
                toCrawlLabel2.setText(""+toCrawl);
                progressBar.setMaximum(maxUrls==NO_LIMIT?crawled+toCrawl:maxUrls);
                progressBar.setValue(crawled);
                matchesLabel2.setText(""+table.getRowCount());
            }
        });
    }

    /** Runs the task now when already on the EDT, otherwise queues it there. */
    private static void runOnEdt(Runnable task){
        if(SwingUtilities.isEventDispatchThread()){
            task.run();
        }else{
            SwingUtilities.invokeLater(task);
        }
    }

    /**
     * Checks the host's robots.txt to see whether the URL may be crawled. The
     * disallow list is downloaded once per host and cached. Every Disallow
     * line is honoured regardless of the User-agent section it appears in.
     */
    private boolean isRobotAllowed(URL urlToCheck){
        String host=urlToCheck.getHost().toLowerCase();
        ArrayList disallowList=(ArrayList)disallowListCache.get(host);
        if(disallowList==null){
            disallowList=downloadDisallowList(host);
            disallowListCache.put(host,disallowList);
        }
        return !isPathDisallowed(urlToCheck.getFile(),disallowList);
    }

    /**
     * Downloads and parses {@code http://host/robots.txt}.
     *
     * @return the disallowed path prefixes; empty when the file is missing or
     *         unreadable, which is treated as "everything allowed"
     */
    private static ArrayList downloadDisallowList(String host){
        ArrayList disallowList=new ArrayList();
        BufferedReader reader=null;
        try{
            URL robotsFileUrl=new URL("http://"+host+"/robots.txt");
            reader=new BufferedReader(new InputStreamReader(robotsFileUrl.openStream()));
            String line;
            while((line=reader.readLine())!=null){
                String path=parseDisallowLine(line);
                if(path!=null){
                    disallowList.add(path);
                }
            }
        }catch(Exception e){
            // Best effort: no usable robots.txt means no restrictions.
            disallowList.clear();
        }finally{
            closeQuietly(reader);
        }
        return disallowList;
    }

    /**
     * Extracts the path from one robots.txt line.
     *
     * @return the trimmed path with any trailing '#' comment removed, or null
     *         when the line is not a Disallow line or its value is empty (an
     *         empty Disallow forbids nothing)
     */
    private static String parseDisallowLine(String line){
        String trimmed=line.trim();
        if(!hasPrefixIgnoreCase(trimmed,DISALLOW_FIELD)){
            return null;
        }
        String path=trimmed.substring(DISALLOW_FIELD.length());
        int commentIndex=path.indexOf('#');
        if(commentIndex!=-1){
            path=path.substring(0,commentIndex);
        }
        path=path.trim();
        return path.length()==0?null:path;
    }

    /** Tells whether the URL's file part starts with any disallowed prefix. */
    private static boolean isPathDisallowed(String file,ArrayList disallowList){
        // A URL without a path refers to the site root.
        String path=(file==null || file.length()==0)?"/":file;
        for(int i=0;i<disallowList.size();i++){
            if(path.startsWith((String)disallowList.get(i))){
                return true;
            }
        }
        return false;
    }

    /**
     * Downloads a page as text using the platform default charset.
     *
     * @return the page content, or null when it cannot be read
     */
    private String downloadPage(URL pageUrl){
        BufferedReader reader=null;
        try{
            reader=new BufferedReader(new InputStreamReader(pageUrl.openStream()));
            return readAll(reader);
        }catch(Exception e){
            // Best effort: an unreachable page is simply skipped by the caller.
            return null;
        }finally{
            closeQuietly(reader);
        }
    }

    /**
     * Reads the remaining lines of the reader. Each line is followed by '\n'
     * so that words and tags on adjacent lines do not run together.
     */
    private static String readAll(BufferedReader reader) throws IOException{
        StringBuffer pageBuffer=new StringBuffer();
        String line;
        while((line=reader.readLine())!=null){
            pageBuffer.append(line).append('\n');
        }
        return pageBuffer.toString();
    }

    /** Closes the reader if it is non-null, ignoring any failure to close. */
    private static void closeQuietly(Reader reader){
        if(reader==null){
            return;
        }
        try{
            reader.close();
        }catch(IOException ignored){
            // Nothing useful can be done about a failed close.
        }
    }

    /**
     * Removes a leading "www." host label from a URL, if present, so that
     * www.example.com and example.com are treated as the same site.
     */
    private static String removeWwwFromUrl(String url){
        String marker="://www.";
        int index=url.indexOf(marker);
        if(index==-1){
            return url;
        }
        return url.substring(0,index+3)+url.substring(index+marker.length());
    }

    /**
     * Extracts the crawlable links of a page.
     *
     * @param pageUrl      URL of the page, used to resolve relative links
     * @param pageContents downloaded page text
     * @param crawledList  URLs already crawled; these are left out
     * @param limitHost    when true, links to other hosts are left out
     * @return absolute http URLs with fragment and leading "www." removed, in
     *         page order (duplicates are possible)
     */
    private static ArrayList retrieveLinks(URL pageUrl,String pageContents,
            HashSet crawledList,boolean limitHost){
        ArrayList linkList=new ArrayList();
        Matcher m=LINK_PATTERN.matcher(pageContents);

        while(m.find()){
            String link=m.group(1).trim();

            // Skip empty links, in-page anchors, mail and script pseudo-links.
            if(link.length()<1 || link.charAt(0)=='#'){
                continue;
            }
            if(hasPrefixIgnoreCase(link,"mailto:")
                    || hasPrefixIgnoreCase(link,"javascript:")){
                continue;
            }

            // Resolve against the page URL; handles absolute, root-relative,
            // document-relative and protocol-relative forms.
            try{
                link=new URL(pageUrl,link).toString();
            }catch(MalformedURLException e){
                continue;
            }

            int fragmentIndex=link.indexOf('#');
            if(fragmentIndex!=-1){
                link=link.substring(0,fragmentIndex);
            }

            link=removeWwwFromUrl(link);

            URL verifiedLink=verifyUrl(link);
            if(verifiedLink==null){
                continue;
            }

            if(limitHost && !pageUrl.getHost().equalsIgnoreCase(verifiedLink.getHost())){
                continue;
            }

            if(crawledList.contains(link)){
                continue;
            }

            linkList.add(link);
        }
        return linkList;
    }

    /**
     * Tells whether the page contains every whitespace-separated term of the
     * search string.
     *
     * @param caseSensitive when false, page and terms are compared lower-cased
     */
    private static boolean searchStringMatches(String pageContents,
            String searchString,boolean caseSensitive){
        String searchContents=caseSensitive?pageContents:pageContents.toLowerCase();
        String[] terms=WHITESPACE_PATTERN.split(searchString);

        for(int i=0;i<terms.length;i++){
            String term=caseSensitive?terms[i]:terms[i].toLowerCase();
            if(searchContents.indexOf(term)==-1){
                return false;
            }
        }
        return true;
    }

    /**
     * Crawls breadth-first from the start URL, recording every page that
     * matches the search string. The loop only runs while the crawling flag
     * is set, so clearing the flag stops the crawl after the current page.
     *
     * @param startUrl      first URL to visit
     * @param maxUrls       maximum number of pages to crawl, or -1 for no limit
     * @param limitHost     when true, only links on the linking page's host are followed
     * @param searchString  whitespace-separated terms that must all be present
     * @param caseSensitive whether term matching is case-sensitive
     */
    public void crawl(String startUrl,int maxUrls,boolean limitHost,
            String searchString,boolean caseSensitive){
        HashSet crawledList=new HashSet();
        LinkedHashSet toCrawlList=new LinkedHashSet();
        toCrawlList.add(startUrl);

        while(crawling && !toCrawlList.isEmpty()){
            if(maxUrls>=0 && crawledList.size()>=maxUrls){
                break;
            }

            // Take the oldest queued URL (LinkedHashSet keeps insertion order).
            Iterator queue=toCrawlList.iterator();
            String url=(String)queue.next();
            queue.remove();

            URL verifiedUrl=verifyUrl(url);
            if(verifiedUrl==null || !isRobotAllowed(verifiedUrl)){
                continue;
            }

            updateStats(url,crawledList.size(),toCrawlList.size(),maxUrls);
            crawledList.add(url);

            String pageContents=downloadPage(verifiedUrl);
            if(pageContents!=null && pageContents.length()>0){
                toCrawlList.addAll(retrieveLinks(verifiedUrl,pageContents,crawledList,limitHost));
                if(searchStringMatches(pageContents,searchString,caseSensitive)){
                    addMatch(url);
                }
            }

            updateStats(url,crawledList.size(),toCrawlList.size(),maxUrls);
        }
    }

    /** Shows an error dialog; safe to call from any thread. */
    private void showError(final String message){
        runOnEdt(new Runnable(){
            public void run(){
                JOptionPane.showMessageDialog(SearchCrawler.this,message,"错误",JOptionPane.ERROR_MESSAGE);
            }
        });
    }

    /** Launches the crawler window on the event dispatch thread. */
    public static void main(String[] args){
        SwingUtilities.invokeLater(new Runnable(){
            public void run(){
                new SearchCrawler().setVisible(true);
            }
        });
    }

}

 

本文地址:http://www.45fan.com/a/question/70072.html
Tags: 网络 爬行者 SearchCrawler
编辑:路饭网
关于我们 | 联系我们 | 友情链接 | 网站地图 | Sitemap | App | 返回顶部