// 网络爬行者源代码介绍 (Introduction to the web crawler source code)
import java.awt.*;
import java.awt.event.*; import java.io.*; import java.net.*; import java.util.*; import java.util.regex.*; import javax.swing.*; import javax.swing.table.*;//一个Web的爬行者(注:爬行在这里的意思与抓取,捕获相同)
/**
 * A small Swing application that crawls the web starting from one URL and
 * records every page whose text contains all of the given search terms.
 *
 * <p>Matches are shown in a table and appended to a log file. Only
 * {@code http://} URLs are followed, and a host's {@code robots.txt}
 * {@code Disallow:} lines are honoured (user-agent sections are not
 * distinguished).
 *
 * <p>NOTE(review): the crawl runs on a background thread that updates Swing
 * components directly, as the original code did. Moving those updates onto
 * the event dispatch thread is left as a follow-up.
 */
public class SearchCrawler extends JFrame {

    /** Preset choices offered by the editable "maximum URLs" combo box. */
    private static final String[] MAX_URLS = {"50", "100", "500", "1000"};

    /** Value of {@code maxUrls} that means "no limit". */
    private static final int UNLIMITED = -1;

    /** Matches {@code <a href=...>} and captures the link target in group 1. */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]", Pattern.CASE_INSENSITIVE);

    /** Splits the search string into individual terms. */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("[\\s]+");

    /** Cache of robots.txt disallow lists, keyed by lower-cased host name. */
    private HashMap disallowListCache = new HashMap();

    // Search input controls.
    private JTextField startTextField;
    private JComboBox maxComboBox;
    private JCheckBox limitCheckBox;
    private JTextField logTextField;
    private JTextField searchTextField;
    private JCheckBox caseCheckBox;
    private JButton searchButton;

    // Search status controls.
    private JLabel crawlingLabel2;
    private JLabel crawledLabel2;
    private JLabel toCrawlLabel2;
    private JProgressBar progressBar;
    private JLabel matchesLabel2;

    /** Table listing the URLs that matched. */
    private JTable table;

    /**
     * True while a crawl is in progress. Written by the event thread (stop
     * button) and read by the crawl thread, hence volatile.
     */
    private volatile boolean crawling;

    /** Writer for the match log file; only valid while a crawl is running. */
    private PrintWriter logFileWriter;

    /** Builds the window: menu bar, search/status panel and matches table. */
    public SearchCrawler() {
        setTitle("搜索爬行者");
        setSize(600, 600);

        // Closing the window exits the application.
        addWindowListener(new WindowAdapter() {
            public void windowClosing(WindowEvent e) {
                actionExit();
            }
        });

        setJMenuBar(buildMenuBar());

        JPanel searchPanel = buildSearchPanel();
        JPanel matchesPanel = buildMatchesPanel();

        getContentPane().setLayout(new BorderLayout());
        getContentPane().add(searchPanel, BorderLayout.NORTH);
        getContentPane().add(matchesPanel, BorderLayout.CENTER);
    }

    /** Creates the menu bar containing the File menu with its Exit item. */
    private JMenuBar buildMenuBar() {
        JMenuBar menuBar = new JMenuBar();
        JMenu fileMenu = new JMenu("文件");
        fileMenu.setMnemonic(KeyEvent.VK_F);
        JMenuItem fileExitMenuItem = new JMenuItem("退出", KeyEvent.VK_X);
        fileExitMenuItem.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent e) {
                actionExit();
            }
        });
        fileMenu.add(fileExitMenuItem);
        menuBar.add(fileMenu);
        return menuBar;
    }

    /** Creates the top panel holding the search inputs and the status rows. */
    private JPanel buildSearchPanel() {
        JPanel searchPanel = new JPanel();
        GridBagLayout layout = new GridBagLayout();
        searchPanel.setLayout(layout);
        GridBagConstraints constraints;

        // Row: start URL.
        place(searchPanel, layout, new JLabel("开始URL:"), labelCell(0));
        startTextField = new JTextField();
        place(searchPanel, layout, startTextField, rowEndCell(0));

        // Row: maximum URLs + "limit to start host" check box.
        place(searchPanel, layout, new JLabel("最大抓取URL数(0表示不限制):"), labelCell(0));
        maxComboBox = new JComboBox(MAX_URLS);
        maxComboBox.setEditable(true);
        constraints = new GridBagConstraints();
        constraints.insets = new Insets(5, 5, 0, 0);
        place(searchPanel, layout, maxComboBox, constraints);

        limitCheckBox = new JCheckBox("限制抓取开始URL站点");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.WEST;
        constraints.insets = new Insets(0, 10, 0, 0);
        place(searchPanel, layout, limitCheckBox, constraints);

        // Empty label that only serves to terminate the row.
        constraints = new GridBagConstraints();
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        place(searchPanel, layout, new JLabel(), constraints);

        // Row: match log file, defaulting to crawler.log in the working dir.
        place(searchPanel, layout, new JLabel("匹配日志文件:"), labelCell(0));
        String file = System.getProperty("user.dir")
                + System.getProperty("file.separator")
                + "crawler.log";
        logTextField = new JTextField(file);
        place(searchPanel, layout, logTextField, rowEndCell(0));

        // Row: search string + case-sensitivity check box.
        place(searchPanel, layout, new JLabel("搜索字符串:"), labelCell(0));
        searchTextField = new JTextField();
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.insets = new Insets(5, 5, 0, 0);
        constraints.gridwidth = 2;
        constraints.weightx = 1.0d;
        place(searchPanel, layout, searchTextField, constraints);

        caseCheckBox = new JCheckBox("大小写敏感");
        constraints = new GridBagConstraints();
        constraints.insets = new Insets(5, 5, 0, 5);
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        place(searchPanel, layout, caseCheckBox, constraints);

        // Row: search/stop button.
        searchButton = new JButton("搜索");
        searchButton.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent e) {
                actionSearch();
            }
        });
        constraints = new GridBagConstraints();
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 5, 5);
        place(searchPanel, layout, searchButton, constraints);

        // Row: separator between inputs and status.
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 5, 5);
        place(searchPanel, layout, new JSeparator(), constraints);

        // Status rows.
        place(searchPanel, layout, new JLabel("爬行:"), labelCell(0));
        crawlingLabel2 = plainLabel();
        place(searchPanel, layout, crawlingLabel2, rowEndCell(0));

        place(searchPanel, layout, new JLabel("已抓取的URL数:"), labelCell(0));
        crawledLabel2 = plainLabel();
        place(searchPanel, layout, crawledLabel2, rowEndCell(0));

        place(searchPanel, layout, new JLabel("爬行的URL数"), labelCell(0));
        toCrawlLabel2 = plainLabel();
        place(searchPanel, layout, toCrawlLabel2, rowEndCell(0));

        place(searchPanel, layout, new JLabel("正在爬行进度:"), labelCell(0));
        progressBar = new JProgressBar();
        progressBar.setMinimum(0);
        progressBar.setStringPainted(true);
        // The bar must stretch and end its row; the original assigned
        // GridBagConstraints.HORIZONTAL to gridwidth instead of fill.
        place(searchPanel, layout, progressBar, rowEndCell(0));

        place(searchPanel, layout, new JLabel("搜索匹配:"), labelCell(10));
        matchesLabel2 = plainLabel();
        place(searchPanel, layout, matchesLabel2, rowEndCell(10));

        return searchPanel;
    }

    /** Creates the titled panel containing the scrollable matches table. */
    private JPanel buildMatchesPanel() {
        table = new JTable(createMatchesModel());
        JPanel matchesPanel = new JPanel();
        matchesPanel.setBorder(BorderFactory.createTitledBorder("匹配"));
        matchesPanel.setLayout(new BorderLayout());
        matchesPanel.add(new JScrollPane(table), BorderLayout.CENTER);
        return matchesPanel;
    }

    /** Returns an empty, read-only, single-column ("URL") table model. */
    private static DefaultTableModel createMatchesModel() {
        return new DefaultTableModel(new Object[][]{}, new String[]{"URL"}) {
            public boolean isCellEditable(int row, int column) {
                return false;
            }
        };
    }

    /** Constraints for a right-aligned caption label at the start of a row. */
    private static GridBagConstraints labelCell(int bottomInset) {
        GridBagConstraints c = new GridBagConstraints();
        c.anchor = GridBagConstraints.EAST;
        c.insets = new Insets(5, 5, bottomInset, 0);
        return c;
    }

    /** Constraints for a component that stretches and ends its row. */
    private static GridBagConstraints rowEndCell(int bottomInset) {
        GridBagConstraints c = new GridBagConstraints();
        c.fill = GridBagConstraints.HORIZONTAL;
        c.gridwidth = GridBagConstraints.REMAINDER;
        c.insets = new Insets(5, 5, bottomInset, 5);
        return c;
    }

    /** Creates an empty label rendered in a plain (non-bold) font. */
    private static JLabel plainLabel() {
        JLabel label = new JLabel();
        label.setFont(label.getFont().deriveFont(Font.PLAIN));
        return label;
    }

    /** Registers the constraints for a component and adds it to the panel. */
    private static void place(JPanel panel, GridBagLayout layout,
                              Component component, GridBagConstraints constraints) {
        layout.setConstraints(component, constraints);
        panel.add(component);
    }

    /**
     * Handles a click on the search/stop button. While a crawl is running
     * the click requests a stop; otherwise the inputs are validated and a
     * new crawl is started.
     */
    private void actionSearch() {
        // The button acts as "stop" while crawling.
        if (crawling) {
            crawling = false;
            return;
        }

        ArrayList errorList = new ArrayList();

        // A start URL is required and must be a valid http URL.
        String startUrl = startTextField.getText().trim();
        if (startUrl.length() < 1) {
            errorList.add("没有起始URL");
        } else if (verifyUrl(startUrl) == null) {
            errorList.add("非法的起始URL");
        }

        // Max URLs: empty or 0 means unlimited (as the field label states);
        // anything else must be a positive integer.
        int maxUrls = UNLIMITED;
        Object selected = maxComboBox.getSelectedItem();
        String max = (selected == null) ? "" : selected.toString().trim();
        if (max.length() > 0) {
            try {
                int parsed = Integer.parseInt(max);
                if (parsed < 0) {
                    errorList.add("非法最大URL数值");
                } else if (parsed > 0) {
                    maxUrls = parsed;
                }
            } catch (NumberFormatException e) {
                errorList.add("非法最大URL数值");
            }
        }

        // A log file name is required.
        String logFile = logTextField.getText().trim();
        if (logFile.length() < 1) {
            errorList.add("未填写日志文件");
        }

        // A search string is required.
        String searchString = searchTextField.getText().trim();
        if (searchString.length() < 1) {
            errorList.add("未填写搜索字符串");
        }

        // Report all validation errors at once, one per line.
        if (errorList.size() > 0) {
            StringBuffer message = new StringBuffer();
            for (int i = 0; i < errorList.size(); i++) {
                message.append(errorList.get(i));
                if (i + 1 < errorList.size()) {
                    message.append("\n");
                }
            }
            showError(message.toString());
            return;
        }

        // Normalise the start URL so "www.host" and "host" compare equal.
        startUrl = removeWwwFromUrl(startUrl);

        search(logFile, startUrl, maxUrls, searchString);
    }

    /**
     * Runs a crawl on a new thread, disabling the input controls for its
     * duration and restoring them afterwards (also when the log file cannot
     * be opened or the crawl fails unexpectedly).
     *
     * @param logFile      path of the file that receives matching URLs
     * @param startUrl     URL at which crawling starts
     * @param maxUrls      maximum number of pages to crawl, or -1 for no limit
     * @param searchString whitespace-separated terms to look for
     */
    private void search(final String logFile, final String startUrl,
                        final int maxUrls, final String searchString) {
        // Mark the crawl as running before the thread starts so a second
        // click is treated as "stop" rather than starting another crawl.
        crawling = true;

        Thread thread = new Thread(new Runnable() {
            public void run() {
                boolean finished = false;
                try {
                    setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));
                    setSearchControlsEnabled(false);
                    searchButton.setText("停止");

                    // Reset the matches table and the status display.
                    table.setModel(createMatchesModel());
                    updateStats(startUrl, 0, 0, maxUrls);

                    try {
                        logFileWriter = new PrintWriter(new FileWriter(logFile));
                    } catch (Exception e) {
                        showError("不能打开匹配日志文件");
                        return; // the outer finally restores the UI
                    }

                    try {
                        crawl(startUrl, maxUrls, limitCheckBox.isSelected(),
                                searchString, caseCheckBox.isSelected());
                    } finally {
                        crawling = false;
                        try {
                            logFileWriter.close();
                        } catch (Exception e) {
                            showError("不能关闭匹配日志文件");
                        }
                    }

                    crawlingLabel2.setText("结束");
                    finished = true;
                } finally {
                    crawling = false;
                    setSearchControlsEnabled(true);
                    searchButton.setText("搜索");
                    setCursor(Cursor.getDefaultCursor());
                }

                // Tell the user when the crawl completed without any match.
                if (finished && table.getRowCount() == 0) {
                    JOptionPane.showMessageDialog(SearchCrawler.this,
                            "你的搜索字符串未被发现,请尝试其它",
                            "搜索字符串未被发现",
                            JOptionPane.WARNING_MESSAGE);
                }
            }
        });
        thread.start();
    }

    /** Enables or disables all search input controls together. */
    private void setSearchControlsEnabled(boolean enabled) {
        startTextField.setEnabled(enabled);
        maxComboBox.setEnabled(enabled);
        limitCheckBox.setEnabled(enabled);
        logTextField.setEnabled(enabled);
        searchTextField.setEnabled(enabled);
        caseCheckBox.setEnabled(enabled);
    }

    /** Terminates the application. */
    private void actionExit() {
        System.exit(0);
    }

    /**
     * Parses a URL string, accepting only {@code http://} URLs.
     * Package-private and static so it can be tested without a window.
     *
     * @param url the text to validate
     * @return the parsed URL, or {@code null} if it is not a well-formed
     *         http URL
     */
    static URL verifyUrl(String url) {
        if (!url.toLowerCase().startsWith("http://")) {
            return null;
        }
        try {
            return new URL(url);
        } catch (Exception e) {
            return null;
        }
    }

    /** Adds a matching URL to the matches table and to the log file. */
    private void addMatch(String url) {
        DefaultTableModel model = (DefaultTableModel) table.getModel();
        model.addRow(new Object[]{url});

        try {
            logFileWriter.println(url);
        } catch (Exception e) {
            showError("未成功的日志匹配");
        }
    }

    /**
     * Refreshes the status labels and the progress bar.
     *
     * @param currentUrl URL being crawled
     * @param crawled    number of pages crawled so far
     * @param toCrawl    number of URLs still queued
     * @param maxUrls    crawl limit, or -1 when unlimited (the bar then
     *                   scales to crawled + queued)
     */
    private void updateStats(String currentUrl, int crawled, int toCrawl, int maxUrls) {
        crawlingLabel2.setText(currentUrl);
        crawledLabel2.setText("" + crawled);
        toCrawlLabel2.setText("" + toCrawl);

        if (maxUrls == UNLIMITED) {
            progressBar.setMaximum(crawled + toCrawl);
        } else {
            progressBar.setMaximum(maxUrls);
        }
        progressBar.setValue(crawled);

        matchesLabel2.setText("" + table.getRowCount());
    }

    /**
     * Checks the host's robots.txt {@code Disallow:} rules for a URL.
     * The rules are downloaded once per host and cached. If robots.txt
     * cannot be read the host is treated as having no restrictions.
     *
     * @param urlToCheck the URL about to be crawled
     * @return {@code false} if the URL's path starts with a disallowed path
     */
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase();

        ArrayList disallowList = (ArrayList) disallowListCache.get(host);
        if (disallowList == null) {
            disallowList = new ArrayList();
            BufferedReader reader = null;
            try {
                URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
                reader = new BufferedReader(
                        new InputStreamReader(robotsFileUrl.openStream()));

                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.indexOf("Disallow:") == 0) {
                        String disallowPath = line.substring("Disallow:".length());

                        // Strip a trailing comment, then surrounding blanks.
                        int commentIndex = disallowPath.indexOf("#");
                        if (commentIndex != -1) {
                            disallowPath = disallowPath.substring(0, commentIndex);
                        }
                        disallowPath = disallowPath.trim();

                        // An empty Disallow means "nothing is disallowed";
                        // adding "" would block every path via startsWith.
                        if (disallowPath.length() > 0) {
                            disallowList.add(disallowPath);
                        }
                    }
                }
            } catch (Exception e) {
                // robots.txt missing or unreadable: allow everything.
                disallowList = new ArrayList();
            } finally {
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (IOException ignored) {
                        // nothing useful can be done if close fails
                    }
                }
            }
            // Remember the result so robots.txt is fetched once per host.
            disallowListCache.put(host, disallowList);
        }

        String file = urlToCheck.getFile();
        for (int i = 0; i < disallowList.size(); i++) {
            String disallow = (String) disallowList.get(i);
            if (file.startsWith(disallow)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Downloads a page as text.
     *
     * @param pageUrl the page to fetch
     * @return the page contents with lines joined by {@code '\n'}, or
     *         {@code null} if the download failed
     */
    private String downloadPage(URL pageUrl) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(pageUrl.openStream()));

            StringBuffer pageBuffer = new StringBuffer();
            String line;
            while ((line = reader.readLine()) != null) {
                // Keep a separator so tokens on adjacent lines do not merge.
                pageBuffer.append(line).append('\n');
            }
            return pageBuffer.toString();
        } catch (Exception e) {
            // Any failure is reported to the caller as "no page".
            return null;
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if close fails
                }
            }
        }
    }

    /**
     * Removes a leading {@code "www."} from the host part of a URL.
     * Package-private and static so it can be tested without a window.
     *
     * @param url the URL text
     * @return the URL without the leading "www.", or the input unchanged
     *         when the host does not start with "www."
     */
    static String removeWwwFromUrl(String url) {
        int schemeEnd = url.indexOf("://");
        if (schemeEnd != -1 && url.startsWith("www.", schemeEnd + 3)) {
            return url.substring(0, schemeEnd + 3)
                    + url.substring(schemeEnd + 3 + "www.".length());
        }
        return url;
    }

    /**
     * Extracts the crawlable links from a page.
     * Package-private and static so it can be tested without a window.
     *
     * @param pageUrl      URL of the page, used to resolve relative links
     * @param pageContents text of the page
     * @param crawledList  URLs (as strings) already crawled; these are skipped
     * @param limitHost    if true, links to other hosts are skipped
     * @return list of absolute http URL strings, in page order
     */
    static ArrayList retrieveLinks(URL pageUrl, String pageContents,
                                   HashSet crawledList, boolean limitHost) {
        Matcher m = LINK_PATTERN.matcher(pageContents);

        ArrayList linkList = new ArrayList();
        while (m.find()) {
            String link = m.group(1).trim();

            // Skip empty links, in-page anchors, mail and JavaScript links.
            if (link.length() < 1) {
                continue;
            }
            if (link.charAt(0) == '#') {
                continue;
            }
            if (link.indexOf("mailto:") != -1) {
                continue;
            }
            if (link.toLowerCase().indexOf("javascript") != -1) {
                continue;
            }

            // Turn host-relative and page-relative links into full URLs.
            if (link.indexOf("://") == -1) {
                if (link.charAt(0) == '/') {
                    link = "http://" + pageUrl.getHost() + link;
                } else {
                    String file = pageUrl.getFile();
                    if (file.indexOf('/') == -1) {
                        link = "http://" + pageUrl.getHost() + "/" + link;
                    } else {
                        String path = file.substring(0, file.lastIndexOf('/') + 1);
                        link = "http://" + pageUrl.getHost() + path + link;
                    }
                }
            }

            // Drop any fragment.
            int index = link.indexOf('#');
            if (index != -1) {
                link = link.substring(0, index);
            }

            link = removeWwwFromUrl(link);

            // Skip anything that is not a valid http URL.
            URL verifiedLink = verifyUrl(link);
            if (verifiedLink == null) {
                continue;
            }

            // When limited to the start host, skip links to other hosts.
            if (limitHost && !pageUrl.getHost().toLowerCase().equals(
                    verifiedLink.getHost().toLowerCase())) {
                continue;
            }

            // Skip pages that were already crawled.
            if (crawledList.contains(link)) {
                continue;
            }

            linkList.add(link);
        }
        return linkList;
    }

    /**
     * Tests whether a page contains every term of the search string.
     * Package-private and static so it can be tested without a window.
     *
     * @param pageContents  text of the page
     * @param searchString  whitespace-separated search terms
     * @param caseSensitive whether the comparison is case sensitive
     * @return {@code true} only if all terms occur in the page
     */
    static boolean searchStringMatches(String pageContents, String searchString,
                                       boolean caseSensitive) {
        String searchContents = pageContents;
        if (!caseSensitive) {
            searchContents = pageContents.toLowerCase();
        }

        String[] terms = WHITESPACE_PATTERN.split(searchString);
        for (int i = 0; i < terms.length; i++) {
            String term = caseSensitive ? terms[i] : terms[i].toLowerCase();
            if (searchContents.indexOf(term) == -1) {
                return false;
            }
        }
        // Every term was found.
        return true;
    }

    /**
     * Performs the crawl: repeatedly takes the oldest queued URL, downloads
     * it, queues its links and records it when it matches the search string.
     * The loop runs only while the crawling flag is set, so it ends when the
     * user presses stop, the queue empties, or the limit is reached.
     *
     * @param startUrl      URL to start from
     * @param maxUrls       maximum number of pages to crawl, or -1 for no limit
     * @param limitHost     if true, only links on the same host are followed
     * @param searchString  whitespace-separated search terms
     * @param caseSensitive whether matching is case sensitive
     */
    public void crawl(String startUrl, int maxUrls, boolean limitHost,
                      String searchString, boolean caseSensitive) {
        HashSet crawledList = new HashSet();
        LinkedHashSet toCrawlList = new LinkedHashSet();

        toCrawlList.add(startUrl);

        while (crawling && toCrawlList.size() > 0) {
            // Stop once the requested number of pages has been crawled.
            if (maxUrls != UNLIMITED && crawledList.size() >= maxUrls) {
                break;
            }

            // Take the oldest queued URL (insertion order) off the queue.
            Iterator queue = toCrawlList.iterator();
            String url = (String) queue.next();
            queue.remove();

            // Skip anything that is not a valid http URL.
            URL verifiedUrl = verifyUrl(url);
            if (verifiedUrl == null) {
                continue;
            }

            // Skip URLs that robots.txt disallows.
            if (!isRobotAllowed(verifiedUrl)) {
                continue;
            }

            updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);

            crawledList.add(url);

            String pageContents = downloadPage(verifiedUrl);

            if (pageContents != null && pageContents.length() > 0) {
                ArrayList links =
                        retrieveLinks(verifiedUrl, pageContents, crawledList, limitHost);
                toCrawlList.addAll(links);

                if (searchStringMatches(pageContents, searchString, caseSensitive)) {
                    addMatch(url);
                }
            }

            updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
        }
    }

    /** Shows an error dialog with the given message. */
    private void showError(String message) {
        JOptionPane.showMessageDialog(this, message, "错误", JOptionPane.ERROR_MESSAGE);
    }

    /** Launches the crawler window on the event dispatch thread. */
    public static void main(String[] args) {
        SwingUtilities.invokeLater(new Runnable() {
            public void run() {
                SearchCrawler crawler = new SearchCrawler();
                crawler.setVisible(true);
            }
        });
    }
}