宽度优先搜索实现的Java爬虫

宽度优先搜索实现的Java爬虫
Crawler类能够通过宽度优先搜索不断地抓取网站上的url。
这里需要用到 FileHelper 类的 writeFile 方法将结果写入文件,以及 URLAnalysis 类的 getContent 方法获取页面内容(这两个类不在下面的代码中)。
代码如下:
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Queue;


public  class Crawler {
    
     private  static HashMap<String, Integer> map =  new HashMap<String, Integer>();
     private  static  int count = 0;
     private  static  int max_count = 200000;
    
     public  static String[] getLinks(String content) {
        HashMap<String, Integer> map =  new HashMap<String, Integer>();
         int len = content.length();
        
         for( int i=0;i+9 < len;i++) {
             if(content.substring(i, i+8).equals("\"http: // ") || content.substring(i, i+9).equals("\"https: // ")) {
                String ss =  new String();
                 for( int j=i+1;j<len && content.charAt(j) != '\"';j++) ss += content.charAt(j);
                 if(map.containsKey(ss))  continue;
                map.put(ss,  new Integer(1));
            }
        }
         int N = map.size();
        String[] ans =  new String[N];
        Iterator<String> iter = map.keySet().iterator();
         int cnt = 0;
         while (iter.hasNext()) {
            String key = iter.next();
            ans[cnt++] = key;
        }
         return ans;
    }
    
     private  static  boolean isPictureUrl(String url) {
         int len = url.length();
         if(url.substring(len-4, len).equals(".jpg") 
                || url.substring(len-4, len).equals(".png") 
                || url.substring(len-4, len).equals(".gif"))
             return  true;
         return  false;
    }
    
     public  static  void bfs(String u, String filename) {
        String ans = "";
        Queue<String> queue =  new LinkedList<String>();
        map.put(u,  new Integer(1));
        count ++;
        queue.offer(u);
         while ((u = queue.poll()) !=  null) {
            System.out.println("digging in " + u);
            System.out.println("have digged " + count + " pages now ");
            String content;
             try {
                content = URLAnalysis.getContent(u);
                String[] res = getLinks(content);
                 int n = res.length;
                 for ( int i = 0; i < n; i++) {
                    String v = res[i];
                     if (map.containsKey(v))
                         continue;
                    count ++;
                    ans += v + "\n";
                    map.put(v,  new Integer(1));
                     if( false == isPictureUrl(v))
                        queue.offer(v);
                }
                 if(count >= max_count)  break;
            }  catch (Exception e) {
                e.printStackTrace();
            }
        }
         try {
            FileHelper.writeFile(ans, filename);
        }  catch (Exception e) {
            e.printStackTrace();
        }
    }
    
    
    
     public  static  void main(String[] args) {
        bfs("http://www.163.com", "D:\\test321\\urls.txt");
    }
}

下面是部分输出内容:
http://
http://focus.news.163.com/15/0319/10/AL2INPO400011SM9.html
http://lady.163.com/15/0317/14/AKTR681900264IJ2.html
http://dajia.163.com/article/147.html#AL1GT1GU0095004J
http://xf.house.163.com/qhd/search/0-0-0-0-0-0-0-0-1.html
http://rd.da.netease.com/redirect?t=mwGQ3t&p=EA7B9E&target=http%3A%2F%2Fwww.kaola.com
http://tech.163.com/15/0321/07/AL7C7U3R000915BF.html
http://yuedu.163.com/book_reader/b39efe40b81843a8ac4eabdd3b756d92_4/cd59ff87a38e48eba21b312c4d26f2c7_4?utm_campaign=163ad&utm_source=163home&utm_medium=tab_1_2_7
http://v.163.com/special/opencourse/financialmarkets.html
http://paopao.163.com/schedule/show?pageId=4050&utm_source=163&utm_medium=wytab01&utm_campaign=warmup
http://xf.house.163.com/zz/search/0-0-0-0-0-0-0-0-1.html
http://sports.163.com/15/0321/10/AL7MA69F00052UUC.html
http://ent.163.com/15/0321/01/AL6NG0GI00031H2L.html
http://img2.cache.netease.com/lady/2014/3/1/201403012352473e66b.jpg
http://love.163.com/?vendor=163.navi.icon&utm_source=163.com&utm_campaign=163navi
http://caipiao.163.com/#from=www
http://money.163.com/15/0321/08/AL7GDD1L00253B0H.html
http://yichuangqingshu.lofter.com/post/21d053_641bd4b?act=qbwysylofer_20150101_01
http://img4.cache.netease.com/tech/2015/3/21/20150321095714dd3c3.jpg
http://m.163.com/iphone/index.html
http://yuanst.blog.163.com/blog/static/186229043201522084612809/
http://lady.163.com/15/0320/00/AL42J3UD00264OCL.html
http://w.163.com/15/0320/15/AL5MBP6J00314C3U.html
http://vhouse.163.com/1421889369882.html
http://img2.cache.netease.com/edu/2015/3/20/2015032017293274fa5.jpg

你可能感兴趣的:(宽度优先搜索实现的Java爬虫)