11package com .crawl .comments ;
22
3+ import org .apache .http .client .config .RequestConfig ;
34import org .apache .http .client .methods .CloseableHttpResponse ;
45import org .apache .http .client .methods .HttpGet ;
56import org .apache .http .impl .client .CloseableHttpClient ;
1112import org .json .JSONObject ;
1213
1314import java .io .IOException ;
14- import java .util .*;
15- import java .util .concurrent .*;
15+ import java .util .Set ;
16+ import java .util .concurrent .ExecutorService ;
17+ import java .util .concurrent .Executors ;
1618
1719/**
1820 * Created by geekgao on 15-10-25.
@@ -21,13 +23,15 @@ public class Main {
// Entry point: for every app id returned by CrawlUtils.getAppIds, creates a 30-thread
// fixed pool, submits one CrawlComments task per batch of comments, waits for the pool
// to terminate, and writes the collected "app" XML element to a timestamped file under
// /home/geekgao/comments/.
// NOTE(review): this listing is a diff view; the statements between the two hunks
// (around the "@@" marker below) are not visible here, so this summary covers only
// the visible lines.
2123 public static void main (String [] args ) throws IOException , InterruptedException {
2224 //Get the ids of the apps to crawl
2325 Set <String > appIds = CrawlUtils .getAppIds ("" ,1 );
26+ //Configure timeouts: connection-request, socket and connect timeouts are all set to 2000 (presumably milliseconds, per the HttpClient API - confirm)
27+ RequestConfig requestConfig = RequestConfig .custom ().setConnectionRequestTimeout (2000 ).setSocketTimeout (2000 ).setConnectTimeout (2000 ).build ();
28+ //Build the client once, outside the per-app loop; it is closed after the loop finishes
29+ CloseableHttpClient client = HttpClients .custom ().setDefaultRequestConfig (requestConfig ).build ();
2430
2531 for (String id :appIds ) {
26- //Build the client
27- CloseableHttpClient client = HttpClients .createDefault ();
2832 //Create the thread pool (a new 30-thread pool per app)
2933 ExecutorService executorService = Executors .newFixedThreadPool (30 );
30-
34+ //Create the XML root node
3135 Element app = DocumentHelper .createDocument ().addElement ("app" );
3236
3337 //Add the appid node
@@ -61,38 +65,27 @@ public static void main(String[] args) throws IOException, InterruptedException
6165 int commentsCount = CrawlUtils .getCommentCount (Integer .valueOf (id ));
6266 System .out .println ("[" + appName + "]总共" + commentsCount + "条评论" );
6367 //Number of comments fetched per request
64- int count = 50 ;
65- //Used to control each thread
66- List <Future > futures = new LinkedList <Future >();
68+ int count = 25 ;
6769 for (int start = 0 ;start < commentsCount ;start += count ) {
6870 //If the last batch has fewer than count comments, shrink count to the remainder
6971 if (start + count > commentsCount ) {
7072 count = commentsCount - start ;
7173 }
72- // System.out.println("从第" + start + "个评论开始抓取");
73- Future future = executorService .submit (new CrawlComments (client , app , start , count , Integer .valueOf (id )));
7474
75- try {
76- //Set a timeout: wait at most 7000 ms for this task
77- future .get (7000 , TimeUnit .MILLISECONDS );
78- } catch (ExecutionException e ) {
79- e .printStackTrace ();
80- } catch (TimeoutException e ) {
81- System .err .println ("[" + appName + "]从[" + start + "]开始的抓取超时了,退出此线程" );
82- future .cancel (true );
83- }
75+ // System.out.println("从第" + start + "个评论开始抓取");
// NOTE(review): the client is no longer passed to CrawlComments here; how the task obtains an HTTP client is not visible in this view - confirm
76+ executorService .submit (new CrawlComments (app , start , count , Integer .valueOf (id )));
8477 }
8578
// Stop accepting new tasks, then poll until every submitted task has finished
8679 executorService .shutdown ();
8780 while (true ) {
8881 if (executorService .isTerminated ()) {
8982 break ;
9083 }
91- Thread .sleep (100 );
84+ Thread .sleep (1000 );
9285 }
9386
94- client .close ();
// Output file name is the current time in milliseconds plus ".xml"
9587 CrawlUtils .writeXmlToFile (app ,"/home/geekgao/comments/" + System .currentTimeMillis () + ".xml" );
9688 }
89+ client .close ();
9790 }
9891}
0 commit comments