Skip to content

Commit a681d79

Browse files
committed
每个线程使用自己的HttpClient实例,比原来使用一个client实例快非常多
1 parent 2c30534 commit a681d79

File tree

3 files changed

+49
-39
lines changed

3 files changed

+49
-39
lines changed

com/crawl/comments/CrawlComments.java

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,44 @@
11
package com.crawl.comments;
22

3+
import org.apache.http.client.ClientProtocolException;
34
import org.apache.http.client.config.RequestConfig;
45
import org.apache.http.client.methods.HttpGet;
6+
import org.apache.http.conn.ConnectionPoolTimeoutException;
57
import org.apache.http.impl.client.CloseableHttpClient;
8+
import org.apache.http.impl.client.HttpClients;
69
import org.apache.http.util.EntityUtils;
710
import org.dom4j.Element;
811
import org.json.JSONArray;
912
import org.json.JSONObject;
1013

1114
import java.io.IOException;
15+
import java.net.SocketTimeoutException;
1216

1317
/**
1418
* Created by geekgao on 15-10-19.
1519
*/
1620
public class CrawlComments implements Runnable {
17-
private CloseableHttpClient client;
1821
private Element app;
1922
private int start;
2023
private int count;
2124
private int appId;
2225

23-
public CrawlComments(CloseableHttpClient client, Element app, int start, int count, int appId) {
24-
this.client = client;
26+
/**
 * Creates a crawl task by storing the four arguments in the fields of the same name.
 * (This is the post-commit signature; the variant that also took a shared
 * CloseableHttpClient is the removed line shown just above in the diff.)
 *
 * @param app   dom4j element kept for later use by this task; presumably the node the
 *              crawled comments are attached to - the code using it is elided from this diff
 * @param start value sent as the {@code start} query parameter of the comments request
 * @param count value sent as the {@code count} query parameter of the comments request
 * @param appId value sent as the {@code baike} query parameter and printed in log messages
 */
public CrawlComments(Element app, int start, int count, int appId) {
2527
this.app = app;
2628
this.start = start;
2729
this.count = count;
2830
this.appId = appId;
2931
}
3032

3133
/**
 * Requests one batch of comments for {@code appId} and processes the JSON reply.
 *
 * Builds a new CloseableHttpClient with its own RequestConfig, issues a GET against the
 * getComments URL (baike=appId, start, count), reads the response body as a String and
 * takes the "messages" array found under "data". The visible tail of the loop adds
 * "review" and "agreecount" child elements to a comment element. The middle of the
 * method is elided by the diff hunk boundary, so the rest of the per-message handling
 * is not shown here.
 *
 * @throws IOException declared by the method; the visible I/O calls are
 *                     client.execute, EntityUtils.toString and client.close
 */
private void setAppXml() throws IOException {
34+
// Configure timeouts: connection-request 2000, socket 6000, connect 2000
// (milliseconds per the HttpClient RequestConfig API).
35+
RequestConfig requestConfig = RequestConfig.custom().setConnectionRequestTimeout(2000).setSocketTimeout(6000).setConnectTimeout(2000).build();
36+
// Build the client (a new instance on every call of this method).
37+
CloseableHttpClient client = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
38+
3239
HttpGet getContentJson = new HttpGet("http://comment.mobilem.360.cn/comment/getComments?baike=" + appId + "&level=0&start=" + start + "&count=" + count + "&fm=home_jingjia_3&m=c1804fc5ca4ded8293acd1151efaf3db&m2=61f3c1e4d105b55aff323b20a8136c4e&v=3.2.50&re=1&nt=1&ch=493041&os=21&model=MX4+Pro&sn=4.66476154040931&cu=m76&ca1=armeabi-v7a&ca2=armeabi&ppi=1536x2560&cpc=1&startCount=4");
3340
String contentJson = EntityUtils.toString(client.execute(getContentJson).getEntity());
41+
3442
JSONObject jsonObject = new JSONObject(contentJson);
3543
JSONArray contentJsonArray = jsonObject.getJSONObject("data").getJSONArray("messages");
3644

@@ -50,16 +58,25 @@ private void setAppXml() throws IOException {
5058
comment.addElement("review").setText(review);
5159
comment.addElement("agreecount").setText(agreecount);
5260
}
61+
// NOTE(review): as far as the visible lines show, close() is reached only when everything
// above completes normally; if execute() or the JSON handling throws, this line is skipped
// and the client is not closed here. Consider try/finally or try-with-resources - confirm
// against the elided middle of the method.
client.close();
5362
}
5463

5564
/**
 * Task body: calls setAppXml() once and reports the outcome on the console.
 *
 * Post-commit behaviour (the '+' side of this diff): a ConnectionPoolTimeoutException or
 * SocketTimeoutException prints a one-line message containing appId and start to stderr;
 * a ClientProtocolException or any other IOException prints its stack trace. Every failure
 * path returns early, so the completion message on stdout is printed only on success.
 *
 * NOTE(review): only IOException subtypes are caught; an unchecked exception thrown by
 * setAppXml() would propagate out of run() - confirm whether that is intended.
 */
public void run() {
5665
try {
5766
setAppXml();
58-
// Use this red output only to make the message stand out
59-
System.out.println(appId + "号app从" + start + "开始的评论抓取完毕");
67+
} catch (ConnectionPoolTimeoutException e) {
68+
System.err.println(appId + "号app从" + start + "开始的评论发生-ConnectionPoolTimeoutException");
69+
return;
70+
} catch (ClientProtocolException e) {
71+
e.printStackTrace();
72+
return;
73+
} catch (SocketTimeoutException e) {
74+
System.err.println(appId + "号app从" + start + "开始的评论发生-SocketTimeoutException");
75+
return;
6076
} catch (IOException e) {
61-
System.err.println("执行setAppXml()出错.");
6277
e.printStackTrace();
78+
return;
6379
}
80+
// Reached only when setAppXml() returned without throwing.
System.out.println(appId + "号app从" + start + "开始的评论抓取完毕");
6481
}
6582
}

com/crawl/comments/CrawlUtils.java

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -87,23 +87,23 @@ public static Set<String> getAppIds(String uri,int limit) throws IOException {
8787
8888
return appIds;*/
8989
Set<String> s = new HashSet<String>();
90-
s.add("3581");
91-
s.add("778702");
92-
s.add("1586");
90+
// s.add("3581");
91+
// s.add("778702");
92+
// s.add("1586");
9393
// s.add("6276");
94-
s.add("122437");
95-
s.add("5632");
96-
s.add("4107");
94+
// s.add("122437");
95+
// s.add("5632");
96+
// s.add("4107");
9797
// s.add("98008");
9898
// s.add("3100672");
99-
s.add("2345172");
100-
s.add("1343");
101-
s.add("3094256");
99+
// s.add("2345172");
100+
// s.add("1343");
101+
// s.add("3094256");
102102
// s.add("101594");
103-
s.add("1840672");
104-
s.add("1643");
103+
// s.add("1840672");
104+
// s.add("1643");
105105
// s.add("893686");
106-
s.add("3032510");
106+
// s.add("3032510");
107107
s.add("1936882");
108108
// s.add("7256");
109109
// s.add("727030");

com/crawl/comments/Main.java

Lines changed: 14 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package com.crawl.comments;
22

3+
import org.apache.http.client.config.RequestConfig;
34
import org.apache.http.client.methods.CloseableHttpResponse;
45
import org.apache.http.client.methods.HttpGet;
56
import org.apache.http.impl.client.CloseableHttpClient;
@@ -11,8 +12,9 @@
1112
import org.json.JSONObject;
1213

1314
import java.io.IOException;
14-
import java.util.*;
15-
import java.util.concurrent.*;
15+
import java.util.Set;
16+
import java.util.concurrent.ExecutorService;
17+
import java.util.concurrent.Executors;
1618

1719
/**
1820
* Created by geekgao on 15-10-25.
@@ -21,13 +23,15 @@ public class Main {
2123
/**
 * Entry point of the crawler.
 *
 * For every app id returned by CrawlUtils.getAppIds("", 1) it creates a 30-thread fixed
 * pool and an "app" root element, submits one CrawlComments task per batch of comments,
 * waits until the pool has terminated and then writes the element to a timestamped XML
 * file under /home/geekgao/comments/. Part of the loop body (including where appName is
 * set) is elided by the diff hunk boundary and is not described here.
 *
 * @param args not referenced in the visible lines
 * @throws IOException          declared by the method
 * @throws InterruptedException declared by the method; Thread.sleep is the visible source
 */
public static void main(String[] args) throws IOException, InterruptedException {
2224
// Get the ids of the apps to crawl
2325
Set<String> appIds = CrawlUtils.getAppIds("",1);
26+
// Configure timeouts (all three set to 2000; milliseconds per the RequestConfig API)
27+
RequestConfig requestConfig = RequestConfig.custom().setConnectionRequestTimeout(2000).setSocketTimeout(2000).setConnectTimeout(2000).build();
28+
// Build the client
// NOTE(review): this client is no longer passed to the CrawlComments tasks; whether the
// elided lines of the loop still use it is not visible here.
29+
CloseableHttpClient client = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
2430

2531
for (String id:appIds) {
26-
// Build the client
27-
CloseableHttpClient client = HttpClients.createDefault();
2832
// Create the thread pool (a new pool for each app id)
2933
ExecutorService executorService = Executors.newFixedThreadPool(30);
30-
34+
// Create the XML root node
3135
Element app = DocumentHelper.createDocument().addElement("app");
3236

3337
// Add the appid node
@@ -61,38 +65,27 @@ public static void main(String[] args) throws IOException, InterruptedException
6165
int commentsCount = CrawlUtils.getCommentCount(Integer.valueOf(id));
6266
System.out.println("[" + appName + "]总共" + commentsCount + "条评论");
6367
// Number of comments fetched per request
64-
int count = 50;
65-
// Used to control each thread
66-
List<Future> futures = new LinkedList<Future>();
68+
int count = 25;
6769
for (int start = 0;start < commentsCount;start += count) {
6870
// If the last batch has fewer than count comments, shrink count to what is left
6971
if (start + count > commentsCount) {
7072
count = commentsCount - start;
7173
}
72-
// System.out.println("从第" + start + "个评论开始抓取");
73-
Future future = executorService.submit(new CrawlComments(client, app, start, count, Integer.valueOf(id)));
7474

75-
try {
76-
// Set the timeout
77-
future.get(7000, TimeUnit.MILLISECONDS);
78-
} catch (ExecutionException e) {
79-
e.printStackTrace();
80-
} catch (TimeoutException e) {
81-
System.err.println("[" + appName + "]从[" + start + "]开始的抓取超时了,退出此线程");
82-
future.cancel(true);
83-
}
75+
// System.out.println("从第" + start + "个评论开始抓取");
76+
// The returned Future is discarded, so nothing here observes a task's failure.
executorService.submit(new CrawlComments(app, start, count, Integer.valueOf(id)));
8477
}
8578

8679
executorService.shutdown();
// Poll until every submitted task has finished.
// NOTE(review): executorService.awaitTermination(...) would block without polling.
8780
while (true) {
8881
if (executorService.isTerminated()) {
8982
break;
9083
}
91-
Thread.sleep(100);
84+
Thread.sleep(1000);
9285
}
9386

94-
client.close();
9587
CrawlUtils.writeXmlToFile(app,"/home/geekgao/comments/" + System.currentTimeMillis() + ".xml");
9688
}
89+
// Close the client once, after all app ids have been processed.
client.close();
9790
}
9891
}

0 commit comments

Comments
 (0)