Skip to content

多个代理的管理 #128

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 27, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ out/
.idea
.classpath
.project
.settings/
bin/
.myeclipse
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ public class Request implements Serializable {
private static final long serialVersionUID = 2062192774891352043L;

public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
public static final String STATUS_CODE = "statusCode";
public static final String PROXY = "proxy";

private String url;

Expand Down
32 changes: 32 additions & 0 deletions webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
import org.apache.http.HttpHost;

import us.codecraft.webmagic.proxy.ProxyPool;
import us.codecraft.webmagic.utils.UrlUtils;

import java.util.*;
Expand Down Expand Up @@ -47,6 +49,8 @@ public class Site {

private HttpHost httpProxy;

private ProxyPool httpProxyPool=new ProxyPool();

private boolean useGzip = true;

/**
Expand Down Expand Up @@ -438,4 +442,32 @@ public String toString() {
", headers=" + headers +
'}';
}

/**
 * Replaces the current proxy pool with a new one built from the given list.
 * Each entry is a two-element array: String[0] is the ip, String[1] is the port.
 *
 * @param httpProxyList proxy entries used to construct the new {@link ProxyPool}
 * @return this
 */
public Site setHttpProxyPool(List<String[]> httpProxyList) {
    // Build the pool first, then swap it in with a single assignment.
    final ProxyPool freshPool = new ProxyPool(httpProxyList);
    this.httpProxyPool = freshPool;
    return this;
}

/**
 * Exposes the proxy pool currently attached to this site.
 *
 * @return the pool instance held by this site
 */
public ProxyPool getHttpProxyPool() {
    return this.httpProxyPool;
}

/**
 * Obtains a proxy by delegating to {@link ProxyPool#getProxy()} on this site's pool.
 *
 * @return whatever host the pool hands out
 */
public HttpHost getHttpProxyFromPool() {
    final HttpHost borrowed = this.httpProxyPool.getProxy();
    return borrowed;
}

/**
 * Hands a proxy back to this site's pool together with the status code
 * observed while it was in use, by delegating to the pool's returnProxy.
 *
 * @param proxy      the host being returned
 * @param statusCode the status code recorded for the request that used it
 */
public void returnHttpProxyToPool(HttpHost proxy, int statusCode) {
    this.httpProxyPool.returnProxy(proxy, statusCode);
}

/**
 * Forwards the reuse interval to the current proxy pool.
 *
 * @param reuseInterval interval passed to the pool's setReuseInterval
 *                      (presumably milliseconds; confirm against ProxyPool)
 * @return this
 */
public Site setProxyReuseInterval(int reuseInterval) {
    final ProxyPool pool = this.httpProxyPool;
    pool.setReuseInterval(reuseInterval);
    return this;
}

}
5 changes: 5 additions & 0 deletions webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import com.google.common.collect.Lists;
import org.apache.commons.collections.CollectionUtils;
import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.downloader.Downloader;
Expand Down Expand Up @@ -324,6 +325,10 @@ public void run() {
onError(requestFinal);
logger.error("process request " + requestFinal + " error", e);
} finally {
if (site.getHttpProxyPool().isEnable()) {
site.returnHttpProxyToPool((HttpHost) requestFinal.getExtra(Request.PROXY), (Integer) requestFinal
.getExtra(Request.STATUS_CODE));
}
pageCount.incrementAndGet();
signalNewUrl();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import com.google.common.collect.Sets;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.annotation.ThreadSafe;
Expand Down Expand Up @@ -84,10 +85,12 @@ public Page download(Request request, Task task) {
}
logger.info("downloading page {}", request.getUrl());
CloseableHttpResponse httpResponse = null;
int statusCode=0;
try {
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
httpResponse = getHttpClient(site).execute(httpUriRequest);
int statusCode = httpResponse.getStatusLine().getStatusCode();
statusCode = httpResponse.getStatusLine().getStatusCode();
request.putExtra(Request.STATUS_CODE, statusCode);
if (statusAccept(acceptStatCode, statusCode)) {
//charset
if (charset == null) {
Expand All @@ -109,6 +112,7 @@ public Page download(Request request, Task task) {
onError(request);
return null;
} finally {
request.putExtra(Request.STATUS_CODE, statusCode);
try {
if (httpResponse != null) {
//ensure the connection is released back to pool
Expand Down Expand Up @@ -173,9 +177,11 @@ protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<Strin
.setSocketTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut())
.setCookieSpec(CookieSpecs.BEST_MATCH);
if (site != null && site.getHttpProxy() != null) {
requestConfigBuilder.setProxy(site.getHttpProxy());
}
if (site.getHttpProxyPool().isEnable()) {
HttpHost host = site.getHttpProxyFromPool();
requestConfigBuilder.setProxy(host);
request.putExtra(Request.PROXY, host);
}
requestBuilder.setConfig(requestConfigBuilder.build());
return requestBuilder.build();
}
Expand Down
172 changes: 172 additions & 0 deletions webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
package us.codecraft.webmagic.proxy;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Delayed;
import java.util.concurrent.TimeUnit;

import org.apache.http.HttpHost;

/**
* >>>>Proxy Status
+----------+ +-----+
| last use | | new |
+-----+----+ +---+-+
| +------+ |
+->| init |<--+
+--+---+
|
v
+--------+
+--->| borrow |
| +---+----+
| |+------------------+
| v
| +--------+
| | in use | Response Time
| +---+----+
| |+------------------+
| v
| +--------+
| | return |
| +---+----+
| |+-------------------+
| v
| +-------+ reuse interval
| | delay | (delay time)
| +---+---+
| |+-------------------+
| v
| +------+
| | idle | idle time
| +---+--+
| |+-------------------+
+--------+
*/
public class Proxy implements Delayed, Serializable {

    private static final long serialVersionUID = 228939737383625551L;

    // Status / error codes. NOTE(review): presumably these are the values recorded
    // through fail(int) by the pool; confirm against ProxyPool.
    public static final int ERROR_403 = 403;
    public static final int ERROR_404 = 404;
    public static final int ERROR_BANNED = 10000;
    public static final int ERROR_Proxy = 10001;
    public static final int SUCCESS = 200;

    /** Reuse interval (ms) applied when the caller does not supply one. */
    private static final int DEFAULT_REUSE_TIME_INTERVAL = 1500;

    private final HttpHost httpHost;

    /** Minimum pause between two uses of this proxy, in milliseconds. */
    private int reuseTimeInterval = DEFAULT_REUSE_TIME_INTERVAL;
    /** Earliest moment this proxy may be reused, on the System.nanoTime() clock. */
    private Long canReuseTime = 0L;
    /** Wall-clock time (System.currentTimeMillis()) of the last borrow / recorded response. */
    private Long lastBorrowTime = System.currentTimeMillis();
    /** Running average of observed response times, in milliseconds. */
    private Long responseTime = 0L;
    // Not read or written by this class; kept so the serialized form is unchanged.
    private Long idleTime = 0L;

    private int failedNum = 0;
    private int successNum = 0;
    private int borrowNum = 0;

    private List<Integer> failedErrorType = new ArrayList<Integer>();

    /**
     * Creates a proxy that becomes reusable after the default interval.
     *
     * @param httpHost the proxy endpoint
     */
    Proxy(HttpHost httpHost) {
        this(httpHost, DEFAULT_REUSE_TIME_INTERVAL);
    }

    /**
     * Creates a proxy that becomes reusable after {@code reuseInterval} milliseconds.
     *
     * @param httpHost      the proxy endpoint
     * @param reuseInterval reuse interval in milliseconds
     */
    Proxy(HttpHost httpHost, int reuseInterval) {
        this.httpHost = httpHost;
        // Remember the interval so getReuseTimeInterval() reflects what was configured.
        this.reuseTimeInterval = reuseInterval;
        this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseInterval, TimeUnit.MILLISECONDS);
    }

    /** @return number of successes recorded so far */
    public int getSuccessNum() {
        return successNum;
    }

    /**
     * Adds to the success counter.
     *
     * @param increment amount to add
     */
    public void successNumIncrement(int increment) {
        this.successNum += increment;
    }

    /** @return the last borrow timestamp (System.currentTimeMillis() based) */
    public Long getLastUseTime() {
        return lastBorrowTime;
    }

    /**
     * Overrides the last borrow timestamp.
     *
     * @param lastBorrowTime timestamp in milliseconds
     */
    public void setLastBorrowTime(Long lastBorrowTime) {
        this.lastBorrowTime = lastBorrowTime;
    }

    /**
     * Folds the time elapsed since the last borrow into the running response-time
     * average (simple mean of the previous average and the new sample), then resets
     * the last borrow timestamp to now.
     */
    public void recordResponse() {
        // Read the clock once so the sample and the new timestamp agree.
        long now = System.currentTimeMillis();
        this.responseTime = (now - lastBorrowTime + responseTime) / 2;
        this.lastBorrowTime = now;
    }

    /** @return the live list of recorded failure types (not a copy) */
    public List<Integer> getFailedErrorType() {
        return failedErrorType;
    }

    /**
     * Replaces the list of recorded failure types.
     *
     * @param failedErrorType new list
     */
    public void setFailedErrorType(List<Integer> failedErrorType) {
        this.failedErrorType = failedErrorType;
    }

    /**
     * Records one failure: bumps the failure counter and appends the type.
     *
     * @param failedErrorType code describing the failure
     */
    public void fail(int failedErrorType) {
        this.failedNum++;
        this.failedErrorType.add(failedErrorType);
    }

    /**
     * Overrides the failure counter.
     *
     * @param failedNum new value
     */
    public void setFailedNum(int failedNum) {
        this.failedNum = failedNum;
    }

    /** @return number of failures recorded so far */
    public int getFailedNum() {
        return failedNum;
    }

    /**
     * Renders the recorded failure types as text, each followed by {@code " . "}.
     *
     * @return the joined failure types, or an empty string when none were recorded
     */
    public String getFailedType() {
        StringBuilder joined = new StringBuilder();
        for (Integer type : this.failedErrorType) {
            joined.append(type).append(" . ");
        }
        return joined.toString();
    }

    /** @return the proxy endpoint */
    public HttpHost getHttpHost() {
        return httpHost;
    }

    /** @return the configured reuse interval in milliseconds */
    public int getReuseTimeInterval() {
        return reuseTimeInterval;
    }

    /**
     * Sets the reuse interval and restarts the delay: the proxy becomes reusable
     * {@code reuseTimeInterval} milliseconds from now.
     *
     * @param reuseTimeInterval reuse interval in milliseconds
     */
    public void setReuseTimeInterval(int reuseTimeInterval) {
        this.reuseTimeInterval = reuseTimeInterval;
        this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
    }

    /**
     * @param unit unit for the result
     * @return time remaining until this proxy may be reused; zero or negative once elapsed
     */
    @Override
    public long getDelay(TimeUnit unit) {
        return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS);
    }

    /**
     * Orders proxies by the moment they become reusable (earliest first).
     * Falls back to comparing remaining delays when the other element is not a Proxy.
     */
    @Override
    public int compareTo(Delayed o) {
        if (o == this) {
            return 0;
        }
        long diff;
        if (o instanceof Proxy) {
            // nanoTime values must be compared by difference, not by magnitude.
            diff = canReuseTime - ((Proxy) o).canReuseTime;
        } else {
            diff = getDelay(TimeUnit.NANOSECONDS) - o.getDelay(TimeUnit.NANOSECONDS);
        }
        return diff < 0 ? -1 : (diff > 0 ? 1 : 0);
    }

    /**
     * @return a one-line summary: host, average response time, success rate and borrow count
     */
    @Override
    public String toString() {
        // getAddress() is null when the HttpHost was built from a host name only.
        String host = httpHost.getAddress() != null
                ? httpHost.getAddress().getHostAddress()
                : httpHost.getHostName();
        // Avoid 0/0 (NaN) before the proxy has ever been borrowed.
        double successRate = borrowNum == 0 ? 0.0 : successNum * 100.0 / borrowNum;
        return String.format("host: %15s >> %5dms >> success: %-3.2f%% >> borrow: %d", host, responseTime,
                successRate, borrowNum);
    }

    /**
     * Adds to the borrow counter.
     *
     * @param increment amount to add
     */
    public void borrowNumIncrement(int increment) {
        this.borrowNum += increment;
    }

    /** @return number of times this proxy has been borrowed */
    public int getBorrowNum() {
        return borrowNum;
    }
}
Loading