private String getByCompanyProxy(String url, String encode) {
int count = 10;
String result = "";
String urlString = url;
String proxy = "";
HttpHost proxyHost = null;
boolean newProxy = false;
int oldProxyUsecount = 0;
for (int i = 0; i <= count; i++) {
if (!ip.equals("")) {
proxyHost = new HttpHost(ip, port, null);
}
try {
if (newProxy || oldProxyUsecount > 2 || ip.equals("")) {
oldProxyUsecount = 0;
String[] proxys = null;
try {
while (proxy.equals("") || !proxy.contains(":")) {
System.out.println("ip为空,正在提取");
proxy = doGet(ipUrl, "gbk");
}
proxys = proxy.replaceAll("\"|//|/|\r\n| ", "").split(
":");
} catch (Exception e) {
while (proxy.equals("") || !proxy.contains(":")) {
System.out.println("ip为空,正在提取");
proxy = doGet(ipUrl, "gbk");
}
proxys = proxy.replaceAll("\"|//|/|\r\n| ", "").split(
":");
// proxy = doGet(
// ,
// "gbk");
// proxys = proxy.split(":");
}
ip = proxys[0];
port = Integer.parseInt(proxys[1]);
proxyHost = new HttpHost(ip, port, null);
}
System.out.println("正在使用代理" + ip + ":" + port + ":" + port);
HttpGet httpRequst = new HttpGet(urlString);
httpRequst.addHeader("Accept-Encoding", "gzip,deflate,sdch");
httpRequst.getParams().setParameter(
CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode);
DefaultHttpClient httpClient = new DefaultHttpClient();
httpClient.getParams().setParameter(
CoreConnectionPNames.CONNECTION_TIMEOUT, 9000);// 连接时间20s
httpClient.getParams().setParameter(
CoreConnectionPNames.SO_TIMEOUT, 9000);// 数据传输时间60s
httpClient.getParams().setParameter(
ConnRouteParams.DEFAULT_PROXY, proxyHost);
HttpResponse httpResponse = httpClient.execute(httpRequst);// 其中HttpGet是HttpUriRequst的子类
if (httpResponse.getStatusLine().getStatusCode() == 200) {
HttpEntity httpEntity = httpResponse.getEntity();
if (httpEntity.getContentEncoding() != null) {
if ("gzip".equalsIgnoreCase(httpEntity
.getContentEncoding().getValue())) {
httpEntity = new GzipDecompressingEntity(httpEntity);
} else if ("deflate".equalsIgnoreCase(httpEntity
.getContentEncoding().getValue())) {
httpEntity = new DeflateDecompressingEntity(
httpEntity);
}
}
result = enCodetoString(httpEntity, encode);// 取出应答字符串
if (resultTest(result)) {
System.out.println(ip + "公司代理成功抓取" + url);
return result;
} else if (result.contains("function JumpSelf")
&& result.contains("WebShieldSessionVerify")) {
int indexs = result.indexOf("&WebShieldSessionVerify");
int indexe = result.indexOf("\";}</script>");
String verify = result.substring(indexs, indexe);
urlString = urlString + verify;
newProxy = false;
} else if (result.contains("function JumpSelf")
&& !result.contains("WebShieldSessionVerify")) {
urlString = url;
newProxy = false;
} else {
System.out.println("网页含有错误特殊字符" + urlString);
oldProxyUsecount++;
System.out.println(result);
}
} else
System.out.println(httpResponse.getStatusLine()
.getStatusCode() + " " + urlString + " 状态不为200");
oldProxyUsecount++;
httpRequst.abort();
} catch (ClientProtocolException e) {
newProxy = true;
System.out.println(ip + "代理ip拒绝了");
} catch (IOException e) {
oldProxyUsecount++;
System.out.println(ip + "代理读取超时");
}
}
return "";
} |