以下是我常用的抓取类,直接调用其中的方法即可实现本机 IP 抓取、goagent 代理抓取、代理 IP 抓取,以及文件下载、页面内容保存到本地等功能。
package crawlMethodManager;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.DeflateDecompressingEntity;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.CharArrayBuffer;
@SuppressWarnings("deprecation")
public class CrawlMethodManager {
// Address/port pair, initially the empty string and 0.
// NOTE(review): presumably the proxy host and port used for proxied requests;
// confirm against the code that assigns them.
static String ip = "";
static int port = 0;
// URL of an "ipFilter/getIp" endpoint on localhost:8080.
// NOTE(review): presumably the service that hands out proxy IPs; confirm.
static String ipUrl = "http://localhost:8080/ipFilter/getIp/getIp";
// Shared HTTP client built on a ThreadSafeClientConnManager.
// NOTE(review): the name suggests it is meant for POST requests; confirm against its users.
static HttpClient httpPostClient = new DefaultHttpClient(
new ThreadSafeClientConnManager());
/**
 * Fetches a page with HTTP GET, trying up to three channels in a fixed order
 * (goagent, then the company proxy, then a direct request from this machine)
 * until one of them yields non-empty content.
 *
 * <p>Each enabled channel is attempted up to its own limit. An attempt that
 * throws is reported on {@code System.err}; an attempt that throws or returns
 * {@code null}/empty content simply uses up one try and never aborts the
 * remaining attempts or channels.
 *
 * @param url
 *            the link to fetch
 * @param encode
 *            the character encoding to use when fetching
 * @param goagentFlag
 *            whether the goagent channel is enabled
 * @param goagentNum
 *            maximum number of goagent attempts
 * @param companyFlag
 *            whether the company-proxy channel is enabled
 * @param companyNum
 *            maximum number of company-proxy attempts
 * @param localFlag
 *            whether the direct (local machine) channel is enabled
 * @param localNum
 *            maximum number of direct attempts
 * @return the first non-empty content obtained, or the empty string (never
 *         {@code null}) when every enabled channel failed
 * @throws ClientProtocolException
 *             kept in the signature for compatibility; failures of the
 *             individual attempts are handled inside this method
 * @throws IOException
 *             kept in the signature for compatibility; failures of the
 *             individual attempts are handled inside this method
 */
public String crawlPageContentByGet(String url, String encode,
        boolean goagentFlag, int goagentNum, boolean companyFlag,
        int companyNum, boolean localFlag, int localNum)
        throws ClientProtocolException, IOException {
    String content = "";

    if (goagentFlag) {
        for (int attempt = 0; attempt < goagentNum && content.isEmpty(); attempt++) {
            System.out.println("goagent正在请求");
            try {
                String fetched = doGetByGoagent(url, encode);
                // Normalise null to "" so the emptiness checks can never throw.
                content = (fetched == null) ? "" : fetched;
            } catch (Exception e) {
                // Best effort: report the failure and move on to the next try.
                System.err.println("goagent请求失败: " + e);
            }
        }
    }

    if (companyFlag) {
        for (int attempt = 0; attempt < companyNum && content.isEmpty(); attempt++) {
            System.out.println("公司代理ip正在请求");
            try {
                String fetched = getByCompanyProxy(url, encode);
                content = (fetched == null) ? "" : fetched;
            } catch (Exception e) {
                System.err.println("公司代理ip请求失败: " + e);
            }
        }
    }

    if (localFlag) {
        for (int attempt = 0; attempt < localNum && content.isEmpty(); attempt++) {
            System.out.println("本机正在请求");
            try {
                String fetched = doGet(url, encode);
                content = (fetched == null) ? "" : fetched;
            } catch (Exception e) {
                System.err.println("本机请求失败: " + e);
            }
        }
    }

    return content;
}
/**
*
* @Description: get web content
* @param @param url
* @param @param encode
* @param @return
* @param @throws ClientProtocolException
* @param @throws IOException
* @return String
* @throws
* @author joe
* @date 2014-12-11
*/
public String crawlPageContentByGet(String url, String encode)
throws ClientProtocolException, IOException {
String content = "";
try {
content = doGetByGoagent(url, encode);
if (content == null || content.equals("")) {
System.out.println("启用公司代理");
content = getByCompanyProxy(url, encode);
// if (content == null || content.equals("")) {
// System.out.println("启用本机");
// content = doGet(url, encode);
// }
}
} catch (Exception e) {
try {
System.out.println("goagent连接失败,启用公司代理");
content = getByCompanyProxy(url, encode);
// if (content == null || content.equals("")) {
// System.out.println("公司代理连接失败,启用本机");
// content = doGet(url, encode);
// }
} catch (Exception e2) {
try {
content = getByCompanyProxy(url, encode);
// e2.printStackTrace();
// System.out.println("公司代理连接失败,5秒后启用本机");
// Thread.sleep(5000);
// content = doGet(url, encode);
} catch (Exception e3) {
e3.printStackTrace();
}
}
}
return content;
} |