09-01
26
网络爬虫(搜索引擎)模拟代码
作者:Java伴侣 日期:2009-01-26
这是一个专门在GOOGLE上查一些图书统计资料的小爬虫
复制内容到剪贴板 程序代码
package com.taobao.html;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Calendar;
public class HtmlUtil {
public static String connectURL(String url){
HttpURLConnection url_c = null;
InputStream in = null;
BufferedReader rd = null;
StringBuffer buf = new StringBuffer();
try{
URL url_i = new URL(url);
url_c = (HttpURLConnection)url_i.openConnection();
//url_c.setRequestProperty("User-agent","IE/6.0");
url_c.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows 2000)");
url_c.setRequestProperty("Content-Type","application/x-www-form-urlencoded");
url_c.setRequestProperty("Accept","text/html, image/gif, image/jpeg, *; q=.2, */*; q=.2");
url_c.setRequestProperty("Connection","keep-alive");
url_c.setDoInput(true);
url_c.connect();
in = url_c.getInputStream();
rd = new BufferedReader(new InputStreamReader(in,
"utf-8"));
String line = rd.readLine();
String crlf=System.getProperty("line.separator");
while(line != null){
buf.append(line);
buf.append(crlf);
line = rd.readLine();
}
in.close();
rd.close();
}catch(Exception e){
System.out.println("URL connect fild url is:"+url);
e.printStackTrace();
}finally{
if(url_c != null)
url_c.disconnect();
try{
if(in != null)
in.close();
if(rd != null)
rd.close();
}catch(Exception e){}
}
return buf.toString();
}
/**
* 删除input字符串中的html格式
*
* @param input
* @param length
* @return
*/
public static String splitAndFilterString(String input, int length) {
if (input == null || input.trim().equals("")) {
return "";
}
// 去掉所有html元素,
String str = input.replaceAll("\\&[a-zA-Z]{1,10};", "").replaceAll(
"<[^>]*>", "");
str = str.replaceAll("[(/>)<]", "");
str = str.replaceAll("\\r", "");
str = str.replaceAll("\\n", "");
//根据实际情况来增加
str = spareString(str);
int len = str.length();
if (len <= length) {
return str;
} else {
str = str.substring(0, length);
str += "......";
}
return str;
}
public static String spareString(String str1){
String str = new String();
str = str1;
str = str.replaceAll("(", "");
str = str.replaceAll(")", "");
str = str.replaceAll(":", "");
str = str.replaceAll("-", "");
str = str.replaceAll("《", "");
str = str.replaceAll("》", "");
str = str.replaceAll("~", "");
str = str.replaceAll("=", "");
str = str.replaceAll(" ", "");
str = str.replaceAll("“", "");
str = str.replaceAll("”", "");
str = str.replaceAll("、", "");
str = str.replaceAll("·", "");
//str = str.replaceAll("\\", "");
//str = str.replaceAll("/", "");
//str = str.replaceAll("*", "");
str = str.replaceAll("$", "");
str = str.replaceAll("&", "");
str = str.replaceAll(" ", "");
str = str.replaceAll("\"", "");
//str = str.replaceAll("(", "");
//str = str.replaceAll(")", "");
str = str.replaceAll("<", "");
str = str.replaceAll(">", "");
str = str.replaceAll("-", "");
str = str.replaceAll(":", "");
str = str.replaceAll("=", "");
str = str.replaceAll("~", "");
return str;
}
public static String getNow(){
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss");
Calendar calendar = Calendar.getInstance();
calendar.setTimeInMillis(System.currentTimeMillis());
return format.format(calendar.getTime());
}
}
public String getHtml(Book key, boolean isAll){
String title = HtmlUtil.spareString(key.getTitle());
String publish = HtmlUtil.spareString(key.getPublish());
String year = HtmlUtil.spareString(key.getIssueYear());
String name = "";
try{
if(isAll)
name = URLEncoder.encode(title+" "
+ publish + " " + year, "utf-8");
else
name = URLEncoder.encode(title, "utf-8");
}catch(Exception e){
System.out.println("URLEncoder fild name is:"+name);
e.printStackTrace();
}
String url = "http://scholar.google.cn/scholar?hl=zh-CN&lr=&newwindow=1&q=" +
name + "&btnG=%E6%90%9C%E7%B4%A2&lr=";
return HtmlUtil.connectURL(url);
}
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Calendar;
public class HtmlUtil {
public static String connectURL(String url){
HttpURLConnection url_c = null;
InputStream in = null;
BufferedReader rd = null;
StringBuffer buf = new StringBuffer();
try{
URL url_i = new URL(url);
url_c = (HttpURLConnection)url_i.openConnection();
//url_c.setRequestProperty("User-agent","IE/6.0");
url_c.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows 2000)");
url_c.setRequestProperty("Content-Type","application/x-www-form-urlencoded");
url_c.setRequestProperty("Accept","text/html, image/gif, image/jpeg, *; q=.2, */*; q=.2");
url_c.setRequestProperty("Connection","keep-alive");
url_c.setDoInput(true);
url_c.connect();
in = url_c.getInputStream();
rd = new BufferedReader(new InputStreamReader(in,
"utf-8"));
String line = rd.readLine();
String crlf=System.getProperty("line.separator");
while(line != null){
buf.append(line);
buf.append(crlf);
line = rd.readLine();
}
in.close();
rd.close();
}catch(Exception e){
System.out.println("URL connect fild url is:"+url);
e.printStackTrace();
}finally{
if(url_c != null)
url_c.disconnect();
try{
if(in != null)
in.close();
if(rd != null)
rd.close();
}catch(Exception e){}
}
return buf.toString();
}
/**
* 删除input字符串中的html格式
*
* @param input
* @param length
* @return
*/
public static String splitAndFilterString(String input, int length) {
if (input == null || input.trim().equals("")) {
return "";
}
// 去掉所有html元素,
String str = input.replaceAll("\\&[a-zA-Z]{1,10};", "").replaceAll(
"<[^>]*>", "");
str = str.replaceAll("[(/>)<]", "");
str = str.replaceAll("\\r", "");
str = str.replaceAll("\\n", "");
//根据实际情况来增加
str = spareString(str);
int len = str.length();
if (len <= length) {
return str;
} else {
str = str.substring(0, length);
str += "......";
}
return str;
}
public static String spareString(String str1){
String str = new String();
str = str1;
str = str.replaceAll("(", "");
str = str.replaceAll(")", "");
str = str.replaceAll(":", "");
str = str.replaceAll("-", "");
str = str.replaceAll("《", "");
str = str.replaceAll("》", "");
str = str.replaceAll("~", "");
str = str.replaceAll("=", "");
str = str.replaceAll(" ", "");
str = str.replaceAll("“", "");
str = str.replaceAll("”", "");
str = str.replaceAll("、", "");
str = str.replaceAll("·", "");
//str = str.replaceAll("\\", "");
//str = str.replaceAll("/", "");
//str = str.replaceAll("*", "");
str = str.replaceAll("$", "");
str = str.replaceAll("&", "");
str = str.replaceAll(" ", "");
str = str.replaceAll("\"", "");
//str = str.replaceAll("(", "");
//str = str.replaceAll(")", "");
str = str.replaceAll("<", "");
str = str.replaceAll(">", "");
str = str.replaceAll("-", "");
str = str.replaceAll(":", "");
str = str.replaceAll("=", "");
str = str.replaceAll("~", "");
return str;
}
public static String getNow(){
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss");
Calendar calendar = Calendar.getInstance();
calendar.setTimeInMillis(System.currentTimeMillis());
return format.format(calendar.getTime());
}
}
public String getHtml(Book key, boolean isAll){
String title = HtmlUtil.spareString(key.getTitle());
String publish = HtmlUtil.spareString(key.getPublish());
String year = HtmlUtil.spareString(key.getIssueYear());
String name = "";
try{
if(isAll)
name = URLEncoder.encode(title+" "
+ publish + " " + year, "utf-8");
else
name = URLEncoder.encode(title, "utf-8");
}catch(Exception e){
System.out.println("URLEncoder fild name is:"+name);
e.printStackTrace();
}
String url = "http://scholar.google.cn/scholar?hl=zh-CN&lr=&newwindow=1&q=" +
name + "&btnG=%E6%90%9C%E7%B4%A2&lr=";
return HtmlUtil.connectURL(url);
}
评论: 0 | 引用: 0 | 查看次数: 595
发表评论