10-01
11
使用 Java 获取网页编码格式
作者:Java伴侣 日期:2010-01-11
package com.tag;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HeaderElement;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import toptrack.tools.JQueryBase;
/**
* 得到网页编码格式
* @author dl
*/
public class JHtmlUpdateCheck {
/**
 * Code-page detector used to recognise the encoding of text content.
 * Two detectors are registered, in this order: an HTML code-page detector
 * (constructed with {@code false}) and the JChardet facade.
 */
private static cpdetector.io.CodepageDetectorProxy detector;
static {
    detector = cpdetector.io.CodepageDetectorProxy.getInstance();
    detector.add(new cpdetector.io.HTMLCodepageDetector(false));
    detector.add(cpdetector.io.JChardetFacade.getInstance());
}
/**
*
方法说明:得到网页编码格式
*
输入参数:strUrl 网页链接; timeout 超时设置
*
返回类型:网页编码
*/
public static String getEncoding(String strUrl, int timeout) {
String strEncoding = null;
HttpClient client = new HttpClient();
client.getHttpConnectionManager().getParams().setConnectionTimeout(timeout);
GetMethod method = new GetMethod(strUrl);
method.setFollowRedirects( true );
int statusCode;
try {
statusCode = client.executeMethod(method);
if( statusCode != -1) {
//从http头得到网页编码
strEncoding = getContentCharSet(method.getResponseHeader("Content-Type"));
if (strEncoding != null) {
method.releaseConnection();
return strEncoding;
}
//通过解析meta得到网页编码
String strHtml = method.getResponseBodyAsString().toLowerCase();
StringBuffer strBuffer = new StringBuffer();
int pos = JQueryBase.getTagText(strHtml, "
while (strBuffer.length() > 0) {
StringBuffer strEncodingBuffer = new StringBuffer();
JQueryBase.getTagText(strBuffer.toString(), "charset=", "\"", strEncodingBuffer, 0);
if (strEncodingBuffer.length() > 0) {
strEncoding = strEncodingBuffer.toString();
method.releaseConnection();
return strEncoding;
}
strBuffer = new StringBuffer();
pos = JQueryBase.getTagText(strHtml, "
}
//分析字节得到网页编码
strEncoding = getFileEncoding(strUrl, timeout);
//设置默认网页字符编码
if (strEncoding == null)
strEncoding = "GBK";
}
method.releaseConnection();
} catch (Exception e) {
// TODO Auto-generated catch block
System.out.println(e.getClass() + "对" + strUrl + "提取网页编码信息出错");
return null;
}
return strEncoding;
}
/**
 * Reads the {@code charset} parameter from an HTTP Content-Type header.
 *
 * @param contentheader the Content-Type header; may be {@code null}
 * @return the value of the {@code charset} parameter, or {@code null} when the header is
 *         {@code null}, does not consist of exactly one element, or carries no
 *         {@code charset} parameter
 */
protected static String getContentCharSet(Header contentheader) {
    if (contentheader == null) {
        return null;
    }
    HeaderElement[] elements = contentheader.getElements();
    // Only a header made of exactly one element is considered.
    if (elements.length != 1) {
        return null;
    }
    NameValuePair charsetParam = elements[0].getParameterByName("charset");
    return charsetParam == null ? null : charsetParam.getValue();
}
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HeaderElement;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import toptrack.tools.JQueryBase;
/**
* 得到网页编码格式
* @author dl
*/
public class JHtmlUpdateCheck {
/**
 * Code-page detector used to recognise the encoding of text content.
 * Two detectors are registered, in this order: an HTML code-page detector
 * (constructed with {@code false}) and the JChardet facade.
 */
private static cpdetector.io.CodepageDetectorProxy detector;
static {
    detector = cpdetector.io.CodepageDetectorProxy.getInstance();
    detector.add(new cpdetector.io.HTMLCodepageDetector(false));
    detector.add(cpdetector.io.JChardetFacade.getInstance());
}
/**
*
方法说明:得到网页编码格式
*
输入参数:strUrl 网页链接; timeout 超时设置
*
返回类型:网页编码
*/
public static String getEncoding(String strUrl, int timeout) {
String strEncoding = null;
HttpClient client = new HttpClient();
client.getHttpConnectionManager().getParams().setConnectionTimeout(timeout);
GetMethod method = new GetMethod(strUrl);
method.setFollowRedirects( true );
int statusCode;
try {
statusCode = client.executeMethod(method);
if( statusCode != -1) {
//从http头得到网页编码
strEncoding = getContentCharSet(method.getResponseHeader("Content-Type"));
if (strEncoding != null) {
method.releaseConnection();
return strEncoding;
}
//通过解析meta得到网页编码
String strHtml = method.getResponseBodyAsString().toLowerCase();
StringBuffer strBuffer = new StringBuffer();
int pos = JQueryBase.getTagText(strHtml, "
while (strBuffer.length() > 0) {
StringBuffer strEncodingBuffer = new StringBuffer();
JQueryBase.getTagText(strBuffer.toString(), "charset=", "\"", strEncodingBuffer, 0);
if (strEncodingBuffer.length() > 0) {
strEncoding = strEncodingBuffer.toString();
method.releaseConnection();
return strEncoding;
}
strBuffer = new StringBuffer();
pos = JQueryBase.getTagText(strHtml, "
}
//分析字节得到网页编码
strEncoding = getFileEncoding(strUrl, timeout);
//设置默认网页字符编码
if (strEncoding == null)
strEncoding = "GBK";
}
method.releaseConnection();
} catch (Exception e) {
// TODO Auto-generated catch block
System.out.println(e.getClass() + "对" + strUrl + "提取网页编码信息出错");
return null;
}
return strEncoding;
}
/**
 * Reads the {@code charset} parameter from an HTTP Content-Type header.
 *
 * @param contentheader the Content-Type header; may be {@code null}
 * @return the value of the {@code charset} parameter, or {@code null} when the header is
 *         {@code null}, does not consist of exactly one element, or carries no
 *         {@code charset} parameter
 */
protected static String getContentCharSet(Header contentheader) {
    if (contentheader == null) {
        return null;
    }
    HeaderElement[] elements = contentheader.getElements();
    // Only a header made of exactly one element is considered.
    if (elements.length != 1) {
        return null;
    }
    NameValuePair charsetParam = elements[0].getParameterByName("charset");
    return charsetParam == null ? null : charsetParam.getValue();
}
评论: 0 | 引用: 0 | 查看次数: 269
发表评论