使用java得到网页编码格式

作者:Java伴侣 日期:2010-01-11

字体大小: 小中大

package com.tag;

import java.net.MalformedURLException;

import java.net.URL;

import org.apache.commons.httpclient.Header;

import org.apache.commons.httpclient.HeaderElement;

import org.apache.commons.httpclient.HttpClient;

import org.apache.commons.httpclient.NameValuePair;

import org.apache.commons.httpclient.methods.GetMethod;

import toptrack.tools.JQueryBase;

/**

* 得到网页编码格式

* @author dl

*/

public class JHtmlUpdateCheck {

/**
 * Detector for the encoding of text content (a cpdetector CodepageDetectorProxy instance).
 * It is not referenced by the code visible in this listing; presumably it is used by
 * getFileEncoding, which is not shown here - TODO confirm.
 */

private static cpdetector.io.CodepageDetectorProxy detector = cpdetector.io.CodepageDetectorProxy.getInstance();

static {

// Register the delegate detectors on the proxy in this order:
// first the HTML codepage detector, then the JChardet facade.
detector.add(new cpdetector.io.HTMLCodepageDetector(false));

detector.add(cpdetector.io.JChardetFacade.getInstance());

}

/**
 * Determines the character encoding of a web page.
 *
 * The page is fetched with a GET request (redirects followed) and the encoding is looked up
 * in this order: the charset parameter of the Content-Type response header; a "charset="
 * value extracted from the lower-cased response body via JQueryBase.getTagText; the result
 * of getFileEncoding(strUrl, timeout) (defined outside this listing); and finally the
 * default "GBK".
 *
 * @param strUrl  link of the web page
 * @param timeout timeout setting, passed to setConnectionTimeout on the connection manager
 *                params (presumably milliseconds - TODO confirm against HttpClient docs)
 * @return the page encoding; null if the status code is -1 or if any exception is thrown
 */

public static String getEncoding(String strUrl, int timeout) {

String strEncoding = null;

HttpClient client = new HttpClient();

client.getHttpConnectionManager().getParams().setConnectionTimeout(timeout);

GetMethod method = new GetMethod(strUrl);

method.setFollowRedirects( true );

int statusCode;

try {

statusCode = client.executeMethod(method);

if( statusCode != -1) {

// First try the encoding declared in the HTTP header.

strEncoding = getContentCharSet(method.getResponseHeader("Content-Type"));

if (strEncoding != null) {

method.releaseConnection();

return strEncoding;

}

// Otherwise try to get the encoding by parsing the page's meta information.

String strHtml = method.getResponseBodyAsString().toLowerCase();

StringBuffer strBuffer = new StringBuffer();

// NOTE(review): the statement below is cut off in the middle of a string literal in this
// copy of the source (the matching call at the end of the loop is cut off the same way).
// The remaining arguments are not visible here; restore them from the original code
// before compiling.
int pos = JQueryBase.getTagText(strHtml, "

while (strBuffer.length() > 0) {

StringBuffer strEncodingBuffer = new StringBuffer();

// Extract the text between "charset=" and the next double quote from the current buffer.
JQueryBase.getTagText(strBuffer.toString(), "charset=", "\"", strEncodingBuffer, 0);

if (strEncodingBuffer.length() > 0) {

strEncoding = strEncodingBuffer.toString();

method.releaseConnection();

return strEncoding;

}

strBuffer = new StringBuffer();

// NOTE(review): truncated statement, see the note before the loop.
pos = JQueryBase.getTagText(strHtml, "

}

// Otherwise determine the encoding by analysing the bytes.

strEncoding = getFileEncoding(strUrl, timeout);

// Fall back to the default page encoding.

if (strEncoding == null)

strEncoding = "GBK";

}

method.releaseConnection();

} catch (Exception e) {

// TODO Auto-generated catch block
// NOTE(review): this path returns without calling method.releaseConnection();
// consider moving the release into a finally block.

System.out.println(e.getClass() + "对" + strUrl + "提取网页编码信息出错");

return null;

}

return strEncoding;

}

/**
 * Reads the page encoding from an HTTP header.
 *
 * @param contentheader the HTTP header to inspect (the caller passes the Content-Type
 *                      response header); may be null
 * @return the value of the header's "charset" parameter, or null when the header is null,
 *         does not consist of exactly one header element, or has no "charset" parameter
 */
protected static String getContentCharSet(Header contentheader) {
    if (contentheader == null) {
        return null;
    }

    HeaderElement[] elements = contentheader.getElements();
    // Only a header made of exactly one element is considered.
    if (elements.length != 1) {
        return null;
    }

    NameValuePair charsetParam = elements[0].getParameterByName("charset");
    return charsetParam == null ? null : charsetParam.getValue();
}

文章来自: 本站原创

引用通告: 查看所有引用 | 我要引用此文章

Tags: 编码

相关日志:

评论: 0 | 引用: 0 | 查看次数: 268

发表评论

昵　称:
密　码:	游客发言不需要密码.
内　容:	正在加载编辑器...
验证码:
选　项:	禁止表情转换 | 禁止自动转换链接 | 禁止自动转换关键字 | 记住我的信息,以便下次评论时不用输入用户名.

虽然发表评论不用注册，但是为了保护您的发言权，建议您注册帐号. 字数限制 1000 字 \| UBB代码开启 \| [img]标签关闭

使用java得到网页编码格式

作者:Java伴侣 日期:2010-01-11

Search

SweetTitles

User Panel

Recent Comments

Archive

使用java得到网页编码格式

作者:Java伴侣 日期:2010-01-11

Search

SweetTitles

User Panel

Recent Comments

Archive

作者:Java伴侣 日期:2010-01-11