0) {StringBuffer strEncodingBuffer = new StringBuffer();JQueryBase.getTagText(strBuffer.toString(), "charset=", "\"", strEncodingBuffer, 0);if (strEncodingBuffer.length() > 0) {strEncoding = strEncodingBuffer.toString();method.releaseConnection();return strEncoding;}strBuffer = new StringBuffer();pos = JQueryBase.getTagText(strHtml, " }//分析字节得到网页编码strEncoding = getFileEncoding(strUrl, timeout);//设置默认网页字符编码if (strEncoding == null)strEncoding = "GBK";}method.releaseConnection();} catch (Exception e) {// TODO Auto-generated catch blockSystem.out.println(e.getClass() + "对" + strUrl + "提取网页编码信息出错");return null;}return strEncoding;}/***方法说明:通过http头得到网页编码信息*输入参数:contentheade rhttp头*返回类型:网页编码*/protected static String getContentCharSet(Header contentheader) {String charset = null;if (contentheader != null) {HeaderElement values[] = contentheader.getElements();if (values.length == 1) {NameValuePair param = values[0].getParameterByName("charset");if (param != null) {charset = param.getValue();}}}return charset;},Java伴侣 - 关于工作、学习与生活" /> 使用java得到网页编码格式 - Java伴侣
10-01
11

使用java得到网页编码格式

package com.tag;

import java.net.MalformedURLException;

import java.net.URL;

import org.apache.commons.httpclient.Header;

import org.apache.commons.httpclient.HeaderElement;

import org.apache.commons.httpclient.HttpClient;

import org.apache.commons.httpclient.NameValuePair;

import org.apache.commons.httpclient.methods.GetMethod;

import toptrack.tools.JQueryBase;

/**

* 得到网页编码格式

* @author dl

*/

public class JHtmlUpdateCheck {

/**
 * Codepage detector proxy used for recognising the encoding of text content.
 * It is obtained through {@code CodepageDetectorProxy.getInstance()} and is
 * configured once by the static initializer below.
 */

private static cpdetector.io.CodepageDetectorProxy detector = cpdetector.io.CodepageDetectorProxy.getInstance();

// Registers two delegate detectors on the proxy, in this order:
// an HTMLCodepageDetector followed by the JChardetFacade instance.
static {

// NOTE(review): the meaning of the 'false' constructor argument is not visible
// in this file - confirm against the cpdetector documentation.
detector.add(new cpdetector.io.HTMLCodepageDetector(false));

detector.add(cpdetector.io.JChardetFacade.getInstance());

}

/**
 * Determines the character encoding of a web page.
 *
 * <p>The page is fetched with an HTTP GET (redirects followed) and the encoding
 * is looked up in this order, returning the first result found:
 * <ol>
 *   <li>the {@code charset} parameter of the {@code Content-Type} response
 *       header, via {@link #getContentCharSet(Header)};</li>
 *   <li>text extracted from the lower-cased response body with
 *       {@code JQueryBase.getTagText}, searching each extracted chunk for the
 *       value between {@code charset=} and the next double quote;</li>
 *   <li>{@code getFileEncoding(strUrl, timeout)};</li>
 *   <li>the hard-coded default {@code "GBK"}.</li>
 * </ol>
 *
 * <p>If {@code executeMethod} yields a status code of -1 the method returns
 * {@code null}. If any exception is thrown, a message is printed to
 * {@code System.out} and {@code null} is returned.
 *
 * @param strUrl  URL of the page to inspect
 * @param timeout value handed to {@code setConnectionTimeout} and to
 *                {@code getFileEncoding}
 * @return the detected encoding name, or {@code null} on failure
 */

public static String getEncoding(String strUrl, int timeout) {

String strEncoding = null;

HttpClient client = new HttpClient();

client.getHttpConnectionManager().getParams().setConnectionTimeout(timeout);

GetMethod method = new GetMethod(strUrl);

method.setFollowRedirects( true );

int statusCode;

try {

statusCode = client.executeMethod(method);

if( statusCode != -1) {

// Step 1: take the encoding from the HTTP Content-Type header.

strEncoding = getContentCharSet(method.getResponseHeader("Content-Type"));

if (strEncoding != null) {

method.releaseConnection();

return strEncoding;

}

// Step 2: take the encoding from the page markup. The body is lower-cased
// first, so a charset found on this path is presumably lower-case as well.

String strHtml = method.getResponseBodyAsString().toLowerCase();

StringBuffer strBuffer = new StringBuffer();

// NOTE(review): the next statement is truncated in this copy of the source
// (the string literal is unterminated and the remaining arguments are missing,
// most likely lost when the page's HTML was stripped). The original arguments
// must be restored before this compiles.
int pos = JQueryBase.getTagText(strHtml, "

// Each pass inspects the chunk currently held in strBuffer; the loop ends
// once an extraction leaves strBuffer empty.
while (strBuffer.length() > 0) {

StringBuffer strEncodingBuffer = new StringBuffer();

// Pull the text between 'charset=' and the following double quote.
JQueryBase.getTagText(strBuffer.toString(), "charset=", "\"", strEncodingBuffer, 0);

if (strEncodingBuffer.length() > 0) {

strEncoding = strEncodingBuffer.toString();

method.releaseConnection();

return strEncoding;

}

strBuffer = new StringBuffer();

// NOTE(review): truncated in the same way as the first getTagText call above;
// presumably it continues the scan from 'pos' - confirm against the original.
pos = JQueryBase.getTagText(strHtml, "

}

// Step 3: detect the encoding by analysing the page bytes.

strEncoding = getFileEncoding(strUrl, timeout);

// Step 4: fall back to the default page encoding.

if (strEncoding == null)

strEncoding = "GBK";

}

method.releaseConnection();

} catch (Exception e) {

// Any failure is reported on stdout and turned into a null result.
// NOTE(review): releaseConnection() is not called on this path.

System.out.println(e.getClass() + "对" + strUrl + "提取网页编码信息出错");

return null;

}

return strEncoding;

}

/**
 * Reads the {@code charset} parameter out of an HTTP header.
 *
 * @param contentheader the header to inspect (callers in this file pass the
 *                      {@code Content-Type} response header); may be {@code null}
 * @return the value of the {@code charset} parameter, or {@code null} when the
 *         header is {@code null}, does not consist of exactly one element, or
 *         carries no {@code charset} parameter
 */
protected static String getContentCharSet(Header contentheader) {
    // No header at all: nothing to extract.
    if (contentheader == null) {
        return null;
    }

    // Only a header made up of exactly one element is considered.
    HeaderElement[] elements = contentheader.getElements();
    if (elements.length != 1) {
        return null;
    }

    NameValuePair charsetParam = elements[0].getParameterByName("charset");
    return charsetParam == null ? null : charsetParam.getValue();
}


文章来自: 本站原创
引用通告: 查看所有引用 | 我要引用此文章
Tags: 编码
相关日志:
评论: 0 | 引用: 0 | 查看次数: -
发表评论
昵 称:
密 码: 游客发言不需要密码.
内 容:
验证码: 验证码
选 项:
虽然发表评论不用注册,但是为了保护您的发言权,建议您注册帐号.