10-01
11

使用 Java 获取网页的编码格式

package com.tag;

import java.net.MalformedURLException;

import java.net.URL;

import org.apache.commons.httpclient.Header;

import org.apache.commons.httpclient.HeaderElement;

import org.apache.commons.httpclient.HttpClient;

import org.apache.commons.httpclient.NameValuePair;

import org.apache.commons.httpclient.methods.GetMethod;

import toptrack.tools.JQueryBase;

/**

* 得到网页编码格式

* @author dl

*/

public class JHtmlUpdateCheck {

/**文本内容编码识别类*/

private static cpdetector.io.CodepageDetectorProxy detector = cpdetector.io.CodepageDetectorProxy.getInstance();

static {

detector.add(new cpdetector.io.HTMLCodepageDetector(false));

detector.add(cpdetector.io.JChardetFacade.getInstance());

}

/**

*
方法说明:得到网页编码格式

*
输入参数:strUrl 网页链接; timeout 超时设置

*
返回类型:网页编码

*/

public static String getEncoding(String strUrl, int timeout) {

String strEncoding = null;

HttpClient client = new HttpClient();

client.getHttpConnectionManager().getParams().setConnectionTimeout(timeout);

GetMethod method = new GetMethod(strUrl);

method.setFollowRedirects( true );

int statusCode;

try {

statusCode = client.executeMethod(method);

if( statusCode != -1) {

//从http头得到网页编码

strEncoding = getContentCharSet(method.getResponseHeader("Content-Type"));

if (strEncoding != null) {

method.releaseConnection();

return strEncoding;

}

//通过解析meta得到网页编码

String strHtml = method.getResponseBodyAsString().toLowerCase();

StringBuffer strBuffer = new StringBuffer();

int pos = JQueryBase.getTagText(strHtml, "

while (strBuffer.length() > 0) {

StringBuffer strEncodingBuffer = new StringBuffer();

JQueryBase.getTagText(strBuffer.toString(), "charset=", "\"", strEncodingBuffer, 0);

if (strEncodingBuffer.length() > 0) {

strEncoding = strEncodingBuffer.toString();

method.releaseConnection();

return strEncoding;

}

strBuffer = new StringBuffer();

pos = JQueryBase.getTagText(strHtml, "

}

//分析字节得到网页编码

strEncoding = getFileEncoding(strUrl, timeout);

//设置默认网页字符编码

if (strEncoding == null)

strEncoding = "GBK";

}

method.releaseConnection();

} catch (Exception e) {

// TODO Auto-generated catch block

System.out.println(e.getClass() + "对" + strUrl + "提取网页编码信息出错");

return null;

}

return strEncoding;

}

/**
 * Extracts the {@code charset} parameter from an HTTP Content-Type header.
 *
 * @param contentheader the Content-Type header; may be {@code null}
 * @return the value of the {@code charset} parameter, or {@code null} when the header is
 *         {@code null}, does not consist of exactly one header element, or carries no
 *         {@code charset} parameter
 */
protected static String getContentCharSet(Header contentheader) {
    if (contentheader == null) {
        return null;
    }

    HeaderElement[] elements = contentheader.getElements();
    // Only a header made of exactly one element is inspected.
    if (elements.length != 1) {
        return null;
    }

    NameValuePair charsetParam = elements[0].getParameterByName("charset");
    return charsetParam == null ? null : charsetParam.getValue();
}


文章来自: 本站原创
引用通告: 查看所有引用 | 我要引用此文章
Tags: 编码
相关日志:
评论: 0 | 引用: 0 | 查看次数: 267
发表评论
昵 称:
密 码: 游客发言不需要密码.
内 容:
验证码: 验证码
选 项:
虽然发表评论不用注册,但是为了保护您的发言权,建议您注册帐号.
字数限制 1000 字 | UBB代码 开启 | [img]标签 关闭