10-01
16
htmlparser 编码问题
作者:Java伴侣 日期:2010-01-16
在抓取网站时,同一个网站里各个网页的编码方式有时并不统一。遇到这种情况,有些网页可能无法按预期的编码解析,htmlparser会报错而不能正常读取。抛出来的异常为:org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old: [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23
为了做到不管网页采用何种编码方式都能够正常读取数据,我在htmlparser的Page类中加了一个字段(QICHAODEFAULT_CHARSET)。之所以要加在Page类中,是因为htmlparser在遍历所有标签的过程中,会捕获meta标签的content属性,并把其中的字符集传给setEncoding(String charset)方法。
代码如下:
Java代码
// Decompiled by Jad v1.5.8f. Copyright 2001 Pavel Kouznetsov.
// Jad home page: http://www.kpdus.com/jad.html
// Decompiler options: packimports(3)
// Source File Name: Page.java
package org.htmlparser.lexer;
import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.*;
import java.util.zip.*;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.util.ParserException;
// Referenced classes of package org.htmlparser.lexer:
// InputStreamSource, PageIndex, StringSource, Cursor,
// Stream, Source
public class Page
implements Serializable
{
public Page()
{
this("");
}
public Page(URLConnection connection)
throws ParserException
{
if(null == connection)
{
throw new IllegalArgumentException("connection cannot be null");
} else
{
setConnection(connection);
mBaseUrl = null;
return;
}
}
public Page(InputStream stream, String charset)
throws UnsupportedEncodingException
{
if(null == stream)
throw new IllegalArgumentException("stream cannot be null");
if(null == charset)
charset = "ISO-8859-1";
mSource = new InputStreamSource(stream, charset);
mIndex = new PageIndex(this);
mConnection = null;
mUrl = null;
mBaseUrl = null;
}
public Page(String text, String charset)
{
if(null == text)
throw new IllegalArgumentException("text cannot be null");
if(null == charset)
charset = "ISO-8859-1";
mSource = new StringSource(text, charset);
mIndex = new PageIndex(this);
mConnection = null;
mUrl = null;
mBaseUrl = null;
}
public Page(String text)
{
this(text, null);
}
public Page(Source source)
{
if(null == source)
{
throw new IllegalArgumentException("source cannot be null");
} else
{
mSource = source;
mIndex = new PageIndex(this);
mConnection = null;
mUrl = null;
mBaseUrl = null;
return;
}
}
/**
 * Get the connection manager shared by all pages.
 * @return The value of the static mConnectionManager field.
 */
public static ConnectionManager getConnectionManager()
{
return mConnectionManager;
}
/**
 * Replace the connection manager shared by all pages.
 * @param manager The new manager; stored in the static mConnectionManager field.
 */
public static void setConnectionManager(ConnectionManager manager)
{
mConnectionManager = manager;
}
/**
 * Extract the character set name from a Content-Type style string such as
 * <code>text/html; charset=gb2312</code>.
 * The <code>charset</code> parameter name is matched case-insensitively,
 * surrounding whitespace is ignored and one level of double and/or single
 * quotes around the value is removed before it is passed to
 * {@link #findCharset}.
 * @param content The header or meta tag content value; may be null.
 * @return The canonical name of the declared character set. If there is no
 * usable <code>charset=</code> parameter the encoding of the current source
 * is returned, or "ISO-8859-1" when there is no source yet.
 */
public String getCharset(String content)
{
    final String CHARSET_STRING = "charset";
    String ret = (null == mSource) ? "ISO-8859-1" : mSource.getEncoding();
    if (null == content)
        return ret;

    // Locate the parameter name without lower-casing the whole string,
    // which could change its length for some non-ASCII characters.
    int index = -1;
    int last = content.length() - CHARSET_STRING.length();
    for (int i = 0; i <= last; i++)
        if (content.regionMatches(true, i, CHARSET_STRING, 0, CHARSET_STRING.length()))
        {
            index = i;
            break;
        }
    if (-1 == index)
        return ret;

    String value = content.substring(index + CHARSET_STRING.length()).trim();
    if (!value.startsWith("="))
        return ret;
    value = value.substring(1).trim();
    index = value.indexOf(";");
    if (-1 != index)
        value = value.substring(0, index).trim();
    // remove any double quotes from around the charset string
    if (1 < value.length() && value.startsWith("\"") && value.endsWith("\""))
        value = value.substring(1, value.length() - 1);
    // remove any single quotes from around the charset string
    if (1 < value.length() && value.startsWith("'") && value.endsWith("'"))
        value = value.substring(1, value.length() - 1);
    return findCharset(value, ret);
}
/**
 * Look up the canonical name of a character set.
 * Uses {@link java.nio.charset.Charset#forName} directly instead of
 * reflection; every failure of that lookup (null, illegal or unsupported
 * name) is an IllegalArgumentException and selects the fallback.
 * @param name The character set name or alias to look up; may be null.
 * @param fallback The value to return when the name cannot be resolved.
 * @return The canonical name of the character set, or <code>fallback</code>
 * if the lookup fails (a diagnostic line is then written to System.out).
 */
public static String findCharset(String name, String fallback)
{
    String ret;
    try
    {
        ret = java.nio.charset.Charset.forName(name).name();
    }
    catch (IllegalArgumentException iae)
    {
        // covers IllegalCharsetNameException, UnsupportedCharsetException
        // and the plain IllegalArgumentException thrown for a null name
        ret = fallback;
        System.out.println("unable to determine cannonical charset name for " + name + " - using " + fallback);
    }
    return ret;
}
/**
 * Serialize the page.
 * For a page backed by a connection only the connection URL and the number
 * of characters already read are written; the source and the line index
 * are set to null around defaultWriteObject() so they are not serialized.
 * For other pages the fields are written as they are, with the URL written
 * separately ahead of them.
 * All fields that are changed for the duration of the write are restored
 * in a finally block, so this page is left untouched even if writing fails.
 * @param out The object stream to store this object in.
 * @exception IOException If there is a serialization problem.
 */
private void writeObject(ObjectOutputStream out)
    throws IOException
{
    if (null != getConnection())
    {
        out.writeBoolean(true);
        out.writeInt(mSource.offset()); // readObject() re-reads this many characters
        String href = getUrl();
        out.writeObject(href);
        Source source = getSource();
        PageIndex index = mIndex;
        // serialize the connection URL in place of the page URL
        setUrl(getConnection().getURL().toExternalForm());
        mSource = null;
        mIndex = null;
        try
        {
            out.defaultWriteObject();
        }
        finally
        {
            mSource = source;
            mIndex = index;
            setUrl(href);
        }
    }
    else
    {
        out.writeBoolean(false);
        String href = getUrl();
        out.writeObject(href);
        setUrl(null);
        try
        {
            out.defaultWriteObject();
        }
        finally
        {
            setUrl(href);
        }
    }
}
/**
 * Deserialize the page.
 * For a page that was backed by a connection, the connection is reopened
 * from the stored URL and as many characters as had been read at
 * serialization time are read again; afterwards the page URL that was
 * written separately is reinstated.
 * @param in The object stream to read this object from.
 * @exception IOException If there is a deserialization problem. A
 * ParserException raised while reconnecting or re-reading is converted to
 * an IOException that carries it as its cause.
 * @exception ClassNotFoundException If a serialized class cannot be located.
 */
private void readObject(ObjectInputStream in)
    throws IOException, ClassNotFoundException
{
    boolean fromurl = in.readBoolean();
    if (fromurl)
    {
        int offset = in.readInt();
        String href = (String)in.readObject();
        in.defaultReadObject();
        try
        {
            if (null != getUrl())
                setConnection(new URL(getUrl()).openConnection());
            Cursor cursor = new Cursor(this, 0);
            for (int i = 0; i < offset; i++)
                getCharacter(cursor);
        }
        catch (ParserException pe)
        {
            // initCause() keeps the original stack trace attached
            IOException ioe = new IOException(pe.getMessage());
            ioe.initCause(pe);
            throw ioe;
        }
        setUrl(href);
    }
    else
    {
        String href = (String)in.readObject();
        in.defaultReadObject();
        setUrl(href);
    }
}
/**
 * Reset the page: resets the underlying source and replaces the line
 * index with a new, empty PageIndex.
 */
public void reset()
{
getSource().reset();
mIndex = new PageIndex(this);
}
/**
 * Close the page by destroying the source, if there is one.
 * @exception IOException Declared for problems raised while destroying the source.
 */
public void close()
throws IOException
{
if(null != getSource())
getSource().destroy();
}
/**
 * Clean up this page by calling {@link #close}.
 * @exception Throwable Whatever close() throws.
 */
protected void finalize()
throws Throwable
{
close();
}
/**
 * Get the connection, if any.
 * @return The connection this page was created on, or null if the page
 * was built from a stream, string or source.
 */
public URLConnection getConnection()
{
return mConnection;
}
/**
 * Set the URLConnection to be used by this page.
 * Connects (with 6000 ms connect and read timeouts), wraps the response
 * body in a Stream - decompressing it when the Content-Encoding header
 * mentions gzip or deflate - and creates the InputStreamSource, the URL
 * and a new line index for the page.
 * <p>Local modification (2008-12-23): when the character set derived from
 * the Content-Type header is ISO-8859-1, the globally remembered
 * QICHAODEFAULT_CHARSET is used instead. If the chosen character set is
 * not supported, ISO-8859-1 is used.
 * @param connection The connection to use; must not be null.
 * @exception ParserException If connecting or obtaining the input stream fails.
 */
public void setConnection(URLConnection connection)
    throws ParserException
{
    mConnection = connection;
    mConnection.setConnectTimeout(6000);
    mConnection.setReadTimeout(6000);
    try
    {
        getConnection().connect();
    }
    catch (UnknownHostException uhe)
    {
        throw new ParserException("Connect to " + mConnection.getURL().toExternalForm() + " failed.", uhe);
    }
    catch (IOException ioe)
    {
        throw new ParserException("Exception connecting to " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
    }
    String type = getContentType();
    String charset = getCharset(type);
    try
    {
        String contentEncoding = connection.getContentEncoding();
        Stream stream;
        if (null != contentEncoding && -1 != contentEncoding.indexOf("gzip"))
            stream = new Stream(new GZIPInputStream(getConnection().getInputStream()));
        else if (null != contentEncoding && -1 != contentEncoding.indexOf("deflate"))
            stream = new Stream(new InflaterInputStream(getConnection().getInputStream(), new Inflater(true)));
        else
            stream = new Stream(getConnection().getInputStream());

        // An ISO-8859-1 result is treated as "unknown" and replaced by the
        // remembered charset, unless that has been set to null.
        if (charset.indexOf("ISO-8859-1") != -1)
        {
            String remembered = getQICHAODEFAULT_CHARSET();
            if (null != remembered)
                charset = remembered;
        }
        try
        {
            mSource = new InputStreamSource(stream, charset);
        }
        catch (UnsupportedEncodingException uee)
        {
            charset = "ISO-8859-1";
            mSource = new InputStreamSource(stream, charset);
        }
    }
    catch (IOException ioe)
    {
        throw new ParserException("Exception getting input stream from " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
    }
    mUrl = connection.getURL().toExternalForm();
    mIndex = new PageIndex(this);
}
/**
 * Get the URL for this page.
 * @return The value of mUrl; null for pages not created from a connection
 * unless setUrl() has been called.
 */
public String getUrl()
{
return mUrl;
}
/**
 * Set the URL for this page. Only the stored string changes; the source
 * and connection are not touched.
 * @param url The new URL.
 */
public void setUrl(String url)
{
mUrl = url;
}
/**
 * Get the base URL for this page.
 * @return The value of mBaseUrl, which the constructors leave as null.
 */
public String getBaseUrl()
{
return mBaseUrl;
}
/**
 * Set the base URL for this page; getAbsoluteURL() prefers it over the page URL.
 * @param url The new base URL.
 */
public void setBaseUrl(String url)
{
mBaseUrl = url;
}
/**
 * Get the source this page is reading from.
 * @return The current source.
 */
public Source getSource()
{
return mSource;
}
/**
 * Get the content type of this page.
 * @return The raw value of the <code>Content-Type</code> header of the
 * connection, or "text/html" when there is no connection or the header
 * is absent.
 */
public String getContentType()
{
    URLConnection connection = getConnection();
    if (null == connection)
        return "text/html";
    String header = connection.getHeaderField("Content-Type");
    return (null == header) ? "text/html" : header;
}
/**
 * Read the character at the cursor position and advance the cursor.
 * A carriage return, or a carriage return immediately followed by a line
 * feed, is returned as a single line feed with the cursor moved past both
 * characters. Every line feed returned is recorded in the line index.
 * @param cursor The position to read at; advanced past what was consumed.
 * @return The character read, or '\uFFFF' when the source is exhausted
 * (the cursor is not advanced in that case).
 * @exception ParserException If the source cannot be read, or the cursor
 * is beyond the number of characters read from the source so far.
 */
public char getCharacter(Cursor cursor)
throws ParserException
{
int i = cursor.getPosition();
int offset = mSource.offset();
char ret;
// cursor is at the end of what has been read so far: read a new character
if(offset == i)
try
{
i = mSource.read();
if(-1 == i)
{
// end of input
ret = '\uFFFF';
} else
{
ret = (char)i;
cursor.advance();
}
}
catch(IOException ioe)
{
throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
}
else
// cursor is behind the source offset: fetch the character by position
if(offset > i)
{
try
{
ret = mSource.getCharacter(i);
}
catch(IOException ioe)
{
throw new ParserException("can't read a character at position " + i, ioe);
}
cursor.advance();
} else
{
// cursor is ahead of anything read from the source
throw new ParserException("attempt to read future characters from source " + i + " > " + mSource.offset());
}
// normalize CR and CR LF to a single LF
if('\r' == ret)
{
ret = '\n';
// the character after the CR has not been read yet: read ahead one
if(mSource.offset() == cursor.getPosition())
try
{
i = mSource.read();
if(-1 != i)
if('\n' == (char)i)
// CR LF pair: skip the LF as well
cursor.advance();
else
// lone CR: push the look-ahead character back
try
{
mSource.unread();
}
catch(IOException ioe)
{
throw new ParserException("can't unread a character at position " + cursor.getPosition(), ioe);
}
}
catch(IOException ioe)
{
throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
}
else
// the following character was read earlier: inspect it by position
try
{
if('\n' == mSource.getCharacter(cursor.getPosition()))
cursor.advance();
}
catch(IOException ioe)
{
throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
}
}
// record the cursor (now just past the line break) in the line index
if('\n' == ret)
mIndex.add(cursor);
return ret;
}
/**
 * Step the cursor back over the character it last passed.
 * If that character is a line feed directly preceded by a carriage
 * return, the cursor is moved back over both, mirroring the way
 * getCharacter() consumes such a pair as one character.
 * @param cursor The position to move back.
 * @exception ParserException If the source cannot be read at the new position.
 */
public void ungetCharacter(Cursor cursor)
    throws ParserException
{
    cursor.retreat();
    int position = cursor.getPosition();
    try
    {
        boolean lineFeed = '\n' == mSource.getCharacter(position);
        if (lineFeed && 0 != position && '\r' == mSource.getCharacter(position - 1))
            cursor.retreat();
    }
    catch (IOException ioe)
    {
        throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
    }
}
/**
 * Get the encoding currently in use.
 * @return Whatever the source reports as its encoding.
 */
public String getEncoding()
{
return getSource().getEncoding();
}
/**
 * Set the encoding of the source, and remember it as the global default.
 * @param character_set The character set name to switch to.
 * @exception ParserException Propagated from the source's setEncoding().
 */
public void setEncoding(String character_set)
throws ParserException
{
// QICHAODEFAULT_CHARSET is a static field, so this assignment affects every
// Page, and it happens before (and regardless of whether) the source accepts
// the new encoding.
// NOTE(review): this looks deliberate - setConnection() substitutes the
// remembered value for ISO-8859-1 on later pages - confirm before changing.
this.QICHAODEFAULT_CHARSET = character_set;
getSource().setEncoding(character_set);
}
/**
 * Build a URL from the link and base, in non-strict mode.
 * @param link The (relative) URI.
 * @param base The base URL of the page.
 * @return The result of constructUrl(link, base, false).
 * @exception MalformedURLException If creating the URL fails.
 */
public URL constructUrl(String link, String base)
throws MalformedURLException
{
return constructUrl(link, base, false);
}
/**
 * Build a URL from the link and base.
 * In non-strict mode a link that consists only of a query
 * (<code>?a=b</code>) replaces the query of the base. For links that do
 * not start with a slash, leading <code>/../</code> and <code>/./</code>
 * segments left over in the resolved path are removed, and every
 * <code>/\</code> in the path is reduced to <code>/</code>.
 * An empty link resolves to the base URL. Path segments that merely begin
 * with a dot (<code>/.hidden/x</code>) are left alone.
 * @param link The (relative) URI; must not be null.
 * @param base The base URL of the page.
 * @param strict If <code>true</code> a link starting with '?' is resolved
 * like any other relative link.
 * @return The resolved URL.
 * @exception MalformedURLException If creating the URL fails.
 */
public URL constructUrl(String link, String base, boolean strict)
    throws MalformedURLException
{
    URL url;
    // startsWith() rather than charAt(0) so an empty link does not throw
    if (!strict && link.startsWith("?"))
    {
        int query = base.lastIndexOf('?');
        if (-1 != query)
            base = base.substring(0, query);
        url = new URL(base + link);
    }
    else
        url = new URL(new URL(base), link);

    String path = url.getFile();
    boolean modified = false;
    if (!link.startsWith("/"))
        // fix incorrect relative links; only the ones at the start of the path
        while (true)
        {
            if (path.startsWith("/../"))
                path = path.substring(3);
            else if (path.startsWith("/./") || isBareDotSegment(path))
                path = path.substring(2);
            else
                break;
            modified = true;
        }
    // fix backslashes
    int index;
    while (-1 != (index = path.indexOf("/\\")))
    {
        path = path.substring(0, index + 1) + path.substring(index + 2);
        modified = true;
    }
    if (modified)
        url = new URL(url, path);
    return url;
}

/**
 * Test whether the path starts with a "/." or "/.." segment that is
 * followed only by the end of the path or by the query.
 */
private static boolean isBareDotSegment(String path)
{
    int end;
    if (path.startsWith("/.."))
        end = 3;
    else if (path.startsWith("/."))
        end = 2;
    else
        return false;
    return end == path.length() || '?' == path.charAt(end);
}
/**
 * Create an absolute URL from a relative link, in non-strict mode.
 * @param link The reference to make absolute.
 * @return The result of getAbsoluteURL(link, false).
 */
public String getAbsoluteURL(String link)
{
    return getAbsoluteURL(link, false);
}

/**
 * Create an absolute URL from a relative link.
 * The link is resolved against the base URL, or against the page URL when
 * no base URL is set.
 * @param link The reference to make absolute.
 * @param strict Passed through to constructUrl().
 * @return The empty string for a null or empty link; the link itself when
 * neither a base URL nor a page URL is known or when resolving it fails;
 * otherwise the external form of the resolved URL.
 */
public String getAbsoluteURL(String link, boolean strict)
{
    if (null == link || "".equals(link))
        return "";
    try
    {
        String base = getBaseUrl();
        if (null == base)
            base = getUrl();
        if (null == base)
            return link;
        return constructUrl(link, base, strict).toExternalForm();
    }
    catch (MalformedURLException murle)
    {
        // an unresolvable link is handed back unchanged
        return link;
    }
}
/**
 * Get the line number for a cursor, as reported by the line index.
 * @param cursor The character offset into the page.
 * @return The result of mIndex.row(cursor).
 */
public int row(Cursor cursor)
{
return mIndex.row(cursor);
}
/**
 * Get the line number for a character position, as reported by the line index.
 * @param position The character offset into the page.
 * @return The result of mIndex.row(position).
 */
public int row(int position)
{
return mIndex.row(position);
}
/**
 * Get the column number for a cursor, as reported by the line index.
 * @param cursor The character offset into the page.
 * @return The result of mIndex.column(cursor).
 */
public int column(Cursor cursor)
{
return mIndex.column(cursor);
}
/**
 * Get the column number for a character position, as reported by the line index.
 * @param position The character offset into the page.
 * @return The result of mIndex.column(position).
 */
public int column(int position)
{
return mIndex.column(position);
}
/**
 * Get the text between two positions.
 * @param start The starting position of the text.
 * @param end The position just past the end of the text.
 * @return The <code>end - start</code> characters beginning at <code>start</code>.
 * @exception IllegalArgumentException If the source fails to supply the
 * characters; the underlying IOException is attached as the cause.
 */
public String getText(int start, int end)
    throws IllegalArgumentException
{
    try
    {
        return mSource.getString(start, end - start);
    }
    catch (IOException ioe)
    {
        throw new IllegalArgumentException("can't get the " + (end - start) + " characters at position " + start + " - " + ioe.getMessage(), ioe);
    }
}

/**
 * Append the text between two positions to a buffer.
 * The positions may be given in either order.
 * @param buffer The buffer to append the text to.
 * @param start One end of the range.
 * @param end The other end of the range.
 * @exception IllegalArgumentException If either position lies beyond what
 * has been read from the source so far, or the source fails to supply the
 * characters.
 */
public void getText(StringBuffer buffer, int start, int end)
    throws IllegalArgumentException
{
    if (mSource.offset() < start || mSource.offset() < end)
        throw new IllegalArgumentException("attempt to extract future characters from source " + start + "|" + end + " > " + mSource.offset());
    if (end < start)
    {
        int swap = end;
        end = start;
        start = swap;
    }
    int length = end - start;
    try
    {
        mSource.getCharacters(buffer, start, length);
    }
    catch (IOException ioe)
    {
        throw new IllegalArgumentException("can't get the " + length + " characters at position " + start + " - " + ioe.getMessage(), ioe);
    }
}

/**
 * Get all the text read from the source so far.
 * @return The text from position 0 up to the current source offset.
 */
public String getText()
{
    return getText(0, mSource.offset());
}

/**
 * Append all the text read from the source so far to a buffer.
 * @param buffer The buffer to append the text to.
 */
public void getText(StringBuffer buffer)
{
    getText(buffer, 0, mSource.offset());
}

/**
 * Copy the text between two positions into a character array.
 * The positions may be given in either order.
 * @param array The array to copy the characters into.
 * @param offset The index in the array at which to start storing.
 * @param start One end of the range.
 * @param end The other end of the range.
 * @exception IllegalArgumentException If either position lies beyond what
 * has been read from the source so far, or the source fails to supply the
 * characters.
 */
public void getText(char array[], int offset, int start, int end)
    throws IllegalArgumentException
{
    if (mSource.offset() < start || mSource.offset() < end)
        throw new IllegalArgumentException("attempt to extract future characters from source");
    if (end < start)
    {
        int swap = end;
        end = start;
        start = swap;
    }
    try
    {
        mSource.getCharacters(array, offset, start, end);
    }
    catch (IOException ioe)
    {
        throw new IllegalArgumentException("can't get the " + (end - start) + " characters at position " + start + " - " + ioe.getMessage(), ioe);
    }
}
/**
 * Get the text of the line containing the cursor.
 * The line boundaries are taken from the line index; when the index has
 * no entry after the line, the text runs up to the current source offset.
 * @param cursor The position to get the line for.
 * @return The text of the line.
 */
public String getLine(Cursor cursor)
{
    int line = row(cursor);
    int size = mIndex.size();
    int start;
    int end;
    if (line < size)
    {
        start = mIndex.elementAt(line);
        // only indices below size are valid, so the entry for the next line
        // exists only when line + 1 < size
        int next = line + 1;
        end = (next < size) ? mIndex.elementAt(next) : mSource.offset();
    }
    else
    {
        // NOTE(review): with nothing before this line in the index (line 0)
        // the line is assumed to start at position 0 - confirm against PageIndex
        start = (0 < line) ? mIndex.elementAt(line - 1) : 0;
        end = mSource.offset();
    }
    return getText(start, end);
}

/**
 * Get the text of the line containing a character position.
 * @param position The position to get the line for.
 * @return The text of the line.
 */
public String getLine(int position)
{
    return getLine(new Cursor(this, position));
}
/**
 * Describe the page by the text most recently read from it.
 * @return Up to the last 40 characters read from the source, prefixed
 * with "..." when earlier text was left out; the default
 * Object.toString() when nothing has been read yet.
 */
public String toString()
{
    int offset = mSource.offset();
    if (offset <= 0)
        return super.toString();
    StringBuffer tail = new StringBuffer(43);
    int start = offset - 40;
    if (start < 0)
        start = 0;
    else
        tail.append("...");
    getText(tail, start, offset);
    return tail.toString();
}
/** The character set name used when none is specified: "ISO-8859-1". */
public static final String DEFAULT_CHARSET = "ISO-8859-1";
/**
 * Character set that setConnection() substitutes when the Content-Type
 * header yields ISO-8859-1. Initially "gb2312"; overwritten by every call
 * to setEncoding(). The field is static and not final, so the value is
 * shared by all pages.
 */
public static String QICHAODEFAULT_CHARSET = "gb2312";
/** The content type reported when there is no connection or header: "text/html". */
public static final String DEFAULT_CONTENT_TYPE = "text/html";
/** The end-of-input character (65535, i.e. '\uFFFF'), as returned by getCharacter(). */
public static final char EOF = 65535;
/** The URL this page is coming from; null unless set by setConnection() or setUrl(). */
protected String mUrl;
/** The base URL for this page; preferred over mUrl by getAbsoluteURL(). */
protected String mBaseUrl;
/** The source of characters. */
protected Source mSource;
/** The line index consulted by row(), column() and getLine(). */
protected PageIndex mIndex;
/** The connection this page is coming from, or null; transient, so never serialized. */
protected transient URLConnection mConnection;
/** The connection manager shared by all pages. */
protected static ConnectionManager mConnectionManager = new ConnectionManager();
/**
 * Get the globally remembered character set.
 * @return The current value of the static QICHAODEFAULT_CHARSET field.
 */
public static String getQICHAODEFAULT_CHARSET() {
return QICHAODEFAULT_CHARSET;
}
}
// Decompiled by Jad v1.5.8f. Copyright 2001 Pavel Kouznetsov.
// Jad home page: http://www.kpdus.com/jad.html
// Decompiler options: packimports(3)
// Source File Name: Page.java
package org.htmlparser.lexer;
import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.*;
import java.util.zip.*;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.util.ParserException;
// Referenced classes of package org.htmlparser.lexer:
// InputStreamSource, PageIndex, StringSource, Cursor,
// Stream, Source
public class Page
implements Serializable
{
public Page()
{
this("");
}
public Page(URLConnection connection)
throws ParserException
{
if(null == connection)
{
throw new IllegalArgumentException("connection cannot be null");
} else
{
setConnection(connection);
mBaseUrl = null;
return;
}
}
public Page(InputStream stream, String charset)
throws UnsupportedEncodingException
{
if(null == stream)
throw new IllegalArgumentException("stream cannot be null");
if(null == charset)
charset = "ISO-8859-1";
mSource = new InputStreamSource(stream, charset);
mIndex = new PageIndex(this);
mConnection = null;
mUrl = null;
mBaseUrl = null;
}
public Page(String text, String charset)
{
if(null == text)
throw new IllegalArgumentException("text cannot be null");
if(null == charset)
charset = "ISO-8859-1";
mSource = new StringSource(text, charset);
mIndex = new PageIndex(this);
mConnection = null;
mUrl = null;
mBaseUrl = null;
}
public Page(String text)
{
this(text, null);
}
public Page(Source source)
{
if(null == source)
{
throw new IllegalArgumentException("source cannot be null");
} else
{
mSource = source;
mIndex = new PageIndex(this);
mConnection = null;
mUrl = null;
mBaseUrl = null;
return;
}
}
public static ConnectionManager getConnectionManager()
{
return mConnectionManager;
}
public static void setConnectionManager(ConnectionManager manager)
{
mConnectionManager = manager;
}
public String getCharset(String content)
{
String CHARSET_STRING = "charset";
String ret;
if(null == mSource)
ret = "ISO-8859-1";
else
ret = mSource.getEncoding();
if(null != content)
{
int index = content.indexOf("charset");
if(index != -1)
{
content = content.substring(index + "charset".length()).trim();
if(content.startsWith("="))
{
content = content.substring(1).trim();
index = content.indexOf(";");
if(index != -1)
content = content.substring(0, index);
if(content.startsWith("\"") && content.endsWith("\"") && 1 < content.length())
content = content.substring(1, content.length() - 1);
if(content.startsWith("'") && content.endsWith("'") && 1 < content.length())
content = content.substring(1, content.length() - 1);
ret = findCharset(content, ret);
}
}
}
return ret;
}
/**
 * Look up the canonical name of a character set.
 * Uses {@link java.nio.charset.Charset#forName} directly instead of
 * reflection; every failure of that lookup (null, illegal or unsupported
 * name) is an IllegalArgumentException and selects the fallback.
 * @param name The character set name or alias to look up; may be null.
 * @param fallback The value to return when the name cannot be resolved.
 * @return The canonical name of the character set, or <code>fallback</code>
 * if the lookup fails (a diagnostic line is then written to System.out).
 */
public static String findCharset(String name, String fallback)
{
    String ret;
    try
    {
        ret = java.nio.charset.Charset.forName(name).name();
    }
    catch (IllegalArgumentException iae)
    {
        // covers IllegalCharsetNameException, UnsupportedCharsetException
        // and the plain IllegalArgumentException thrown for a null name
        ret = fallback;
        System.out.println("unable to determine cannonical charset name for " + name + " - using " + fallback);
    }
    return ret;
}
/**
 * Serialize the page.
 * For a page backed by a connection only the connection URL and the number
 * of characters already read are written; the source and the line index
 * are set to null around defaultWriteObject() so they are not serialized.
 * For other pages the fields are written as they are, with the URL written
 * separately ahead of them.
 * All fields that are changed for the duration of the write are restored
 * in a finally block, so this page is left untouched even if writing fails.
 * @param out The object stream to store this object in.
 * @exception IOException If there is a serialization problem.
 */
private void writeObject(ObjectOutputStream out)
    throws IOException
{
    if (null != getConnection())
    {
        out.writeBoolean(true);
        out.writeInt(mSource.offset()); // readObject() re-reads this many characters
        String href = getUrl();
        out.writeObject(href);
        Source source = getSource();
        PageIndex index = mIndex;
        // serialize the connection URL in place of the page URL
        setUrl(getConnection().getURL().toExternalForm());
        mSource = null;
        mIndex = null;
        try
        {
            out.defaultWriteObject();
        }
        finally
        {
            mSource = source;
            mIndex = index;
            setUrl(href);
        }
    }
    else
    {
        out.writeBoolean(false);
        String href = getUrl();
        out.writeObject(href);
        setUrl(null);
        try
        {
            out.defaultWriteObject();
        }
        finally
        {
            setUrl(href);
        }
    }
}
/**
 * Deserialize the page.
 * For a page that was backed by a connection, the connection is reopened
 * from the stored URL and as many characters as had been read at
 * serialization time are read again; afterwards the page URL that was
 * written separately is reinstated.
 * @param in The object stream to read this object from.
 * @exception IOException If there is a deserialization problem. A
 * ParserException raised while reconnecting or re-reading is converted to
 * an IOException that carries it as its cause.
 * @exception ClassNotFoundException If a serialized class cannot be located.
 */
private void readObject(ObjectInputStream in)
    throws IOException, ClassNotFoundException
{
    boolean fromurl = in.readBoolean();
    if (fromurl)
    {
        int offset = in.readInt();
        String href = (String)in.readObject();
        in.defaultReadObject();
        try
        {
            if (null != getUrl())
                setConnection(new URL(getUrl()).openConnection());
            Cursor cursor = new Cursor(this, 0);
            for (int i = 0; i < offset; i++)
                getCharacter(cursor);
        }
        catch (ParserException pe)
        {
            // initCause() keeps the original stack trace attached
            IOException ioe = new IOException(pe.getMessage());
            ioe.initCause(pe);
            throw ioe;
        }
        setUrl(href);
    }
    else
    {
        String href = (String)in.readObject();
        in.defaultReadObject();
        setUrl(href);
    }
}
public void reset()
{
getSource().reset();
mIndex = new PageIndex(this);
}
public void close()
throws IOException
{
if(null != getSource())
getSource().destroy();
}
protected void finalize()
throws Throwable
{
close();
}
public URLConnection getConnection()
{
return mConnection;
}
public void setConnection(URLConnection connection)
throws ParserException
{
mConnection = connection;
mConnection.setConnectTimeout(6000);
mConnection.setReadTimeout(6000);
try
{
getConnection().connect();
}
catch(UnknownHostException uhe)
{
throw new ParserException("Connect to " + mConnection.getURL().toExternalForm() + " failed.", uhe);
}
catch(IOException ioe)
{
throw new ParserException("Exception connecting to " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
}
String type = getContentType();
String charset = getCharset(type);
try
{
String contentEncoding = connection.getContentEncoding();
System.out.println("contentEncoding="+contentEncoding);
Stream stream;
if(null != contentEncoding && -1 != contentEncoding.indexOf("gzip"))
stream = new Stream(new GZIPInputStream(getConnection().getInputStream()));
else
if(null != contentEncoding && -1 != contentEncoding.indexOf("deflate"))
stream = new Stream(new InflaterInputStream(getConnection().getInputStream(), new Inflater(true)));
else{
stream = new Stream(getConnection().getInputStream());
}
try
{
/*
* 时间:2008年12月23日
* 原因:当String charset = getCharset(type);返回来的是ISO-8859-1的时候,需要处理一下
*/
if(charset.indexOf("ISO-8859-1")!=-1){
charset = getQICHAODEFAULT_CHARSET();
}
mSource = new InputStreamSource(stream, charset);
}
catch(UnsupportedEncodingException uee)
{
charset = "ISO-8859-1";
mSource = new InputStreamSource(stream, charset);
}
}
catch(IOException ioe)
{
throw new ParserException("Exception getting input stream from " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
}
mUrl = connection.getURL().toExternalForm();
mIndex = new PageIndex(this);
}
public String getUrl()
{
return mUrl;
}
public void setUrl(String url)
{
mUrl = url;
}
public String getBaseUrl()
{
return mBaseUrl;
}
public void setBaseUrl(String url)
{
mBaseUrl = url;
}
public Source getSource()
{
return mSource;
}
public String getContentType()
{
String ret = "text/html";
URLConnection connection = getConnection();
if(null != connection)
{
String content = connection.getHeaderField("Content-Type");
if(null != content)
ret = content;
}
return ret;
}
public char getCharacter(Cursor cursor)
throws ParserException
{
int i = cursor.getPosition();
int offset = mSource.offset();
char ret;
if(offset == i)
try
{
i = mSource.read();
if(-1 == i)
{
ret = '\uFFFF';
} else
{
ret = (char)i;
cursor.advance();
}
}
catch(IOException ioe)
{
throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
}
else
if(offset > i)
{
try
{
ret = mSource.getCharacter(i);
}
catch(IOException ioe)
{
throw new ParserException("can't read a character at position " + i, ioe);
}
cursor.advance();
} else
{
throw new ParserException("attempt to read future characters from source " + i + " > " + mSource.offset());
}
if('\r' == ret)
{
ret = '\n';
if(mSource.offset() == cursor.getPosition())
try
{
i = mSource.read();
if(-1 != i)
if('\n' == (char)i)
cursor.advance();
else
try
{
mSource.unread();
}
catch(IOException ioe)
{
throw new ParserException("can't unread a character at position " + cursor.getPosition(), ioe);
}
}
catch(IOException ioe)
{
throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
}
else
try
{
if('\n' == mSource.getCharacter(cursor.getPosition()))
cursor.advance();
}
catch(IOException ioe)
{
throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
}
}
if('\n' == ret)
mIndex.add(cursor);
return ret;
}
public void ungetCharacter(Cursor cursor)
throws ParserException
{
cursor.retreat();
int i = cursor.getPosition();
try
{
char ch = mSource.getCharacter(i);
if('\n' == ch && 0 != i)
{
ch = mSource.getCharacter(i - 1);
if('\r' == ch)
cursor.retreat();
}
}
catch(IOException ioe)
{
throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
}
}
public String getEncoding()
{
return getSource().getEncoding();
}
public void setEncoding(String character_set)
throws ParserException
{
this.QICHAODEFAULT_CHARSET = character_set;
getSource().setEncoding(character_set);
}
public URL constructUrl(String link, String base)
throws MalformedURLException
{
return constructUrl(link, base, false);
}
/**
 * Build a URL from the link and base.
 * In non-strict mode a link that consists only of a query
 * (<code>?a=b</code>) replaces the query of the base. For links that do
 * not start with a slash, leading <code>/../</code> and <code>/./</code>
 * segments left over in the resolved path are removed, and every
 * <code>/\</code> in the path is reduced to <code>/</code>.
 * An empty link resolves to the base URL. Path segments that merely begin
 * with a dot (<code>/.hidden/x</code>) are left alone.
 * @param link The (relative) URI; must not be null.
 * @param base The base URL of the page.
 * @param strict If <code>true</code> a link starting with '?' is resolved
 * like any other relative link.
 * @return The resolved URL.
 * @exception MalformedURLException If creating the URL fails.
 */
public URL constructUrl(String link, String base, boolean strict)
    throws MalformedURLException
{
    URL url;
    // startsWith() rather than charAt(0) so an empty link does not throw
    if (!strict && link.startsWith("?"))
    {
        int query = base.lastIndexOf('?');
        if (-1 != query)
            base = base.substring(0, query);
        url = new URL(base + link);
    }
    else
        url = new URL(new URL(base), link);

    String path = url.getFile();
    boolean modified = false;
    if (!link.startsWith("/"))
        // fix incorrect relative links; only the ones at the start of the path
        while (true)
        {
            if (path.startsWith("/../"))
                path = path.substring(3);
            else if (path.startsWith("/./") || isBareDotSegment(path))
                path = path.substring(2);
            else
                break;
            modified = true;
        }
    // fix backslashes
    int index;
    while (-1 != (index = path.indexOf("/\\")))
    {
        path = path.substring(0, index + 1) + path.substring(index + 2);
        modified = true;
    }
    if (modified)
        url = new URL(url, path);
    return url;
}

/**
 * Test whether the path starts with a "/." or "/.." segment that is
 * followed only by the end of the path or by the query.
 */
private static boolean isBareDotSegment(String path)
{
    int end;
    if (path.startsWith("/.."))
        end = 3;
    else if (path.startsWith("/."))
        end = 2;
    else
        return false;
    return end == path.length() || '?' == path.charAt(end);
}
public String getAbsoluteURL(String link)
{
return getAbsoluteURL(link, false);
}
public String getAbsoluteURL(String link, boolean strict)
{
String ret;
if(null == link || "".equals(link))
ret = "";
else
try
{
String base = getBaseUrl();
if(null == base)
base = getUrl();
if(null == base)
{
ret = link;
} else
{
URL url = constructUrl(link, base, strict);
ret = url.toExternalForm();
}
}
catch(MalformedURLException murle)
{
ret = link;
}
return ret;
}
public int row(Cursor cursor)
{
return mIndex.row(cursor);
}
public int row(int position)
{
return mIndex.row(position);
}
public int column(Cursor cursor)
{
return mIndex.column(cursor);
}
public int column(int position)
{
return mIndex.column(position);
}
public String getText(int start, int end)
throws IllegalArgumentException
{
String ret;
try
{
ret = mSource.getString(start, end - start);
}
catch(IOException ioe)
{
throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
}
return ret;
}
public void getText(StringBuffer buffer, int start, int end)
throws IllegalArgumentException
{
if(mSource.offset() < start || mSource.offset() < end)
throw new IllegalArgumentException("attempt to extract future characters from source" + start + "|" + end + " > " + mSource.offset());
int length;
if(end < start)
{
length = end;
end = start;
start = length;
}
length = end - start;
try
{
mSource.getCharacters(buffer, start, length);
}
catch(IOException ioe)
{
throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
}
}
public String getText()
{
return getText(0, mSource.offset());
}
public void getText(StringBuffer buffer)
{
getText(buffer, 0, mSource.offset());
}
public void getText(char array[], int offset, int start, int end)
throws IllegalArgumentException
{
if(mSource.offset() < start || mSource.offset() < end)
throw new IllegalArgumentException("attempt to extract future characters from source");
int length;
if(end < start)
{
length = end;
end = start;
start = length;
}
length = end - start;
try
{
mSource.getCharacters(array, offset, start, end);
}
catch(IOException ioe)
{
throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
}
}
/**
 * Get the text of the line containing the cursor.
 * The line boundaries are taken from the line index; when the index has
 * no entry after the line, the text runs up to the current source offset.
 * @param cursor The position to get the line for.
 * @return The text of the line.
 */
public String getLine(Cursor cursor)
{
    int line = row(cursor);
    int size = mIndex.size();
    int start;
    int end;
    if (line < size)
    {
        start = mIndex.elementAt(line);
        // only indices below size are valid, so the entry for the next line
        // exists only when line + 1 < size
        int next = line + 1;
        end = (next < size) ? mIndex.elementAt(next) : mSource.offset();
    }
    else
    {
        // NOTE(review): with nothing before this line in the index (line 0)
        // the line is assumed to start at position 0 - confirm against PageIndex
        start = (0 < line) ? mIndex.elementAt(line - 1) : 0;
        end = mSource.offset();
    }
    return getText(start, end);
}

/**
 * Get the text of the line containing a character position.
 * @param position The position to get the line for.
 * @return The text of the line.
 */
public String getLine(int position)
{
    return getLine(new Cursor(this, position));
}
public String toString()
{
String ret;
if(mSource.offset() > 0)
{
StringBuffer buffer = new StringBuffer(43);
int start = mSource.offset() - 40;
if(0 > start)
start = 0;
else
buffer.append("...");
getText(buffer, start, mSource.offset());
ret = buffer.toString();
} else
{
ret = super.toString();
}
return ret;
}
public static final String DEFAULT_CHARSET = "ISO-8859-1";
public static String QICHAODEFAULT_CHARSET = "gb2312";
public static final String DEFAULT_CONTENT_TYPE = "text/html";
public static final char EOF = 65535;
protected String mUrl;
protected String mBaseUrl;
protected Source mSource;
protected PageIndex mIndex;
protected transient URLConnection mConnection;
protected static ConnectionManager mConnectionManager = new ConnectionManager();
public static String getQICHAODEFAULT_CHARSET() {
return QICHAODEFAULT_CHARSET;
}
}
在调用的时候,代码如下:
Java代码
// Create the parser for the URL, then explicitly set its encoding to the
// charset currently remembered in Page.QICHAODEFAULT_CHARSET.
Parser parser = new Parser(url);
// getQICHAODEFAULT_CHARSET() is declared static in Page, so the value does not
// depend on which page object it is called through.
parser.setEncoding(parser.getLexer().getPage().getQICHAODEFAULT_CHARSET());
Parser parser = new Parser(url);
parser.setEncoding(parser.getLexer().getPage().getQICHAODEFAULT_CHARSET());
一般情况下,这样设置应该是没有问题的。但是,你在页面里看到的编码方式并不一定就是该网页实际使用的编码方式。比如说,肉眼看到页面中有<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />,htmlparser可以正常地获取到它,代码如下:
Java代码
/**
 * Apply the character set declared by this tag to the page.
 * Only acts when the http-equiv value equals "Content-Type" (ignoring
 * case): the CONTENT attribute is passed to the page's getCharset() and
 * the result to the page's setEncoding().
 * @exception ParserException Propagated from the page's setEncoding().
 */
public void doSemanticAction()
    throws ParserException
{
    if (!"Content-Type".equalsIgnoreCase(getHttpEquiv()))
        return;
    String charset = getPage().getCharset(getAttribute("CONTENT"));
    getPage().setEncoding(charset);
}
public void doSemanticAction()
throws ParserException
{
String httpEquiv = getHttpEquiv();
if("Content-Type".equalsIgnoreCase(httpEquiv))
{
String charset = getPage().getCharset(getAttribute("CONTENT"));
getPage().setEncoding(charset);
}
}
但是,不要以为这就是它最终采用的编码方式。htmlparser还会再做一次判断:在Page类中,有一个方法用来获取HTTP响应头字段Content-Type,代码如下:
Java代码
/**
 * Get the content type of this page.
 * @return The raw value of the <code>Content-Type</code> header of the
 * connection, or "text/html" when there is no connection or the header
 * is absent.
 */
public String getContentType()
{
    URLConnection connection = getConnection();
    if (null == connection)
        return "text/html";
    String header = connection.getHeaderField("Content-Type");
    return (null == header) ? "text/html" : header;
}
public String getContentType()
{
String ret = "text/html";
URLConnection connection = getConnection();
if(null != connection)
{
String content = connection.getHeaderField("Content-Type");
if(null != content)
ret = content;
}
return ret;
}
htmlparser会把这两处得到的编码方式进行比较,如果不一致,就会抛出如下异常:
Java代码
org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old: [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23
org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old: [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23
所以,我在InputStreamSource类中又做了一次修改:
代码如下:
Java代码
/**
 * Set the encoding, but only if the source does not have one yet.
 * Local modification (2008-12-23): an encoding that is already in effect
 * always wins and the request is ignored, which is what prevents the
 * EncodingChangeException for pages whose meta tag disagrees with the
 * encoding they were opened with.
 * When no encoding is in effect, the stream is reset, a reader for the
 * requested character set is installed, and the characters consumed so
 * far are read again and compared with what was read before.
 * @param character_set The character set to use to convert bytes into characters.
 * @exception ParserException If the stream cannot be reset or re-read, or
 * (as EncodingChangeException) if a character read again differs from the
 * one read previously.
 */
public void setEncoding(String character_set)
    throws ParserException
{
    String encoding = getEncoding();
    // keep whatever encoding is already in use; also avoids calling
    // equalsIgnoreCase() on a null encoding
    if (null != encoding)
        return;

    InputStream stream = getStream();
    try
    {
        char buffer[] = mBuffer;
        int offset = mOffset;
        stream.reset();
        try
        {
            // create the reader first so that a bad charset name does not
            // leave mEncoding pointing at an encoding that is not in use
            mReader = new InputStreamReader(stream, character_set);
            mEncoding = character_set;
            mBuffer = new char[mBuffer.length];
            mLevel = 0;
            mOffset = 0;
            mMark = -1;
            if (0 != offset)
            {
                char new_chars[] = new char[offset];
                if (offset != read(new_chars))
                    throw new ParserException("reset stream failed");
                for (int i = 0; i < offset; i++)
                    if (new_chars[i] != buffer[i])
                        throw new EncodingChangeException("character mismatch (new: " + new_chars[i] + " [0x" + Integer.toString(new_chars[i], 16) + "] != old: " + " [0x" + Integer.toString(buffer[i], 16) + buffer[i] + "]) for encoding change from " + encoding + " to " + character_set + " at character offset " + i);
            }
        }
        catch (IOException ioe)
        {
            throw new ParserException(ioe.getMessage(), ioe);
        }
    }
    catch (IOException ioe)
    {
        // only stream.reset() can end up here
        throw new ParserException("Stream reset failed (" + ioe.getMessage() + "), try wrapping it with a org.htmlparser.lexer.Stream", ioe);
    }
}
public void setEncoding(String character_set)
throws ParserException
{
String encoding = getEncoding();
/**
* time:2008年12月23日
*/
if(encoding!=null){
character_set = encoding;
}
if(!encoding.equalsIgnoreCase(character_set))
{
InputStream stream = getStream();
try
{
char buffer[] = mBuffer;
int offset = mOffset;
stream.reset();
try
{
mEncoding = character_set;
mReader = new InputStreamReader(stream, character_set);
mBuffer = new char[mBuffer.length];
mLevel = 0;
mOffset = 0;
mMark = -1;
if(0 != offset)
{
char new_chars[] = new char[offset];
if(offset != read(new_chars))
throw new ParserException("reset stream failed");
for(int i = 0; i < offset; i++)
if(new_chars[i] != buffer[i])
throw new EncodingChangeException("character mismatch (new: " + new_chars[i] + " [0x" + Integer.toString(new_chars[i], 16) + "] != old: " + " [0x" + Integer.toString(buffer[i], 16) + buffer[i] + "]) for encoding change from " + encoding + " to " + character_set + " at character offset " + i);
}
}
catch(IOException ioe)
{
throw new ParserException(ioe.getMessage(), ioe);
}
}
catch(IOException ioe)
{
throw new ParserException("Stream reset failed (" + ioe.getMessage() + "), try wrapping it with a org.htmlparser.lexer.Stream", ioe);
}
}
}
这样一来,不管网页采用哪种编码方式,应该都可以正常读取了。
为了做到不管网页采用何种编码方式都能够正常读取数据,我在htmlparser的Page类中加了一个字段(QICHAODEFAULT_CHARSET)。之所以要加在Page类中,是因为htmlparser在遍历所有标签的过程中,会捕获meta标签的content属性,并把其中的字符集传给setEncoding(String charset)方法。
代码如下:
Java代码
// Decompiled by Jad v1.5.8f. Copyright 2001 Pavel Kouznetsov.
// Jad home page: http://www.kpdus.com/jad.html
// Decompiler options: packimports(3)
// Source File Name: Page.java
package org.htmlparser.lexer;
import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.*;
import java.util.zip.*;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.util.ParserException;
// Referenced classes of package org.htmlparser.lexer:
// InputStreamSource, PageIndex, StringSource, Cursor,
// Stream, Source
public class Page
implements Serializable
{
public Page()
{
this("");
}
public Page(URLConnection connection)
throws ParserException
{
if(null == connection)
{
throw new IllegalArgumentException("connection cannot be null");
} else
{
setConnection(connection);
mBaseUrl = null;
return;
}
}
public Page(InputStream stream, String charset)
throws UnsupportedEncodingException
{
if(null == stream)
throw new IllegalArgumentException("stream cannot be null");
if(null == charset)
charset = "ISO-8859-1";
mSource = new InputStreamSource(stream, charset);
mIndex = new PageIndex(this);
mConnection = null;
mUrl = null;
mBaseUrl = null;
}
public Page(String text, String charset)
{
if(null == text)
throw new IllegalArgumentException("text cannot be null");
if(null == charset)
charset = "ISO-8859-1";
mSource = new StringSource(text, charset);
mIndex = new PageIndex(this);
mConnection = null;
mUrl = null;
mBaseUrl = null;
}
public Page(String text)
{
this(text, null);
}
public Page(Source source)
{
if(null == source)
{
throw new IllegalArgumentException("source cannot be null");
} else
{
mSource = source;
mIndex = new PageIndex(this);
mConnection = null;
mUrl = null;
mBaseUrl = null;
return;
}
}
public static ConnectionManager getConnectionManager()
{
return mConnectionManager;
}
public static void setConnectionManager(ConnectionManager manager)
{
mConnectionManager = manager;
}
/**
 * Extracts the character set named in a content-type string.
 *
 * <p>Looks for a <code>charset=value</code> parameter in text such as
 * <code>text/html; charset=gb2312</code>. The parameter name is matched
 * without regard to case, surrounding whitespace and one pair of single or
 * double quotes around the value are removed, and the value is then
 * canonicalized by {@link #findCharset(String, String)}.
 *
 * @param content the content-type text to examine, may be <code>null</code>
 * @return the canonical name of the declared character set; when none is
 *         declared or it cannot be resolved, the encoding of the current
 *         source, or "ISO-8859-1" if there is no source yet
 */
public String getCharset(String content)
{
    final String token = "charset";
    String ret;
    if(null == mSource)
        ret = "ISO-8859-1";
    else
        ret = mSource.getEncoding();
    if(null != content)
    {
        // Case-insensitive search; regionMatches keeps indices valid for the
        // original string (lower-casing a copy could change its length).
        int index = -1;
        for(int i = 0; i + token.length() <= content.length(); i++)
            if(content.regionMatches(true, i, token, 0, token.length()))
            {
                index = i;
                break;
            }
        if(index != -1)
        {
            content = content.substring(index + token.length()).trim();
            if(content.startsWith("="))
            {
                content = content.substring(1).trim();
                // Drop any further parameters and the blanks before the ';'.
                index = content.indexOf(";");
                if(index != -1)
                    content = content.substring(0, index).trim();
                if(content.startsWith("\"") && content.endsWith("\"") && 1 < content.length())
                    content = content.substring(1, content.length() - 1);
                if(content.startsWith("'") && content.endsWith("'") && 1 < content.length())
                    content = content.substring(1, content.length() - 1);
                // An empty value can never name a charset; keep the fallback.
                if(0 != content.length())
                    ret = findCharset(content, ret);
            }
        }
    }
    return ret;
}
/**
 * Resolves a character set name or alias to its canonical name.
 *
 * @param name the character set name or alias to look up
 * @param fallback the value to return when <code>name</code> is not a
 *        legal or supported character set name
 * @return the canonical name reported by
 *         <code>java.nio.charset.Charset</code>, or <code>fallback</code>
 */
public static String findCharset(String name, String fallback)
{
    String ret;
    try
    {
        // Direct call instead of reflection: java.nio.charset is always
        // present on the Java versions this class already requires.
        ret = java.nio.charset.Charset.forName(name).name();
    }
    catch(IllegalArgumentException iae)
    {
        // Covers null, illegal and unsupported charset names.
        ret = fallback;
        System.out.println("unable to determine cannonical charset name for " + name + " - using " + fallback);
    }
    return ret;
}
private void writeObject(ObjectOutputStream out)
throws IOException
{
if(null != getConnection())
{
out.writeBoolean(true);
out.writeInt(mSource.offset());
String href = getUrl();
out.writeObject(href);
setUrl(getConnection().getURL().toExternalForm());
Source source = getSource();
mSource = null;
PageIndex index = mIndex;
mIndex = null;
out.defaultWriteObject();
mSource = source;
mIndex = index;
} else
{
out.writeBoolean(false);
String href = getUrl();
out.writeObject(href);
setUrl(null);
out.defaultWriteObject();
setUrl(href);
}
}
private void readObject(ObjectInputStream in)
throws IOException, ClassNotFoundException
{
boolean fromurl = in.readBoolean();
if(fromurl)
{
int offset = in.readInt();
String href = (String)in.readObject();
in.defaultReadObject();
if(null != getUrl())
{
URL url = new URL(getUrl());
try
{
setConnection(url.openConnection());
}
catch(ParserException pe)
{
throw new IOException(pe.getMessage());
}
}
Cursor cursor = new Cursor(this, 0);
for(int i = 0; i < offset; i++)
try
{
getCharacter(cursor);
}
catch(ParserException pe)
{
throw new IOException(pe.getMessage());
}
setUrl(href);
} else
{
String href = (String)in.readObject();
in.defaultReadObject();
setUrl(href);
}
}
public void reset()
{
getSource().reset();
mIndex = new PageIndex(this);
}
public void close()
throws IOException
{
if(null != getSource())
getSource().destroy();
}
protected void finalize()
throws Throwable
{
close();
}
public URLConnection getConnection()
{
return mConnection;
}
/**
 * Attaches this page to a connection: connects, wraps the (possibly
 * compressed) response stream and creates the character source for it.
 *
 * <p>Connect and read timeouts are both set to 6000 milliseconds. The
 * character set comes from the Content-Type header via
 * {@link #getCharset(String)}; when that yields ISO-8859-1 the value of
 * {@link #getQICHAODEFAULT_CHARSET()} is used instead (workaround dated
 * 2008-12-23).
 *
 * @param connection the connection to read the page from
 * @throws IllegalArgumentException if <code>connection</code> is null
 * @throws ParserException if connecting or opening the input stream fails
 */
public void setConnection(URLConnection connection)
    throws ParserException
{
    // Reject null before touching any state, as the constructor does.
    if(null == connection)
        throw new IllegalArgumentException("connection cannot be null");
    mConnection = connection;
    mConnection.setConnectTimeout(6000);
    mConnection.setReadTimeout(6000);
    try
    {
        getConnection().connect();
    }
    catch(UnknownHostException uhe)
    {
        throw new ParserException("Connect to " + mConnection.getURL().toExternalForm() + " failed.", uhe);
    }
    catch(IOException ioe)
    {
        throw new ParserException("Exception connecting to " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
    }
    String type = getContentType();
    String charset = getCharset(type);
    try
    {
        // Content-coding tokens are case-insensitive, so compare in lower case.
        String contentEncoding = connection.getContentEncoding();
        String coding = (null == contentEncoding) ? "" : contentEncoding.toLowerCase(java.util.Locale.ENGLISH);
        Stream stream;
        if(-1 != coding.indexOf("gzip"))
            stream = new Stream(new GZIPInputStream(getConnection().getInputStream()));
        else if(-1 != coding.indexOf("deflate"))
            // Inflater(true): raw deflate data without the zlib wrapper.
            stream = new Stream(new InflaterInputStream(getConnection().getInputStream(), new Inflater(true)));
        else
            stream = new Stream(getConnection().getInputStream());
        try
        {
            // 2008-12-23: when getCharset(type) comes back as ISO-8859-1,
            // substitute the configurable default; never substitute a null.
            String preferred = getQICHAODEFAULT_CHARSET();
            if((null == charset || -1 != charset.indexOf("ISO-8859-1")) && null != preferred)
                charset = preferred;
            mSource = new InputStreamSource(stream, charset);
        }
        catch(UnsupportedEncodingException uee)
        {
            // Unknown charset name: fall back to ISO-8859-1.
            charset = "ISO-8859-1";
            mSource = new InputStreamSource(stream, charset);
        }
    }
    catch(IOException ioe)
    {
        throw new ParserException("Exception getting input stream from " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
    }
    mUrl = connection.getURL().toExternalForm();
    mIndex = new PageIndex(this);
}
public String getUrl()
{
return mUrl;
}
public void setUrl(String url)
{
mUrl = url;
}
public String getBaseUrl()
{
return mBaseUrl;
}
public void setBaseUrl(String url)
{
mBaseUrl = url;
}
public Source getSource()
{
return mSource;
}
/**
 * Reports the content type of this page.
 *
 * @return the value of the connection's "Content-Type" header field, or
 *         "text/html" when there is no connection or no such header
 */
public String getContentType()
{
    URLConnection link = getConnection();
    String header = (null == link) ? null : link.getHeaderField("Content-Type");
    return (null == header) ? "text/html" : header;
}
public char getCharacter(Cursor cursor)
throws ParserException
{
int i = cursor.getPosition();
int offset = mSource.offset();
char ret;
if(offset == i)
try
{
i = mSource.read();
if(-1 == i)
{
ret = '\uFFFF';
} else
{
ret = (char)i;
cursor.advance();
}
}
catch(IOException ioe)
{
throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
}
else
if(offset > i)
{
try
{
ret = mSource.getCharacter(i);
}
catch(IOException ioe)
{
throw new ParserException("can't read a character at position " + i, ioe);
}
cursor.advance();
} else
{
throw new ParserException("attempt to read future characters from source " + i + " > " + mSource.offset());
}
if('\r' == ret)
{
ret = '\n';
if(mSource.offset() == cursor.getPosition())
try
{
i = mSource.read();
if(-1 != i)
if('\n' == (char)i)
cursor.advance();
else
try
{
mSource.unread();
}
catch(IOException ioe)
{
throw new ParserException("can't unread a character at position " + cursor.getPosition(), ioe);
}
}
catch(IOException ioe)
{
throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
}
else
try
{
if('\n' == mSource.getCharacter(cursor.getPosition()))
cursor.advance();
}
catch(IOException ioe)
{
throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
}
}
if('\n' == ret)
mIndex.add(cursor);
return ret;
}
public void ungetCharacter(Cursor cursor)
throws ParserException
{
cursor.retreat();
int i = cursor.getPosition();
try
{
char ch = mSource.getCharacter(i);
if('\n' == ch && 0 != i)
{
ch = mSource.getCharacter(i - 1);
if('\r' == ch)
cursor.retreat();
}
}
catch(IOException ioe)
{
throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
}
}
public String getEncoding()
{
return getSource().getEncoding();
}
/**
 * Records the given character set as the class-wide default that
 * <code>setConnection</code> substitutes for ISO-8859-1, then asks the
 * current source to switch to it.
 *
 * <p>The default is a static field, so it is shared by every
 * <code>Page</code> instance.
 *
 * @param character_set the character set name to use; a <code>null</code>
 *        value leaves the recorded default unchanged
 * @throws ParserException if the source rejects the encoding change
 */
public void setEncoding(String character_set)
    throws ParserException
{
    // Never replace the shared default with null: later connections would
    // have no charset to fall back on.
    if(null != character_set)
        QICHAODEFAULT_CHARSET = character_set;
    getSource().setEncoding(character_set);
}
public URL constructUrl(String link, String base)
throws MalformedURLException
{
return constructUrl(link, base, false);
}
/**
 * Builds a URL from a link and the base URL it was found on.
 *
 * <p>In non-strict mode a link that starts with '?' replaces the query of
 * the base instead of being resolved as a relative path. For links that do
 * not start with '/', leading "/../" and "/./" segments left over by the
 * resolution are removed, and any backslash directly following a slash is
 * dropped.
 *
 * @param link the (possibly relative) link text
 * @param base the URL the link is relative to
 * @param strict if <code>true</code>, a leading '?' gets no special handling
 * @return the resolved URL
 * @throws MalformedURLException if either string cannot be parsed as a URL
 */
public URL constructUrl(String link, String base, boolean strict)
    throws MalformedURLException
{
    int index;
    URL url;
    // startsWith instead of charAt(0): an empty link must not throw.
    if(!strict && link.startsWith("?"))
    {
        if(-1 != (index = base.lastIndexOf('?')))
            base = base.substring(0, index);
        url = new URL(base + link);
    }
    else
        url = new URL(new URL(base), link);
    String path = url.getFile();
    boolean modified = false;
    boolean absolute = link.startsWith("/");
    if(!absolute)
        while(path.startsWith("/."))
        {
            if(path.startsWith("/../"))
                path = path.substring(3);
            else if(path.startsWith("/./") || path.equals("/.") || path.equals("/.."))
                path = path.substring(2);
            else
                // A name that merely begins with a dot (e.g. "/.well-known")
                // is a real path segment and must be left alone.
                break;
            modified = true;
        }
    // Remove a backslash that directly follows a slash.
    while(-1 != (index = path.indexOf("/\\")))
    {
        path = path.substring(0, index + 1) + path.substring(index + 2);
        modified = true;
    }
    if(modified)
        url = new URL(url, path);
    return url;
}
public String getAbsoluteURL(String link)
{
return getAbsoluteURL(link, false);
}
public String getAbsoluteURL(String link, boolean strict)
{
String ret;
if(null == link || "".equals(link))
ret = "";
else
try
{
String base = getBaseUrl();
if(null == base)
base = getUrl();
if(null == base)
{
ret = link;
} else
{
URL url = constructUrl(link, base, strict);
ret = url.toExternalForm();
}
}
catch(MalformedURLException murle)
{
ret = link;
}
return ret;
}
public int row(Cursor cursor)
{
return mIndex.row(cursor);
}
public int row(int position)
{
return mIndex.row(position);
}
public int column(Cursor cursor)
{
return mIndex.column(cursor);
}
public int column(int position)
{
return mIndex.column(position);
}
public String getText(int start, int end)
throws IllegalArgumentException
{
String ret;
try
{
ret = mSource.getString(start, end - start);
}
catch(IOException ioe)
{
throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
}
return ret;
}
public void getText(StringBuffer buffer, int start, int end)
throws IllegalArgumentException
{
if(mSource.offset() < start || mSource.offset() < end)
throw new IllegalArgumentException("attempt to extract future characters from source" + start + "|" + end + " > " + mSource.offset());
int length;
if(end < start)
{
length = end;
end = start;
start = length;
}
length = end - start;
try
{
mSource.getCharacters(buffer, start, length);
}
catch(IOException ioe)
{
throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
}
}
public String getText()
{
return getText(0, mSource.offset());
}
public void getText(StringBuffer buffer)
{
getText(buffer, 0, mSource.offset());
}
public void getText(char array[], int offset, int start, int end)
throws IllegalArgumentException
{
if(mSource.offset() < start || mSource.offset() < end)
throw new IllegalArgumentException("attempt to extract future characters from source");
int length;
if(end < start)
{
length = end;
end = start;
start = length;
}
length = end - start;
try
{
mSource.getCharacters(array, offset, start, end);
}
catch(IOException ioe)
{
throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
}
}
/**
 * Returns the text of the line that contains the given cursor.
 *
 * @param cursor the position whose line is wanted
 * @return the text between the line's start offset and either the next
 *         recorded line start or the current end of the source
 */
public String getLine(Cursor cursor)
{
    int line = row(cursor);
    int size = mIndex.size();
    int start;
    int end;
    if(line < size)
    {
        start = mIndex.elementAt(line);
        // Strictly less than: the entry after the last one does not exist,
        // so the last indexed line runs to the end of what has been read.
        if(++line < size)
            end = mIndex.elementAt(line);
        else
            end = mSource.offset();
    }
    else
    {
        // NOTE(review): with an empty index this asks for elementAt(-1);
        // confirm how PageIndex handles that case.
        start = mIndex.elementAt(line - 1);
        end = mSource.offset();
    }
    return getText(start, end);
}
public String getLine(int position)
{
return getLine(new Cursor(this, position));
}
public String toString()
{
String ret;
if(mSource.offset() > 0)
{
StringBuffer buffer = new StringBuffer(43);
int start = mSource.offset() - 40;
if(0 > start)
start = 0;
else
buffer.append("...");
getText(buffer, start, mSource.offset());
ret = buffer.toString();
} else
{
ret = super.toString();
}
return ret;
}
public static final String DEFAULT_CHARSET = "ISO-8859-1";
public static String QICHAODEFAULT_CHARSET = "gb2312";
public static final String DEFAULT_CONTENT_TYPE = "text/html";
public static final char EOF = 65535;
protected String mUrl;
protected String mBaseUrl;
protected Source mSource;
protected PageIndex mIndex;
protected transient URLConnection mConnection;
protected static ConnectionManager mConnectionManager = new ConnectionManager();
public static String getQICHAODEFAULT_CHARSET() {
return QICHAODEFAULT_CHARSET;
}
}
// Decompiled by Jad v1.5.8f. Copyright 2001 Pavel Kouznetsov.
// Jad home page: http://www.kpdus.com/jad.html
// Decompiler options: packimports(3)
// Source File Name: Page.java
package org.htmlparser.lexer;
import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.*;
import java.util.zip.*;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.util.ParserException;
// Referenced classes of package org.htmlparser.lexer:
// InputStreamSource, PageIndex, StringSource, Cursor,
// Stream, Source
public class Page
implements Serializable
{
public Page()
{
this("");
}
public Page(URLConnection connection)
throws ParserException
{
if(null == connection)
{
throw new IllegalArgumentException("connection cannot be null");
} else
{
setConnection(connection);
mBaseUrl = null;
return;
}
}
public Page(InputStream stream, String charset)
throws UnsupportedEncodingException
{
if(null == stream)
throw new IllegalArgumentException("stream cannot be null");
if(null == charset)
charset = "ISO-8859-1";
mSource = new InputStreamSource(stream, charset);
mIndex = new PageIndex(this);
mConnection = null;
mUrl = null;
mBaseUrl = null;
}
public Page(String text, String charset)
{
if(null == text)
throw new IllegalArgumentException("text cannot be null");
if(null == charset)
charset = "ISO-8859-1";
mSource = new StringSource(text, charset);
mIndex = new PageIndex(this);
mConnection = null;
mUrl = null;
mBaseUrl = null;
}
public Page(String text)
{
this(text, null);
}
public Page(Source source)
{
if(null == source)
{
throw new IllegalArgumentException("source cannot be null");
} else
{
mSource = source;
mIndex = new PageIndex(this);
mConnection = null;
mUrl = null;
mBaseUrl = null;
return;
}
}
public static ConnectionManager getConnectionManager()
{
return mConnectionManager;
}
public static void setConnectionManager(ConnectionManager manager)
{
mConnectionManager = manager;
}
/**
 * Extracts the character set named in a content-type string.
 *
 * <p>Looks for a <code>charset=value</code> parameter in text such as
 * <code>text/html; charset=gb2312</code>. The parameter name is matched
 * without regard to case, surrounding whitespace and one pair of single or
 * double quotes around the value are removed, and the value is then
 * canonicalized by {@link #findCharset(String, String)}.
 *
 * @param content the content-type text to examine, may be <code>null</code>
 * @return the canonical name of the declared character set; when none is
 *         declared or it cannot be resolved, the encoding of the current
 *         source, or "ISO-8859-1" if there is no source yet
 */
public String getCharset(String content)
{
    final String token = "charset";
    String ret;
    if(null == mSource)
        ret = "ISO-8859-1";
    else
        ret = mSource.getEncoding();
    if(null != content)
    {
        // Case-insensitive search; regionMatches keeps indices valid for the
        // original string (lower-casing a copy could change its length).
        int index = -1;
        for(int i = 0; i + token.length() <= content.length(); i++)
            if(content.regionMatches(true, i, token, 0, token.length()))
            {
                index = i;
                break;
            }
        if(index != -1)
        {
            content = content.substring(index + token.length()).trim();
            if(content.startsWith("="))
            {
                content = content.substring(1).trim();
                // Drop any further parameters and the blanks before the ';'.
                index = content.indexOf(";");
                if(index != -1)
                    content = content.substring(0, index).trim();
                if(content.startsWith("\"") && content.endsWith("\"") && 1 < content.length())
                    content = content.substring(1, content.length() - 1);
                if(content.startsWith("'") && content.endsWith("'") && 1 < content.length())
                    content = content.substring(1, content.length() - 1);
                // An empty value can never name a charset; keep the fallback.
                if(0 != content.length())
                    ret = findCharset(content, ret);
            }
        }
    }
    return ret;
}
/**
 * Resolves a character set name or alias to its canonical name.
 *
 * @param name the character set name or alias to look up
 * @param fallback the value to return when <code>name</code> is not a
 *        legal or supported character set name
 * @return the canonical name reported by
 *         <code>java.nio.charset.Charset</code>, or <code>fallback</code>
 */
public static String findCharset(String name, String fallback)
{
    String ret;
    try
    {
        // Direct call instead of reflection: java.nio.charset is always
        // present on the Java versions this class already requires.
        ret = java.nio.charset.Charset.forName(name).name();
    }
    catch(IllegalArgumentException iae)
    {
        // Covers null, illegal and unsupported charset names.
        ret = fallback;
        System.out.println("unable to determine cannonical charset name for " + name + " - using " + fallback);
    }
    return ret;
}
private void writeObject(ObjectOutputStream out)
throws IOException
{
if(null != getConnection())
{
out.writeBoolean(true);
out.writeInt(mSource.offset());
String href = getUrl();
out.writeObject(href);
setUrl(getConnection().getURL().toExternalForm());
Source source = getSource();
mSource = null;
PageIndex index = mIndex;
mIndex = null;
out.defaultWriteObject();
mSource = source;
mIndex = index;
} else
{
out.writeBoolean(false);
String href = getUrl();
out.writeObject(href);
setUrl(null);
out.defaultWriteObject();
setUrl(href);
}
}
private void readObject(ObjectInputStream in)
throws IOException, ClassNotFoundException
{
boolean fromurl = in.readBoolean();
if(fromurl)
{
int offset = in.readInt();
String href = (String)in.readObject();
in.defaultReadObject();
if(null != getUrl())
{
URL url = new URL(getUrl());
try
{
setConnection(url.openConnection());
}
catch(ParserException pe)
{
throw new IOException(pe.getMessage());
}
}
Cursor cursor = new Cursor(this, 0);
for(int i = 0; i < offset; i++)
try
{
getCharacter(cursor);
}
catch(ParserException pe)
{
throw new IOException(pe.getMessage());
}
setUrl(href);
} else
{
String href = (String)in.readObject();
in.defaultReadObject();
setUrl(href);
}
}
public void reset()
{
getSource().reset();
mIndex = new PageIndex(this);
}
public void close()
throws IOException
{
if(null != getSource())
getSource().destroy();
}
protected void finalize()
throws Throwable
{
close();
}
public URLConnection getConnection()
{
return mConnection;
}
/**
 * Attaches this page to a connection: connects, wraps the (possibly
 * compressed) response stream and creates the character source for it.
 *
 * <p>Connect and read timeouts are both set to 6000 milliseconds. The
 * character set comes from the Content-Type header via
 * {@link #getCharset(String)}; when that yields ISO-8859-1 the value of
 * {@link #getQICHAODEFAULT_CHARSET()} is used instead (workaround dated
 * 2008-12-23).
 *
 * @param connection the connection to read the page from
 * @throws IllegalArgumentException if <code>connection</code> is null
 * @throws ParserException if connecting or opening the input stream fails
 */
public void setConnection(URLConnection connection)
    throws ParserException
{
    // Reject null before touching any state, as the constructor does.
    if(null == connection)
        throw new IllegalArgumentException("connection cannot be null");
    mConnection = connection;
    mConnection.setConnectTimeout(6000);
    mConnection.setReadTimeout(6000);
    try
    {
        getConnection().connect();
    }
    catch(UnknownHostException uhe)
    {
        throw new ParserException("Connect to " + mConnection.getURL().toExternalForm() + " failed.", uhe);
    }
    catch(IOException ioe)
    {
        throw new ParserException("Exception connecting to " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
    }
    String type = getContentType();
    String charset = getCharset(type);
    try
    {
        // Content-coding tokens are case-insensitive, so compare in lower case.
        String contentEncoding = connection.getContentEncoding();
        String coding = (null == contentEncoding) ? "" : contentEncoding.toLowerCase(java.util.Locale.ENGLISH);
        Stream stream;
        if(-1 != coding.indexOf("gzip"))
            stream = new Stream(new GZIPInputStream(getConnection().getInputStream()));
        else if(-1 != coding.indexOf("deflate"))
            // Inflater(true): raw deflate data without the zlib wrapper.
            stream = new Stream(new InflaterInputStream(getConnection().getInputStream(), new Inflater(true)));
        else
            stream = new Stream(getConnection().getInputStream());
        try
        {
            // 2008-12-23: when getCharset(type) comes back as ISO-8859-1,
            // substitute the configurable default; never substitute a null.
            String preferred = getQICHAODEFAULT_CHARSET();
            if((null == charset || -1 != charset.indexOf("ISO-8859-1")) && null != preferred)
                charset = preferred;
            mSource = new InputStreamSource(stream, charset);
        }
        catch(UnsupportedEncodingException uee)
        {
            // Unknown charset name: fall back to ISO-8859-1.
            charset = "ISO-8859-1";
            mSource = new InputStreamSource(stream, charset);
        }
    }
    catch(IOException ioe)
    {
        throw new ParserException("Exception getting input stream from " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
    }
    mUrl = connection.getURL().toExternalForm();
    mIndex = new PageIndex(this);
}
public String getUrl()
{
return mUrl;
}
public void setUrl(String url)
{
mUrl = url;
}
public String getBaseUrl()
{
return mBaseUrl;
}
public void setBaseUrl(String url)
{
mBaseUrl = url;
}
public Source getSource()
{
return mSource;
}
/**
 * Reports the content type of this page.
 *
 * @return the value of the connection's "Content-Type" header field, or
 *         "text/html" when there is no connection or no such header
 */
public String getContentType()
{
    URLConnection link = getConnection();
    String header = (null == link) ? null : link.getHeaderField("Content-Type");
    return (null == header) ? "text/html" : header;
}
public char getCharacter(Cursor cursor)
throws ParserException
{
int i = cursor.getPosition();
int offset = mSource.offset();
char ret;
if(offset == i)
try
{
i = mSource.read();
if(-1 == i)
{
ret = '\uFFFF';
} else
{
ret = (char)i;
cursor.advance();
}
}
catch(IOException ioe)
{
throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
}
else
if(offset > i)
{
try
{
ret = mSource.getCharacter(i);
}
catch(IOException ioe)
{
throw new ParserException("can't read a character at position " + i, ioe);
}
cursor.advance();
} else
{
throw new ParserException("attempt to read future characters from source " + i + " > " + mSource.offset());
}
if('\r' == ret)
{
ret = '\n';
if(mSource.offset() == cursor.getPosition())
try
{
i = mSource.read();
if(-1 != i)
if('\n' == (char)i)
cursor.advance();
else
try
{
mSource.unread();
}
catch(IOException ioe)
{
throw new ParserException("can't unread a character at position " + cursor.getPosition(), ioe);
}
}
catch(IOException ioe)
{
throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
}
else
try
{
if('\n' == mSource.getCharacter(cursor.getPosition()))
cursor.advance();
}
catch(IOException ioe)
{
throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
}
}
if('\n' == ret)
mIndex.add(cursor);
return ret;
}
public void ungetCharacter(Cursor cursor)
throws ParserException
{
cursor.retreat();
int i = cursor.getPosition();
try
{
char ch = mSource.getCharacter(i);
if('\n' == ch && 0 != i)
{
ch = mSource.getCharacter(i - 1);
if('\r' == ch)
cursor.retreat();
}
}
catch(IOException ioe)
{
throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
}
}
public String getEncoding()
{
return getSource().getEncoding();
}
/**
 * Records the given character set as the class-wide default that
 * <code>setConnection</code> substitutes for ISO-8859-1, then asks the
 * current source to switch to it.
 *
 * <p>The default is a static field, so it is shared by every
 * <code>Page</code> instance.
 *
 * @param character_set the character set name to use; a <code>null</code>
 *        value leaves the recorded default unchanged
 * @throws ParserException if the source rejects the encoding change
 */
public void setEncoding(String character_set)
    throws ParserException
{
    // Never replace the shared default with null: later connections would
    // have no charset to fall back on.
    if(null != character_set)
        QICHAODEFAULT_CHARSET = character_set;
    getSource().setEncoding(character_set);
}
public URL constructUrl(String link, String base)
throws MalformedURLException
{
return constructUrl(link, base, false);
}
/**
 * Builds a URL from a link and the base URL it was found on.
 *
 * <p>In non-strict mode a link that starts with '?' replaces the query of
 * the base instead of being resolved as a relative path. For links that do
 * not start with '/', leading "/../" and "/./" segments left over by the
 * resolution are removed, and any backslash directly following a slash is
 * dropped.
 *
 * @param link the (possibly relative) link text
 * @param base the URL the link is relative to
 * @param strict if <code>true</code>, a leading '?' gets no special handling
 * @return the resolved URL
 * @throws MalformedURLException if either string cannot be parsed as a URL
 */
public URL constructUrl(String link, String base, boolean strict)
    throws MalformedURLException
{
    int index;
    URL url;
    // startsWith instead of charAt(0): an empty link must not throw.
    if(!strict && link.startsWith("?"))
    {
        if(-1 != (index = base.lastIndexOf('?')))
            base = base.substring(0, index);
        url = new URL(base + link);
    }
    else
        url = new URL(new URL(base), link);
    String path = url.getFile();
    boolean modified = false;
    boolean absolute = link.startsWith("/");
    if(!absolute)
        while(path.startsWith("/."))
        {
            if(path.startsWith("/../"))
                path = path.substring(3);
            else if(path.startsWith("/./") || path.equals("/.") || path.equals("/.."))
                path = path.substring(2);
            else
                // A name that merely begins with a dot (e.g. "/.well-known")
                // is a real path segment and must be left alone.
                break;
            modified = true;
        }
    // Remove a backslash that directly follows a slash.
    while(-1 != (index = path.indexOf("/\\")))
    {
        path = path.substring(0, index + 1) + path.substring(index + 2);
        modified = true;
    }
    if(modified)
        url = new URL(url, path);
    return url;
}
public String getAbsoluteURL(String link)
{
return getAbsoluteURL(link, false);
}
public String getAbsoluteURL(String link, boolean strict)
{
String ret;
if(null == link || "".equals(link))
ret = "";
else
try
{
String base = getBaseUrl();
if(null == base)
base = getUrl();
if(null == base)
{
ret = link;
} else
{
URL url = constructUrl(link, base, strict);
ret = url.toExternalForm();
}
}
catch(MalformedURLException murle)
{
ret = link;
}
return ret;
}
public int row(Cursor cursor)
{
return mIndex.row(cursor);
}
public int row(int position)
{
return mIndex.row(position);
}
public int column(Cursor cursor)
{
return mIndex.column(cursor);
}
public int column(int position)
{
return mIndex.column(position);
}
public String getText(int start, int end)
throws IllegalArgumentException
{
String ret;
try
{
ret = mSource.getString(start, end - start);
}
catch(IOException ioe)
{
throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
}
return ret;
}
public void getText(StringBuffer buffer, int start, int end)
throws IllegalArgumentException
{
if(mSource.offset() < start || mSource.offset() < end)
throw new IllegalArgumentException("attempt to extract future characters from source" + start + "|" + end + " > " + mSource.offset());
int length;
if(end < start)
{
length = end;
end = start;
start = length;
}
length = end - start;
try
{
mSource.getCharacters(buffer, start, length);
}
catch(IOException ioe)
{
throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
}
}
public String getText()
{
return getText(0, mSource.offset());
}
public void getText(StringBuffer buffer)
{
getText(buffer, 0, mSource.offset());
}
public void getText(char array[], int offset, int start, int end)
throws IllegalArgumentException
{
if(mSource.offset() < start || mSource.offset() < end)
throw new IllegalArgumentException("attempt to extract future characters from source");
int length;
if(end < start)
{
length = end;
end = start;
start = length;
}
length = end - start;
try
{
mSource.getCharacters(array, offset, start, end);
}
catch(IOException ioe)
{
throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
}
}
/**
 * Returns the text of the line that contains the given cursor.
 *
 * @param cursor the position whose line is wanted
 * @return the text between the line's start offset and either the next
 *         recorded line start or the current end of the source
 */
public String getLine(Cursor cursor)
{
    int line = row(cursor);
    int size = mIndex.size();
    int start;
    int end;
    if(line < size)
    {
        start = mIndex.elementAt(line);
        // Strictly less than: the entry after the last one does not exist,
        // so the last indexed line runs to the end of what has been read.
        if(++line < size)
            end = mIndex.elementAt(line);
        else
            end = mSource.offset();
    }
    else
    {
        // NOTE(review): with an empty index this asks for elementAt(-1);
        // confirm how PageIndex handles that case.
        start = mIndex.elementAt(line - 1);
        end = mSource.offset();
    }
    return getText(start, end);
}
public String getLine(int position)
{
return getLine(new Cursor(this, position));
}
public String toString()
{
String ret;
if(mSource.offset() > 0)
{
StringBuffer buffer = new StringBuffer(43);
int start = mSource.offset() - 40;
if(0 > start)
start = 0;
else
buffer.append("...");
getText(buffer, start, mSource.offset());
ret = buffer.toString();
} else
{
ret = super.toString();
}
return ret;
}
public static final String DEFAULT_CHARSET = "ISO-8859-1";
public static String QICHAODEFAULT_CHARSET = "gb2312";
public static final String DEFAULT_CONTENT_TYPE = "text/html";
public static final char EOF = 65535;
protected String mUrl;
protected String mBaseUrl;
protected Source mSource;
protected PageIndex mIndex;
protected transient URLConnection mConnection;
protected static ConnectionManager mConnectionManager = new ConnectionManager();
public static String getQICHAODEFAULT_CHARSET() {
return QICHAODEFAULT_CHARSET;
}
}
在调用的时候,代码如下:
Java代码
// Open the page at url, then tell the parser to use the charset currently
// recorded as Page's default. getQICHAODEFAULT_CHARSET() is a static accessor
// on Page; it is reached here through the page instance of the lexer
// (presumably getPage() returns that Page - confirm against the Lexer API).
Parser parser = new Parser(url);
parser.setEncoding(parser.getLexer().getPage().getQICHAODEFAULT_CHARSET());
Parser parser = new Parser(url);
parser.setEncoding(parser.getLexer().getPage().getQICHAODEFAULT_CHARSET()); 一般情况下,这样设置应该是没问题的。但是,你在页面上看到的编码方式并不一定就是该网页实际使用的编码方式。比如说,页面源码中有<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />,htmlparser可以正常地获取到它,代码如下:
Java代码
/**
 * Applies the character set declared by this tag to the page.
 *
 * <p>Nothing happens unless the tag's http-equiv value equals
 * "Content-Type" (compared without regard to case); in that case the
 * charset is derived from the CONTENT attribute and handed to the page.
 *
 * @throws ParserException if the page rejects the encoding change
 */
public void doSemanticAction()
    throws ParserException
{
    if(!"Content-Type".equalsIgnoreCase(getHttpEquiv()))
        return;
    String declared = getPage().getCharset(getAttribute("CONTENT"));
    getPage().setEncoding(declared);
}
public void doSemanticAction()
throws ParserException
{
String httpEquiv = getHttpEquiv();
if("Content-Type".equalsIgnoreCase(httpEquiv))
{
String charset = getPage().getCharset(getAttribute("CONTENT"));
getPage().setEncoding(charset);
}
} 但是,不要认为meta标签里声明的就一定是网页的编码方式。htmlparser还会再做一次判断:在Page类中,有一个方法用来获取HTTP响应头的Content-Type字段。代码如下:
Java代码
/**
 * Reports the content type of this page.
 *
 * @return the value of the connection's "Content-Type" header field, or
 *         "text/html" when there is no connection or no such header
 */
public String getContentType()
{
    URLConnection link = getConnection();
    String header = (null == link) ? null : link.getHeaderField("Content-Type");
    return (null == header) ? "text/html" : header;
}
public String getContentType()
{
String ret = "text/html";
URLConnection connection = getConnection();
if(null != connection)
{
String content = connection.getHeaderField("Content-Type");
if(null != content)
ret = content;
}
return ret;
} 两者进行比较,如果不一样,它就会抛出下面的异常:
Java代码
org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old: [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23
org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old: [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23。所以,我在InputStreamSource类中又做了一次修改:
代码如下:
Java代码
/**
 * Requests a change of the character set used to decode the underlying stream.
 *
 * <p>Workaround dated 2008-12-23: while this source already reports an
 * encoding via {@link #getEncoding()}, the request is ignored, so a charset
 * declared later in the document never triggers a re-decode (and therefore
 * never an <code>EncodingChangeException</code>). The stream is re-decoded
 * only when no encoding is in effect yet.
 *
 * @param character_set the requested character set name; ignored when an
 *        encoding is already in effect or when it is <code>null</code>
 * @throws ParserException if the stream cannot be reset or re-read, or if
 *         the characters consumed so far differ under the new encoding
 */
public void setEncoding(String character_set)
    throws ParserException
{
    String encoding = getEncoding();
    // An existing encoding always wins; a null request has nothing to apply.
    if(null != encoding || null == character_set)
        return;
    InputStream stream = getStream();
    try
    {
        // Remember what was decoded so far so it can be compared afterwards.
        char buffer[] = mBuffer;
        int offset = mOffset;
        stream.reset();
        try
        {
            mEncoding = character_set;
            mReader = new InputStreamReader(stream, character_set);
            mBuffer = new char[mBuffer.length];
            mLevel = 0;
            mOffset = 0;
            mMark = -1;
            if(0 != offset)
            {
                // Re-read the already consumed prefix with the new decoder.
                char new_chars[] = new char[offset];
                if(offset != read(new_chars))
                    throw new ParserException("reset stream failed");
                for(int i = 0; i < offset; i++)
                    if(new_chars[i] != buffer[i])
                        throw new EncodingChangeException("character mismatch (new: " + new_chars[i] + " [0x" + Integer.toString(new_chars[i], 16) + "] != old: " + buffer[i] + " [0x" + Integer.toString(buffer[i], 16) + "]) for encoding change from " + encoding + " to " + character_set + " at character offset " + i);
            }
        }
        catch(IOException ioe)
        {
            throw new ParserException(ioe.getMessage(), ioe);
        }
    }
    catch(IOException ioe)
    {
        throw new ParserException("Stream reset failed (" + ioe.getMessage() + "), try wrapping it with a org.htmlparser.lexer.Stream", ioe);
    }
}
/**
 * Requests a change of the character set used to decode the underlying stream.
 *
 * <p>Workaround dated 2008-12-23: while this source already reports an
 * encoding via {@link #getEncoding()}, the request is ignored, so a charset
 * declared later in the document never triggers a re-decode (and therefore
 * never an <code>EncodingChangeException</code>). The stream is re-decoded
 * only when no encoding is in effect yet.
 *
 * @param character_set the requested character set name; ignored when an
 *        encoding is already in effect or when it is <code>null</code>
 * @throws ParserException if the stream cannot be reset or re-read, or if
 *         the characters consumed so far differ under the new encoding
 */
public void setEncoding(String character_set)
    throws ParserException
{
    String encoding = getEncoding();
    // An existing encoding always wins; a null request has nothing to apply.
    if(null != encoding || null == character_set)
        return;
    InputStream stream = getStream();
    try
    {
        // Remember what was decoded so far so it can be compared afterwards.
        char buffer[] = mBuffer;
        int offset = mOffset;
        stream.reset();
        try
        {
            mEncoding = character_set;
            mReader = new InputStreamReader(stream, character_set);
            mBuffer = new char[mBuffer.length];
            mLevel = 0;
            mOffset = 0;
            mMark = -1;
            if(0 != offset)
            {
                // Re-read the already consumed prefix with the new decoder.
                char new_chars[] = new char[offset];
                if(offset != read(new_chars))
                    throw new ParserException("reset stream failed");
                for(int i = 0; i < offset; i++)
                    if(new_chars[i] != buffer[i])
                        throw new EncodingChangeException("character mismatch (new: " + new_chars[i] + " [0x" + Integer.toString(new_chars[i], 16) + "] != old: " + buffer[i] + " [0x" + Integer.toString(buffer[i], 16) + "]) for encoding change from " + encoding + " to " + character_set + " at character offset " + i);
            }
        }
        catch(IOException ioe)
        {
            throw new ParserException(ioe.getMessage(), ioe);
        }
    }
    catch(IOException ioe)
    {
        throw new ParserException("Stream reset failed (" + ioe.getMessage() + "), try wrapping it with a org.htmlparser.lexer.Stream", ioe);
    }
} 这样应该来说,不管什么方式都OK的。
评论: 0 | 引用: 0 | 查看次数: 498
发表评论