
htmlparser 编码问题

     有时候,在抓取网站的时候,各个网页的编码方式可能不统一。在这种情况下,有些网页无法按正确的编码解码,htmlparser就会报错,不能正常地读取页面内容。抛出来的异常为:org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old:  [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23

为了做到不管网页用的是何种编码方式,都能够正常读取数据,我在htmlparser的Page类中加了一个字段(QICHAODEFAULT_CHARSET)。之所以要加在Page类中,是因为htmlparser在遍历所有标签的过程中,已经捕获到了meta标签的content属性,并把值传给了setEncoding(String character_set)方法。修改后的Page类如下:


// Decompiled by Jad v1.5.8f. Copyright 2001 Pavel Kouznetsov.  
// Jad home page: http://www.kpdus.com/jad.html  
// Decompiler options: packimports(3)    
// Source File Name:   Page.java  
package org.htmlparser.lexer;  
import java.io.*;  
import java.lang.reflect.InvocationTargetException;  
import java.lang.reflect.Method;  
import java.net.*;  
import java.util.zip.*;  
import org.htmlparser.http.ConnectionManager;  
import org.htmlparser.util.ParserException;  
// Referenced classes of package org.htmlparser.lexer:  
//            InputStreamSource, PageIndex, StringSource, Cursor,    
//            Stream, Source  
public class Page  
    implements Serializable  
    public Page()  
    public Page(URLConnection connection)  
        throws ParserException  
        if(null == connection)  
            throw new IllegalArgumentException("connection cannot be null");  
        } else  
            mBaseUrl = null;  
    public Page(InputStream stream, String charset)  
        throws UnsupportedEncodingException  
        if(null == stream)  
            throw new IllegalArgumentException("stream cannot be null");  
        if(null == charset)  
            charset = "ISO-8859-1";  
        mSource = new InputStreamSource(stream, charset);  
        mIndex = new PageIndex(this);  
        mConnection = null;  
        mUrl = null;  
        mBaseUrl = null;  
    public Page(String text, String charset)  
        if(null == text)  
            throw new IllegalArgumentException("text cannot be null");  
        if(null == charset)  
            charset = "ISO-8859-1";  
        mSource = new StringSource(text, charset);  
        mIndex = new PageIndex(this);  
        mConnection = null;  
        mUrl = null;  
        mBaseUrl = null;  
    public Page(String text)  
        this(text, null);  
    public Page(Source source)  
        if(null == source)  
            throw new IllegalArgumentException("source cannot be null");  
        } else  
            mSource = source;  
            mIndex = new PageIndex(this);  
            mConnection = null;  
            mUrl = null;  
            mBaseUrl = null;  
    public static ConnectionManager getConnectionManager()  
        return mConnectionManager;  
    public static void setConnectionManager(ConnectionManager manager)  
        mConnectionManager = manager;  
    public String getCharset(String content)  
        String CHARSET_STRING = "charset";  
        String ret;  
        if(null == mSource)  
            ret = "ISO-8859-1";  
            ret = mSource.getEncoding();  
        if(null != content)  
            int index = content.indexOf("charset");  
            if(index != -1)  
                content = content.substring(index + "charset".length()).trim();  
                    content = content.substring(1).trim();  
                    index = content.indexOf(";");  
                    if(index != -1)  
                        content = content.substring(0, index);  
                    if(content.startsWith("\"") && content.endsWith("\"") && 1 < content.length())  
                        content = content.substring(1, content.length() - 1);  
                    if(content.startsWith("'") && content.endsWith("'") && 1 < content.length())  
                        content = content.substring(1, content.length() - 1);  
                    ret = findCharset(content, ret);  
        return ret;  
    public static String findCharset(String name, String fallback)  
        String ret;  
            Class cls = Class.forName("java.nio.charset.Charset");  
            Method method = cls.getMethod("forName", new Class[] {  
            Object object = method.invoke(null, new Object[] {  
            method = cls.getMethod("name", new Class[0]);  
            object = method.invoke(object, new Object[0]);  
            ret = (String)object;  
        catch(ClassNotFoundException cnfe)  
            ret = name;  
        catch(NoSuchMethodException nsme)  
            ret = name;  
        catch(IllegalAccessException ia)  
            ret = name;  
        catch(InvocationTargetException ita)  
            ret = fallback;  
            System.out.println("unable to determine cannonical charset name for " + name + " - using " + fallback);  
        return ret;  
    private void writeObject(ObjectOutputStream out)  
        throws IOException  
        if(null != getConnection())  
            String href = getUrl();  
            Source source = getSource();  
            mSource = null;  
            PageIndex index = mIndex;  
            mIndex = null;  
            mSource = source;  
            mIndex = index;  
        } else  
            String href = getUrl();  
    private void readObject(ObjectInputStream in)  
        throws IOException, ClassNotFoundException  
        boolean fromurl = in.readBoolean();  
            int offset = in.readInt();  
            String href = (String)in.readObject();  
            if(null != getUrl())  
                URL url = new URL(getUrl());  
                catch(ParserException pe)  
                    throw new IOException(pe.getMessage());  
            Cursor cursor = new Cursor(this, 0);  
            for(int i = 0; i < offset; i++)  
                catch(ParserException pe)  
                    throw new IOException(pe.getMessage());  
        } else  
            String href = (String)in.readObject();  
    public void reset()  
        mIndex = new PageIndex(this);  
    public void close()  
        throws IOException  
        if(null != getSource())  
    protected void finalize()  
        throws Throwable  
    public URLConnection getConnection()  
        return mConnection;  
    public void setConnection(URLConnection connection)  
        throws ParserException  
        mConnection = connection;  
        catch(UnknownHostException uhe)  
            throw new ParserException("Connect to " + mConnection.getURL().toExternalForm() + " failed.", uhe);  
        catch(IOException ioe)  
            throw new ParserException("Exception connecting to " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);  
        String type = getContentType();  
        String charset = getCharset(type);  
            String contentEncoding = connection.getContentEncoding();  
            Stream stream;  
            if(null != contentEncoding && -1 != contentEncoding.indexOf("gzip"))  
                stream = new Stream(new GZIPInputStream(getConnection().getInputStream()));  
            if(null != contentEncoding && -1 != contentEncoding.indexOf("deflate"))  
                stream = new Stream(new InflaterInputStream(getConnection().getInputStream(), new Inflater(true)));  
                stream = new Stream(getConnection().getInputStream());  
                 * 时间:2008年12月23日  
                 * 原因:当String charset = getCharset(type);返回来的是ISO-8859-1的时候,需要处理一下  
                    charset = getQICHAODEFAULT_CHARSET();  
         mSource = new InputStreamSource(stream, charset);  
            catch(UnsupportedEncodingException uee)  
                charset = "ISO-8859-1";  
                mSource = new InputStreamSource(stream, charset);  
        catch(IOException ioe)  
            throw new ParserException("Exception getting input stream from " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);  
        mUrl = connection.getURL().toExternalForm();  
        mIndex = new PageIndex(this);  
    public String getUrl()  
        return mUrl;  
    public void setUrl(String url)  
        mUrl = url;  
    public String getBaseUrl()  
        return mBaseUrl;  
    public void setBaseUrl(String url)  
        mBaseUrl = url;  
    public Source getSource()  
        return mSource;  
    public String getContentType()  
        String ret = "text/html";  
        URLConnection connection = getConnection();  
        if(null != connection)  
            String content = connection.getHeaderField("Content-Type");  
            if(null != content)  
                ret = content;  
        return ret;  
    public char getCharacter(Cursor cursor)  
        throws ParserException  
        int i = cursor.getPosition();  
        int offset = mSource.offset();  
        char ret;  
        if(offset == i)  
                i = mSource.read();  
                if(-1 == i)  
                    ret = '\uFFFF';  
                } else  
                    ret = (char)i;  
            catch(IOException ioe)  
                throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);  
        if(offset > i)  
                ret = mSource.getCharacter(i);  
            catch(IOException ioe)  
                throw new ParserException("can't read a character at position " + i, ioe);  
        } else  
            throw new ParserException("attempt to read future characters from source " + i + " > " + mSource.offset());  
        if('\r' == ret)  
            ret = '\n';  
            if(mSource.offset() == cursor.getPosition())  
                    i = mSource.read();  
                    if(-1 != i)  
                        if('\n' == (char)i)  
                            catch(IOException ioe)  
                                throw new ParserException("can't unread a character at position " + cursor.getPosition(), ioe);  
                catch(IOException ioe)  
                    throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);  
                    if('\n' == mSource.getCharacter(cursor.getPosition()))  
                catch(IOException ioe)  
                    throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);  
        if('\n' == ret)  
        return ret;  
    public void ungetCharacter(Cursor cursor)  
        throws ParserException  
        int i = cursor.getPosition();  
            char ch = mSource.getCharacter(i);  
            if('\n' == ch && 0 != i)  
                ch = mSource.getCharacter(i - 1);  
                if('\r' == ch)  
        catch(IOException ioe)  
            throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);  
    public String getEncoding()  
        return getSource().getEncoding();  
    public void setEncoding(String character_set)  
        throws ParserException  
        this.QICHAODEFAULT_CHARSET = character_set;  
    public URL constructUrl(String link, String base)  
        throws MalformedURLException  
        return constructUrl(link, base, false);  
    public URL constructUrl(String link, String base, boolean strict)  
        throws MalformedURLException  
        int index;  
        URL url;  
        if(!strict && '?' == link.charAt(0))  
            if(-1 != (index = base.lastIndexOf('?')))  
                base = base.substring(0, index);  
            url = new URL(base + link);  
        } else  
            url = new URL(new URL(base), link);  
        String path = url.getFile();  
        boolean modified = false;  
        boolean absolute = link.startsWith("/");  
                    path = path.substring(3);  
                    modified = true;  
                if(!path.startsWith("/./") && !path.startsWith("/."))  
                path = path.substring(2);  
                modified = true;  
            } while(true);  
        while(-1 != (index = path.indexOf("/\\")))    
            path = path.substring(0, index + 1) + path.substring(index + 2);  
            modified = true;  
            url = new URL(url, path);  
        return url;  
    public String getAbsoluteURL(String link)  
        return getAbsoluteURL(link, false);  
    public String getAbsoluteURL(String link, boolean strict)  
        String ret;  
        if(null == link || "".equals(link))  
            ret = "";  
                String base = getBaseUrl();  
                if(null == base)  
                    base = getUrl();  
                if(null == base)  
                    ret = link;  
                } else  
                    URL url = constructUrl(link, base, strict);  
                    ret = url.toExternalForm();  
            catch(MalformedURLException murle)  
                ret = link;  
        return ret;  
    public int row(Cursor cursor)  
        return mIndex.row(cursor);  
    public int row(int position)  
        return mIndex.row(position);  
    public int column(Cursor cursor)  
        return mIndex.column(cursor);  
    public int column(int position)  
        return mIndex.column(position);  
    public String getText(int start, int end)  
        throws IllegalArgumentException  
        String ret;  
            ret = mSource.getString(start, end - start);  
        catch(IOException ioe)  
            throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());  
        return ret;  
    public void getText(StringBuffer buffer, int start, int end)  
        throws IllegalArgumentException  
        if(mSource.offset() < start || mSource.offset() < end)  
            throw new IllegalArgumentException("attempt to extract future characters from source" + start + "|" + end + " > " + mSource.offset());  
        int length;  
        if(end < start)  
            length = end;  
            end = start;  
            start = length;  
        length = end - start;  
            mSource.getCharacters(buffer, start, length);  
        catch(IOException ioe)  
            throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());  
    public String getText()  
        return getText(0, mSource.offset());  
    public void getText(StringBuffer buffer)  
        getText(buffer, 0, mSource.offset());  
    public void getText(char array[], int offset, int start, int end)  
        throws IllegalArgumentException  
        if(mSource.offset() < start || mSource.offset() < end)  
            throw new IllegalArgumentException("attempt to extract future characters from source");  
        int length;  
        if(end < start)  
            length = end;  
            end = start;  
            start = length;  
        length = end - start;  
            mSource.getCharacters(array, offset, start, end);  
        catch(IOException ioe)  
            throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());  
    public String getLine(Cursor cursor)  
        int line = row(cursor);  
        int size = mIndex.size();  
        int start;  
        int end;  
        if(line < size)  
            start = mIndex.elementAt(line);  
            if(++line <= size)  
                end = mIndex.elementAt(line);  
                end = mSource.offset();  
        } else  
            start = mIndex.elementAt(line - 1);  
            end = mSource.offset();  
        return getText(start, end);  
    public String getLine(int position)  
        return getLine(new Cursor(this, position));  
    public String toString()  
        String ret;  
        if(mSource.offset() > 0)  
            StringBuffer buffer = new StringBuffer(43);  
            int start = mSource.offset() - 40;  
            if(0 > start)  
                start = 0;  
            getText(buffer, start, mSource.offset());  
            ret = buffer.toString();  
        } else  
            ret = super.toString();  
        return ret;  
    public static final String DEFAULT_CHARSET = "ISO-8859-1";  
    public static String QICHAODEFAULT_CHARSET = "gb2312";  
    public static final String DEFAULT_CONTENT_TYPE = "text/html";  
    public static final char EOF = 65535;  
    protected String mUrl;  
    protected String mBaseUrl;  
    protected Source mSource;  
    protected PageIndex mIndex;  
    protected transient URLConnection mConnection;  
    protected static ConnectionManager mConnectionManager = new ConnectionManager();  
    public static String getQICHAODEFAULT_CHARSET() {  

// Decompiled by Jad v1.5.8f. Copyright 2001 Pavel Kouznetsov.
// Jad home page: http://www.kpdus.com/jad.html
// Decompiler options: packimports(3)
// Source File Name:   Page.java

package org.htmlparser.lexer;

import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.*;
import java.util.zip.*;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.util.ParserException;

// Referenced classes of package org.htmlparser.lexer:
//            InputStreamSource, PageIndex, StringSource, Cursor,
//            Stream, Source

public class Page
    implements Serializable

    public Page()

    public Page(URLConnection connection)
        throws ParserException
        if(null == connection)
            throw new IllegalArgumentException("connection cannot be null");
        } else
            mBaseUrl = null;

    public Page(InputStream stream, String charset)
        throws UnsupportedEncodingException
        if(null == stream)
            throw new IllegalArgumentException("stream cannot be null");
        if(null == charset)
            charset = "ISO-8859-1";
        mSource = new InputStreamSource(stream, charset);
        mIndex = new PageIndex(this);
        mConnection = null;
        mUrl = null;
        mBaseUrl = null;

    public Page(String text, String charset)
        if(null == text)
            throw new IllegalArgumentException("text cannot be null");
        if(null == charset)
            charset = "ISO-8859-1";
        mSource = new StringSource(text, charset);
        mIndex = new PageIndex(this);
        mConnection = null;
        mUrl = null;
        mBaseUrl = null;

    public Page(String text)
        this(text, null);

    public Page(Source source)
        if(null == source)
            throw new IllegalArgumentException("source cannot be null");
        } else
            mSource = source;
            mIndex = new PageIndex(this);
            mConnection = null;
            mUrl = null;
            mBaseUrl = null;

    public static ConnectionManager getConnectionManager()
        return mConnectionManager;

    public static void setConnectionManager(ConnectionManager manager)
        mConnectionManager = manager;

    public String getCharset(String content)
        String CHARSET_STRING = "charset";
        String ret;
        if(null == mSource)
            ret = "ISO-8859-1";
            ret = mSource.getEncoding();
        if(null != content)
            int index = content.indexOf("charset");
            if(index != -1)
                content = content.substring(index + "charset".length()).trim();
                    content = content.substring(1).trim();
                    index = content.indexOf(";");
                    if(index != -1)
                        content = content.substring(0, index);
                    if(content.startsWith("\"") && content.endsWith("\"") && 1 < content.length())
                        content = content.substring(1, content.length() - 1);
                    if(content.startsWith("'") && content.endsWith("'") && 1 < content.length())
                        content = content.substring(1, content.length() - 1);
                    ret = findCharset(content, ret);
        return ret;

    public static String findCharset(String name, String fallback)
        String ret;
            Class cls = Class.forName("java.nio.charset.Charset");
            Method method = cls.getMethod("forName", new Class[] {
            Object object = method.invoke(null, new Object[] {
            method = cls.getMethod("name", new Class[0]);
            object = method.invoke(object, new Object[0]);
            ret = (String)object;
        catch(ClassNotFoundException cnfe)
            ret = name;
        catch(NoSuchMethodException nsme)
            ret = name;
        catch(IllegalAccessException ia)
            ret = name;
        catch(InvocationTargetException ita)
            ret = fallback;
            System.out.println("unable to determine cannonical charset name for " + name + " - using " + fallback);
        return ret;

    private void writeObject(ObjectOutputStream out)
        throws IOException
        if(null != getConnection())
            String href = getUrl();
            Source source = getSource();
            mSource = null;
            PageIndex index = mIndex;
            mIndex = null;
            mSource = source;
            mIndex = index;
        } else
            String href = getUrl();

    private void readObject(ObjectInputStream in)
        throws IOException, ClassNotFoundException
        boolean fromurl = in.readBoolean();
            int offset = in.readInt();
            String href = (String)in.readObject();
            if(null != getUrl())
                URL url = new URL(getUrl());
                catch(ParserException pe)
                    throw new IOException(pe.getMessage());
            Cursor cursor = new Cursor(this, 0);
            for(int i = 0; i < offset; i++)
                catch(ParserException pe)
                    throw new IOException(pe.getMessage());

        } else
            String href = (String)in.readObject();

    public void reset()
        mIndex = new PageIndex(this);

    public void close()
        throws IOException
        if(null != getSource())

    protected void finalize()
        throws Throwable

    public URLConnection getConnection()
        return mConnection;

    public void setConnection(URLConnection connection)
        throws ParserException
        mConnection = connection;
        catch(UnknownHostException uhe)
            throw new ParserException("Connect to " + mConnection.getURL().toExternalForm() + " failed.", uhe);
        catch(IOException ioe)
            throw new ParserException("Exception connecting to " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
        String type = getContentType();
        String charset = getCharset(type);
            String contentEncoding = connection.getContentEncoding();
            Stream stream;
            if(null != contentEncoding && -1 != contentEncoding.indexOf("gzip"))
                stream = new Stream(new GZIPInputStream(getConnection().getInputStream()));
            if(null != contentEncoding && -1 != contentEncoding.indexOf("deflate"))
                stream = new Stream(new InflaterInputStream(getConnection().getInputStream(), new Inflater(true)));
                stream = new Stream(getConnection().getInputStream());

                 * 时间:2008年12月23日
                 * 原因:当String charset = getCharset(type);返回来的是ISO-8859-1的时候,需要处理一下
                    charset = getQICHAODEFAULT_CHARSET();
         mSource = new InputStreamSource(stream, charset);
            catch(UnsupportedEncodingException uee)
                charset = "ISO-8859-1";
                mSource = new InputStreamSource(stream, charset);
        catch(IOException ioe)
            throw new ParserException("Exception getting input stream from " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
        mUrl = connection.getURL().toExternalForm();
        mIndex = new PageIndex(this);

    public String getUrl()
        return mUrl;

    public void setUrl(String url)
        mUrl = url;

    public String getBaseUrl()
        return mBaseUrl;

    public void setBaseUrl(String url)
        mBaseUrl = url;

    public Source getSource()
        return mSource;

    public String getContentType()
        String ret = "text/html";
        URLConnection connection = getConnection();
        if(null != connection)
            String content = connection.getHeaderField("Content-Type");
            if(null != content)
                ret = content;
        return ret;

    public char getCharacter(Cursor cursor)
        throws ParserException
        int i = cursor.getPosition();
        int offset = mSource.offset();
        char ret;
        if(offset == i)
                i = mSource.read();
                if(-1 == i)
                    ret = '\uFFFF';
                } else
                    ret = (char)i;
            catch(IOException ioe)
                throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
        if(offset > i)
                ret = mSource.getCharacter(i);
            catch(IOException ioe)
                throw new ParserException("can't read a character at position " + i, ioe);
        } else
            throw new ParserException("attempt to read future characters from source " + i + " > " + mSource.offset());
        if('\r' == ret)
            ret = '\n';
            if(mSource.offset() == cursor.getPosition())
                    i = mSource.read();
                    if(-1 != i)
                        if('\n' == (char)i)
                            catch(IOException ioe)
                                throw new ParserException("can't unread a character at position " + cursor.getPosition(), ioe);
                catch(IOException ioe)
                    throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
                    if('\n' == mSource.getCharacter(cursor.getPosition()))
                catch(IOException ioe)
                    throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
        if('\n' == ret)
        return ret;

    public void ungetCharacter(Cursor cursor)
        throws ParserException
        int i = cursor.getPosition();
            char ch = mSource.getCharacter(i);
            if('\n' == ch && 0 != i)
                ch = mSource.getCharacter(i - 1);
                if('\r' == ch)
        catch(IOException ioe)
            throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);

    public String getEncoding()
        return getSource().getEncoding();

    public void setEncoding(String character_set)
        throws ParserException
        this.QICHAODEFAULT_CHARSET = character_set;

    public URL constructUrl(String link, String base)
        throws MalformedURLException
        return constructUrl(link, base, false);

    public URL constructUrl(String link, String base, boolean strict)
        throws MalformedURLException
        int index;
        URL url;
        if(!strict && '?' == link.charAt(0))
            if(-1 != (index = base.lastIndexOf('?')))
                base = base.substring(0, index);
            url = new URL(base + link);
        } else
            url = new URL(new URL(base), link);
        String path = url.getFile();
        boolean modified = false;
        boolean absolute = link.startsWith("/");
                    path = path.substring(3);
                    modified = true;
                if(!path.startsWith("/./") && !path.startsWith("/."))
                path = path.substring(2);
                modified = true;
            } while(true);
        while(-1 != (index = path.indexOf("/\\")))
            path = path.substring(0, index + 1) + path.substring(index + 2);
            modified = true;
            url = new URL(url, path);
        return url;

    public String getAbsoluteURL(String link)
        return getAbsoluteURL(link, false);

    public String getAbsoluteURL(String link, boolean strict)
        String ret;
        if(null == link || "".equals(link))
            ret = "";
                String base = getBaseUrl();
                if(null == base)
                    base = getUrl();
                if(null == base)
                    ret = link;
                } else
                    URL url = constructUrl(link, base, strict);
                    ret = url.toExternalForm();
            catch(MalformedURLException murle)
                ret = link;
        return ret;

    public int row(Cursor cursor)
        return mIndex.row(cursor);

    public int row(int position)
        return mIndex.row(position);

    public int column(Cursor cursor)
        return mIndex.column(cursor);

    public int column(int position)
        return mIndex.column(position);

    public String getText(int start, int end)
        throws IllegalArgumentException
        String ret;
            ret = mSource.getString(start, end - start);
        catch(IOException ioe)
            throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
        return ret;

    public void getText(StringBuffer buffer, int start, int end)
        throws IllegalArgumentException
        if(mSource.offset() < start || mSource.offset() < end)
            throw new IllegalArgumentException("attempt to extract future characters from source" + start + "|" + end + " > " + mSource.offset());
        int length;
        if(end < start)
            length = end;
            end = start;
            start = length;
        length = end - start;
            mSource.getCharacters(buffer, start, length);
        catch(IOException ioe)
            throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());

    public String getText()
        return getText(0, mSource.offset());

    public void getText(StringBuffer buffer)
        getText(buffer, 0, mSource.offset());

    public void getText(char array[], int offset, int start, int end)
        throws IllegalArgumentException
        if(mSource.offset() < start || mSource.offset() < end)
            throw new IllegalArgumentException("attempt to extract future characters from source");
        int length;
        if(end < start)
            length = end;
            end = start;
            start = length;
        length = end - start;
            mSource.getCharacters(array, offset, start, end);
        catch(IOException ioe)
            throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());

    public String getLine(Cursor cursor)
        int line = row(cursor);
        int size = mIndex.size();
        int start;
        int end;
        if(line < size)
            start = mIndex.elementAt(line);
            if(++line <= size)
                end = mIndex.elementAt(line);
                end = mSource.offset();
        } else
            start = mIndex.elementAt(line - 1);
            end = mSource.offset();
        return getText(start, end);

    public String getLine(int position)
        return getLine(new Cursor(this, position));

    public String toString()
        String ret;
        if(mSource.offset() > 0)
            StringBuffer buffer = new StringBuffer(43);
            int start = mSource.offset() - 40;
            if(0 > start)
                start = 0;
            getText(buffer, start, mSource.offset());
            ret = buffer.toString();
        } else
            ret = super.toString();
        return ret;

    public static final String DEFAULT_CHARSET = "ISO-8859-1";
    public static String QICHAODEFAULT_CHARSET = "gb2312";
    public static final String DEFAULT_CONTENT_TYPE = "text/html";
    public static final char EOF = 65535;
    protected String mUrl;
    protected String mBaseUrl;
    protected Source mSource;
    protected PageIndex mIndex;
    protected transient URLConnection mConnection;
    protected static ConnectionManager mConnectionManager = new ConnectionManager();
    public static String getQICHAODEFAULT_CHARSET() {


Parser parser = new Parser(url);  

            Parser parser = new Parser(url);
            parser.setEncoding(parser.getLexer().getPage().getQICHAODEFAULT_CHARSET());
一般情况下,这样设置应该是没有问题的。但是,你有时候看到的编码方式并不一定是该网页实际的编码方式。比如说,肉眼看到页面中有<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />,htmlparser也可以正常地获取到它,代码如下:

// Excerpt of the tag's semantic action (presumably from the meta tag class):
// reads the tag's http-equiv value and passes the CONTENT attribute to the
// page's getCharset().
// NOTE(review): the excerpt is incomplete — braces and the statements that
// use httpEquiv and charset are not shown.
public void doSemanticAction()  
    throws ParserException  
    String httpEquiv = getHttpEquiv();  
        String charset = getPage().getCharset(getAttribute("CONTENT"));  

    // Excerpt of the tag's semantic action (presumably from the meta tag class):
    // reads the tag's http-equiv value and passes the CONTENT attribute to the
    // page's getCharset().
    // NOTE(review): the excerpt is incomplete — braces and the statements that
    // use httpEquiv and charset are not shown.
    public void doSemanticAction()
        throws ParserException
        String httpEquiv = getHttpEquiv();
            String charset = getPage().getCharset(getAttribute("CONTENT"));
    } 但是,不要以为这就是该网页的编码方式。htmlparser还会再做一次判断:类Page中有一个方法用来获取响应头字段Content-Type,代码如下:

// Returns the content type of this page: the connection's "Content-Type"
// header field when a connection exists and the header is present,
// otherwise "text/html".
// NOTE(review): braces are missing from this excerpt.
public String getContentType()  
    String ret = "text/html";  
    URLConnection connection = getConnection();  
    if(null != connection)  
        String content = connection.getHeaderField("Content-Type");  
        // Only override the default when the header is actually present.
        if(null != content)  
            ret = content;  
    return ret;  

    // Returns the content type of this page: the connection's "Content-Type"
    // header field when a connection exists and the header is present,
    // otherwise "text/html".
    // NOTE(review): the opening brace is missing from this excerpt.
    public String getContentType()
        String ret = "text/html";
        URLConnection connection = getConnection();
        if(null != connection)
            String content = connection.getHeaderField("Content-Type");
            // Only override the default when the header is actually present.
            if(null != content)
                ret = content;
        return ret;
    } htmlparser会将这两者进行比较,如果不一样,就会抛出下面的异常:

org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old:  [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23  

org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old:  [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23 。所以,我在类InputStreamSource中又做了一次修改:


// Switches the character set used to decode the underlying stream, then
// re-reads the characters consumed so far and checks that they decode the
// same as before.
// Throws ParserException when the re-read comes up short or an IOException
// occurs, and EncodingChangeException when a re-read character differs.
// NOTE(review): this excerpt is lossy — braces, the try lines, at least one
// condition and the comment delimiters around the "time:" line are missing.
public void setEncoding(String character_set)  
    throws ParserException  
    String encoding = getEncoding();  
     * time:2008年12月23日  
        // Author's modification: replaces the requested charset with the
        // encoding already in effect. NOTE(review): any condition guarding
        // this assignment is not visible here.
        character_set = encoding;  
        InputStream stream = getStream();  
            // Keep the already-decoded characters and the read position for
            // the comparison below.
            char buffer[] = mBuffer;  
            int offset = mOffset;  
                // Restart decoding with a new reader and cleared buffer state.
                mEncoding = character_set;  
                mReader = new InputStreamReader(stream, character_set);  
                mBuffer = new char[mBuffer.length];  
                mLevel = 0;  
                mOffset = 0;  
                mMark = -1;  
                // Re-read as many characters as had been consumed and compare
                // them one by one with the previous decode.
                if(0 != offset)  
                    char new_chars[] = new char[offset];  
                    if(offset != read(new_chars))  
                        throw new ParserException("reset stream failed");  
                    for(int i = 0; i < offset; i++)  
                        if(new_chars[i] != buffer[i])  
                            throw new EncodingChangeException("character mismatch (new: " + new_chars[i] + " [0x" + Integer.toString(new_chars[i], 16) + "] != old: " + " [0x" + Integer.toString(buffer[i], 16) + buffer[i] + "]) for encoding change from " + encoding + " to " + character_set + " at character offset " + i);  
            catch(IOException ioe)  
                throw new ParserException(ioe.getMessage(), ioe);  
        catch(IOException ioe)  
            throw new ParserException("Stream reset failed (" + ioe.getMessage() + "), try wrapping it with a org.htmlparser.lexer.Stream", ioe);  

    // Switches the character set used to decode the underlying stream, then
    // re-reads the characters consumed so far and checks that they decode the
    // same as before.
    // Throws ParserException when the re-read comes up short or an IOException
    // occurs, and EncodingChangeException when a re-read character differs.
    // NOTE(review): this excerpt is lossy — braces, the try lines, at least one
    // condition and the comment delimiters around the "time:" line are missing.
    public void setEncoding(String character_set)
        throws ParserException
        String encoding = getEncoding();
         * time:2008年12月23日
            // Author's modification: replaces the requested charset with the
            // encoding already in effect. NOTE(review): any condition guarding
            // this assignment is not visible here.
            character_set = encoding;
            InputStream stream = getStream();
                // Keep the already-decoded characters and the read position for
                // the comparison below.
                char buffer[] = mBuffer;
                int offset = mOffset;
                    // Restart decoding with a new reader and cleared buffer state.
                    mEncoding = character_set;
                    mReader = new InputStreamReader(stream, character_set);
                    mBuffer = new char[mBuffer.length];
                    mLevel = 0;
                    mOffset = 0;
                    mMark = -1;
                    // Re-read as many characters as had been consumed and compare
                    // them one by one with the previous decode.
                    if(0 != offset)
                        char new_chars[] = new char[offset];
                        if(offset != read(new_chars))
                            throw new ParserException("reset stream failed");
                        for(int i = 0; i < offset; i++)
                            if(new_chars[i] != buffer[i])
                                throw new EncodingChangeException("character mismatch (new: " + new_chars[i] + " [0x" + Integer.toString(new_chars[i], 16) + "] != old: " + " [0x" + Integer.toString(buffer[i], 16) + buffer[i] + "]) for encoding change from " + encoding + " to " + character_set + " at character offset " + i);

                catch(IOException ioe)
                    throw new ParserException(ioe.getMessage(), ioe);
            catch(IOException ioe)
                throw new ParserException("Stream reset failed (" + ioe.getMessage() + "), try wrapping it with a org.htmlparser.lexer.Stream", ioe);
    } 这样一来,不管网页采用哪种编码方式,应该都能正常读取了。

文章来自: 本站原创
引用通告: 查看所有引用 | 我要引用此文章
评论: 0 | 引用: 0 | 查看次数: 464
昵 称:
密 码: 游客发言不需要密码.
内 容:
验证码: 验证码
选 项:
字数限制 1000 字 | UBB代码 开启 | [img]标签 关闭