11-11
23

Java 写的 HTTP 代理的获取和验证——用于数据抽取,防止 IP 被封

package page;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* <p>Title: </p>
*
* <p>Description: </p>
*
* <p>Copyright: Copyright (c) 2008</p>
*
* <p>Company: </p>
*
* @author not attributable
* @version 1.0
*/
public class GetProxy {
    HashSet<String> hs=new HashSet();
    public GetProxy() {
    }
    public void getProxyFromProxyCN()//从代理中国访问
    {
        try {
        URL url = new URL("http://www.proxycn.com/html_proxy/30fastproxy-1.html");
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT)");
        InputStreamReader isr = new InputStreamReader(connection.getInputStream());
        BufferedReader   br   =   new   BufferedReader(isr);
        String   sCurrentLine   =   "";
        StringBuffer   sTotalString   =   new StringBuffer();
        while   ((sCurrentLine   =   br.readLine())   !=   null)
        {
              sTotalString.append(sCurrentLine+"\n");
              //System.out.println(sCurrentLine);
        }
        //System.out.println(sTotalString);
        /**
         * 抽取信息
         */


        // 表达式对象
        Pattern p = Pattern.compile("(\\d{3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}:\\d+)");


    // 创建 Matcher 对象
    Matcher m = p.matcher(sTotalString.toString());

    // 是否找到匹配
boolean found = m.find();
   while(m.find())
   {
        String foundstring = m.group();
        System.out.println(m.group(1));
        String []pp=m.group(1).split(":");
        int portnum=Integer.parseInt(pp[1]);
        if(portnum<65000){
            hs.add(m.group(1));
       }
        //学校编号 学校名 城市 名字区间
        //String temp=m.group(1)+","+m.group(2)+","+m.group(3)+","+i+"\n";
        //Buff.write(temp.getBytes());
        //int beginPos = m.start();
        //int endPos = m.end();
    }


    }
    catch(Exception ex)
    {
    System.out.println(ex.toString());
    ConstDatas.log(ex.toString());
    }

    }
    public boolean isVaildProxy(String proxyAndport)
    {
           try {
           String []pp=proxyAndport.split(":");
           if(pp.length>=2){//格式正确
               Properties systemProperties = System.getProperties();
               systemProperties.setProperty("http.proxyHost", pp[0].trim());
               systemProperties.setProperty("http.proxyPort", pp[1].trim());
               URL url = new URL("http://www.baidu.com/");
               HttpURLConnection connection = (HttpURLConnection) url.
                                              openConnection();
               connection.setRequestProperty("User-Agent",
                       "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT)");
               InputStreamReader isr = new InputStreamReader(connection.getInputStream());
               BufferedReader br = new BufferedReader(isr);
               String sCurrentLine = "";
               StringBuffer sTotalString = new StringBuffer();
               while ((sCurrentLine = br.readLine()) != null) {
                   sTotalString.append(sCurrentLine + "\n");
               }
               //System.out.println(sTotalString);
               if(sTotalString.indexOf("把百度设为首页")!=-1)
               {   System.out.println(proxyAndport +"   sucess!");
                   return true;
               }
               else
               return false;
           }
           else
               return false;
           }
           catch(Exception ex)
           {
               ex.printStackTrace();
               ConstDatas.log(ex.toString());
               return false;
           }
    }
    public void getproxyFromHust()
    {

       try {
       URL url = new URL("http://info.hustonline.net/index/proxyshow.aspx");
       HttpURLConnection connection = (HttpURLConnection) url.openConnection();
       connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT)");
       InputStreamReader isr = new InputStreamReader(connection.getInputStream());
       BufferedReader   br   =   new   BufferedReader(isr);
       String   sCurrentLine   =   "";
       StringBuffer   sTotalString   =   new StringBuffer();
       while   ((sCurrentLine   =   br.readLine())   !=   null)
       {
             sTotalString.append(sCurrentLine+"\n");
             //System.out.println(sCurrentLine);
       }
       //System.out.println(sTotalString);
       /**
        * 抽取信息
        */


       // 表达式对象
Pattern p = Pattern.compile("<b>(\\d{3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}:\\d+)</b>\\n                                  </td><td>\\n                                    <font color=.+>.+</font>\\n                                  </td><td>\\n                                    <font color=[blue]*[orange]*>.+</font>\\n                                  </td><td>\\n                                    <font color=.+>.+</font>");


   // 创建 Matcher 对象
   Matcher m = p.matcher(sTotalString.toString());

   // 是否找到匹配
boolean found = m.find();
while(m.find())
{
       String foundstring = m.group();
       System.out.println(m.group(1));
      // if(isVaildProxy(m.group(1)))
       String []pp=m.group(1).split(":");
       int portnum=Integer.parseInt(pp[1]);
       if(portnum<65000){
           hs.add(m.group(1));
       }
       //学校编号 学校名 城市 名字区间
       //String temp=m.group(1)+","+m.group(2)+","+m.group(3)+","+i+"\n";
       //Buff.write(temp.getBytes());
       //int beginPos = m.start();
       //int endPos = m.end();
   }


   }
   catch(Exception ex)
   {
   System.out.println(ex.toString());
   ConstDatas.log(ex.toString());
   }

    }
    public String ChangeProxy()
    {
        Random rd=new Random(System.currentTimeMillis()) ;
        int i=rd.nextInt(ConstDatas.proxy.size());
        String proxyAndport= (String)(ConstDatas.proxy.toArray())[i];
        String []pp=proxyAndport.split(":");
        System.out.println(Thread.currentThread()+"修改代理:"+proxyAndport);
          if(pp.length>=2){//格式正确
              Properties systemProperties = System.getProperties();
              systemProperties.setProperty("http.proxyHost", pp[0].trim());
              systemProperties.setProperty("http.proxyPort", pp[1].trim());
          }
          return proxyAndport;
    }
    public static void main(String[] args) {
        GetProxy g = new GetProxy();
        g.getProxyFromProxyCN();
       //g.getproxyFromHust();
       //g.isVaildProxy("211.99.188.220:80");
    }
}


[本日志由 admin 于 2012-02-02 00:19 AM 编辑]
文章来自: 本站原创
引用通告: 查看所有引用 | 我要引用此文章
Tags:
相关日志:
评论: 0 | 引用: 0 | 查看次数: 322
发表评论
昵 称:
密 码: 游客发言不需要密码.
内 容:
验证码: 验证码
选 项:
虽然发表评论不用注册,但是为了保护您的发言权,建议您注册帐号.
字数限制 1000 字 | UBB代码 开启 | [img]标签 关闭