功能:
验证 http://www.5uproxy.net/ 最新HTTP代理,匿名HTTP代理,透明HTTP代理,SOCKS5代理 的有效性。
给出可用代理列表,按响应时间从小到大顺序排列。
缺点:
目前这个版本采用的是单线程,验证完所有数据大约需要20分钟。过两天有时间的话再改成多进程的。
源码如下:
# coding:gbk
# Verify the latest available proxies listed on http://www.5uproxy.net
# by redice 2010.12.05
#
# Target interpreter: Python 2 (urllib2, reload(sys)).  The syntax is kept
# valid for Python 3 parsers as well: "except ... as" and single-argument
# print(...) calls behave identically under the Python 2 print statement.
import sys
# Python 2 only: make gbk the default codec for implicit str/unicode
# conversions (setdefaultencoding is hidden until sys is reloaded).
reload(sys)
sys.setdefaultencoding('gbk')

import re
import socket
import time
import timeit
import urllib
import urllib2
from urllib2 import URLError, HTTPError

DEBUG = True

# Seconds to wait for a proxy to accept the TCP connection.
CONNECT_TIMEOUT = 10

# One table row of a proxy list page; the three groups are the proxy host,
# its port and its country column.  Compiled once instead of once per page.
_PROXY_ROW_RE = re.compile(
    r'''<tr .*?>[\s\S]*?<td .*?>\d+?</td>[\s\S]*?<td>(\S+?)</td>[\s\S]*?<td .*?>(\S+?)</td>[\s\S]*?<td .*?>(\S+?)</td>[\s\S]*?</tr>''',
    re.DOTALL)


def getHtml(url, post_data=None, cookie=None):
    """Download ``url`` and return the response body.

    url       - URL to fetch
    post_data - optional data accepted by urllib.urlencode; when given it
                is url-encoded and sent as the POST entity
    cookie    - optional value for the Cookie request header

    Returns the body as a string.  Returns '' when the body is empty or
    when any error occurs; errors are printed only if DEBUG is set and are
    never raised to the caller.
    """
    if DEBUG:
        print("getHtml:  %s" % (url,))

    try:
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0')
        # The HTTP header name is spelled "Referer"; the original sent
        # "Referrer", which servers do not recognise.
        request.add_header('Referer', url)
        if cookie:
            request.add_header('Cookie', cookie)

        opener = urllib2.build_opener()
        if post_data:
            response = opener.open(request, urllib.urlencode(post_data))
        else:
            response = opener.open(request)

        # Close the response even if read() fails.
        try:
            body = response.read()
        finally:
            response.close()
        return body or ''

    except HTTPError as e:
        if DEBUG:
            print('Error retrieving data: %s' % (e,))
            print('Server error document follows:\n')
        return ''

    except URLError as e:
        if DEBUG:
            if hasattr(e, 'reason'):
                print('Failed to reach a server.')
                print('Reason:  %s' % (e.reason,))
            elif hasattr(e, 'code'):
                print('The server couldn\'t fulfill the request.')
                print('Error code:  %s' % (e.code,))
        # Always return '' here; the original fell through and returned
        # None when neither attribute was present.
        return ''

    except Exception as e:
        if DEBUG:
            print(e)
        return ''


def _measure_connect_ms(host, port, timeout=CONNECT_TIMEOUT):
    """Open a TCP connection to host:port and return the time taken in ms.

    Raises whatever socket.connect() / int() raise when the port is not a
    number, the host cannot be reached or the timeout expires.  The socket
    is closed in every case so failed probes do not leak descriptors.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.settimeout(timeout)
        # timeit.default_timer is a wall-clock timer on every platform.
        # time.clock() reports CPU time on Unix, so the time spent blocked
        # in connect() was not counted there.
        start = timeit.default_timer()
        sock.connect((host, int(port)))
        return int((timeit.default_timer() - start) * 1000)
    finally:
        sock.close()


# Proxy list pages to verify.
proxys = [
    {'url': 'http://www.5uproxy.net/http_fast.html', 'type': 'http_fast'},
    {'url': 'http://www.5uproxy.net/http_anonymous.html', 'type': 'http_anonymous'},
    {'url': 'http://www.5uproxy.net/http_non_anonymous.html', 'type': 'http_transparent'},
    {'url': 'http://www.5uproxy.net/socks5.html', 'type': 'socks5'},
]

result = []
# (type, domain, port) triples already probed.  The original compared whole
# result dicts, which include the measured offset, so a proxy listed twice
# was practically never recognised as a duplicate.
seen = set()

for proxy in proxys:
    html = getHtml(proxy['url'])

    # Extract every proxy row from the page (none when the download failed).
    for proxy_domain, proxy_port, proxy_state in _PROXY_ROW_RE.findall(html):
        key = (proxy['type'], proxy_domain, proxy_port)
        if key in seen:
            continue
        seen.add(key)

        print("正在验证:%s,%s" % (proxy_domain, proxy_port))

        # A proxy counts as usable when it accepts a TCP connection within
        # the timeout.
        try:
            offset = _measure_connect_ms(proxy_domain, proxy_port)
        except Exception as e:
            if DEBUG:
                print(e)
            print("%s,%s 验证失败!" % (proxy_domain, proxy_port))
            continue

        result.append({
            'domain': proxy_domain,
            'port': proxy_port,
            'state': proxy_state,
            'type': proxy['type'],
            'offset': offset,
        })
        print("%s,%s 验证通过,响应时间:%d ms,已加入报表!" % (proxy_domain, proxy_port, offset))

# Fastest proxies first (list.sort is stable, so ties keep discovery order).
result.sort(key=lambda item: item['offset'])

with open('result.txt', 'w') as report:
    print("验证结果如下:")
    for item in result:
        line = '%s,%s,%s,%s,%d' % (item['type'], item['domain'], item['port'], item['state'], item['offset'])
        print(line)
        report.write(line + '\n')

print("所有数据已验证完毕,共计%d个,验证通过的代理已存入result.txt" % (len(result)))
最后给出一个我跑的结果:
Latest Free proxy list(最新免费代理列表)
http://zhongguo-proxy.appspot.com/
呵呵,谢谢
VaTG790i.最好的<a href=http://www.kyfei.com>网站推广软件</a>,
非常好
....................
;ui;普i;uighur;ui;ui;个
在unix网络编程中看到了关于TCP/IP的一些内容,我感觉还是写的不够。正在下载中,一定
下载地址呢