This version improves on the earlier "regular expression" version in three main ways:
(1) Fetched HTML pages are cached in SQLite, which makes re-processing the data far faster: the first run takes about 6 hours, but every run after that finishes in roughly 3 minutes.
(2) XPath replaces the regular expressions used to parse the HTML. XPath selectors are simpler and more convenient, and the lxml parser automatically repairs malformed HTML along the way (both points are sketched right after this list). XPath is really powerful!
(3) Duplicate results are removed (a possible speed-up for the duplicate check is sketched after the program listing).
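To make point (1) concrete, the heart of the cache is a "look up first, fetch and store on a miss" pattern. Below is a minimal sketch of that pattern, pared down from the full gethtml function in the listing; cached_fetch is just an illustrative name, and the table layout matches the program:

import sqlite3
import urllib2

conn = sqlite3.connect("html_cache.db")
conn.text_factory = str  # hand raw byte strings in and out of the TEXT column
curs = conn.cursor()
curs.execute("CREATE TABLE if not exists htmls"
             "(url VARCHAR(255) UNIQUE, content TEXT, size INTEGER);")

def cached_fetch(url):
    # cache hit: answer straight from SQLite, no network traffic
    curs.execute("select content from htmls where url=?;", (url,))
    row = curs.fetchone()
    if row:
        return row[0]
    # cache miss: download the page, store it, then return it
    html = urllib2.urlopen(url).read()
    curs.execute("insert into htmls values(?,?,?);", (url, html, len(html)))
    conn.commit()
    return html

Point (2) is just as easy to demonstrate. The snippet below (a made-up fragment, not from menupalace.com) deliberately leaves its tags unclosed; lxml repairs the markup before the XPath query runs:

from lxml import etree

# The <table>, <tr> and <td> tags below are never closed; lxml's HTML
# parser closes them automatically, so the XPath query still matches.
broken = "<table class='n_table'><tr><td><img src='flag.gif'>Canada"
tree = etree.HTML(broken)
for img in tree.xpath("//table[@class='n_table']//img"):
    print img.tail    # prints: Canada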
The program code is as follows:
# coding:utf-8
# Practice of scraping web data with xpath
# by redice 2010.11.05

import codecs
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import urllib2
from urllib2 import URLError, HTTPError
import zlib
import sqlite3

try:
    import cPickle as pickle
except ImportError:
    import pickle

conn = sqlite3.connect("html_cache.db")
conn.text_factory = lambda x: unicode(x, 'utf-8', 'replace')
curs = conn.cursor()

# if the htmls table does not exist, create it
#curs.execute('''CREATE TABLE if not exists htmls(url VARCHAR(255) UNIQUE,content BLOB,size INTEGER);''')
curs.execute('''CREATE TABLE if not exists htmls(url VARCHAR(255) UNIQUE,content TEXT,size INTEGER);''')
conn.commit()


def serialize(value):
    """convert object to a compressed pickled string to save in the db"""
    #return sqlite3.Binary(zlib.compress(pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL), 5))
    #return sqlite3.Binary(value)
    return value


def deserialize(value):
    """convert compressed pickled string from database back into an object"""
    #return pickle.loads(zlib.decompress(value)) if value else value
    return value


def gethtml(url):
    '''Fetch the target html, consulting the html_cache.db cache first.'''
    try:
        # look up html_cache.db first
        curs.execute("select * from htmls where url=?;", (url,))
        row = curs.fetchone()
        if row:  # cache hit
            return deserialize(str(row[1]))
        response = urllib2.urlopen(url)
        result = response.read()
        # cache miss: store the page in html_cache.db
        curs.execute("insert into htmls values(?,?,?);",
                     (url, serialize(result), len(result)))
        conn.commit()
        print "saved %s into html_cache.db" % url
        return result
    except URLError, e:
        if hasattr(e, 'reason'):
            print 'Failed to reach a server.'
            print 'Reason: ', e.reason
            return 'None'
        elif hasattr(e, 'code'):
            print "The server couldn't fulfill the request."
            print 'Error code: ', e.code
            return 'None'
# end def gethtml


import re


def regexmatch(rule, text):
    '''Return a list of all substrings of text matched by rule
    (left over from the regular-expression version).'''
    p = re.compile(rule)
    return p.findall(text)
# end def regexmatch


def decodeHtmlEntity(s):
    '''Normalize a scraped string: drop undecodable characters and
    replace non-breaking spaces (0xC2 0xA0) with plain spaces.'''
    if not s:
        return ''
    import locale
    enc = locale.getdefaultlocale()[1]
    return s.decode(enc, "ignore").encode(enc).replace("\xc2\xa0", " ")
# end def decodeHtmlEntity


# final result
dining_db = []
total = 0

# debug switch: stop after 10 records when set
debug = 0

# Fetch menupalace.com's html
print 'Fetching html from http://menupalace.com ...'
html = gethtml('http://menupalace.com')

from lxml import etree

if html == '' or html == 'None':
    print "Can't get the html from http://menupalace.com"
    sys.exit()

try:
    tree = etree.HTML(html)
    nodes = tree.xpath("//table[@class='n_table']")
except:
    f = open("log.txt", "a")
    f.write(html)
    f.close()
    print "error parsing the html from http://menupalace.com"
    sys.exit()

for node in nodes:
    if debug and total >= 10:
        break
    # Fetch the country (the text after the flag image)
    n = node.xpath("./tr[1]/td[1]/img")
    country = ""
    if len(n) > 0:
        country = decodeHtmlEntity(n[0].tail).strip()
    # Fetch all city links and walk through them
    ls = node.xpath(".//a")
    for l in ls:
        if debug and total >= 10:
            break
        # city
        city = decodeHtmlEntity(l.text).strip()
        prelink = l.get("href")
        link = prelink + "restaurants/restaurants.aspx"
        #print 'Fetching html from ' + link + ' ...'
        html = gethtml(link)
        if html == '' or html == 'None':
            print "Can't get the html from " + link
            continue
        try:
            subtree = etree.HTML(html)
            subnodes = subtree.xpath("//td[@class='frame_style_padding']")
        except:
            if debug:
                f = open("log.txt", "a")
                f.write(html)
                f.close()
                print "error parsing the html from " + link
                sys.exit()
            else:
                continue
        for sn in subnodes:
            if debug and total >= 10:
                break
            sls = sn.xpath(".//a")
            for sl in sls:
                if debug and total >= 10:
                    break
                link = prelink + "restaurants/" + sl.get("href")
                print 'Fetching html from ' + link + ' ...'
                html = gethtml(link)
                if html == '' or html == 'None':
                    print "Can't get the html from " + link
                    continue
                try:
                    sstree = etree.HTML(html)
                    ssnodes = sstree.xpath("//table[@width='94%'][@height='80px']")
                except:
                    if debug:
                        f = open("log.txt", "a")
                        f.write(html)
                        f.write(" ")
                        f.close()
                        print "error parsing the html from " + link
                        sys.exit()
                    else:
                        continue
                for ssn in ssnodes:
                    if debug and total >= 10:
                        break
                    # name
                    n = ssn.xpath(".//tr[1]/td[1]/a[1]")
                    name = ''
                    if len(n) > 0:
                        name = decodeHtmlEntity(n[0].text).strip()
                    # address: one restaurant may have several locations
                    n = ssn.xpath(".//tr[2]/td[1]")
                    address_arr = []
                    address = ''
                    state = ''
                    if len(n) > 0:
                        address = decodeHtmlEntity(n[0].text)
                        if address.strip() == 'Various Locations':
                            # the individual locations live in a <span>,
                            # separated by <br> tags
                            n = ssn.xpath(".//tr[1]/td[1]/div[1]/span[1]")
                            if len(n) > 0:
                                address = decodeHtmlEntity(n[0].text)
                                addrlist = address.split()
                                if len(addrlist) > 4:
                                    state = addrlist[-2]
                                    city = addrlist[-3]
                                    # remove state, city and zip from the address
                                    address = address.replace(state, '')
                                    address = address.replace(city, '')
                                    address = address.replace(addrlist[-1], '')
                                    address = address.strip()
                                address_arr.append((address, city, state))
                                brn = ssn.xpath(".//tr[1]/td[1]/div[1]/span[1]/br")
                                for n in brn:
                                    address = decodeHtmlEntity(n.tail)
                                    addrlist = address.split()
                                    if len(addrlist) > 4:
                                        state = addrlist[-2]
                                        city = addrlist[-3]
                                        # remove state, city and zip from the address
                                        address = address.replace(state, '')
                                        address = address.replace(city, '')
                                        address = address.replace(addrlist[-1], '')
                                        address = address.strip()
                                    address_arr.append((address, city, state))
                            else:
                                address_arr.append(('', '', ''))
                        else:
                            addrlist = address.split()
                            if len(addrlist) > 3:
                                state = addrlist[-1]
                                city = addrlist[-2]
                                # remove state and city from the address
                                address = address.replace(state, '')
                                address = address.replace(city, '')
                                address = address.strip()
                            address_arr.append((address, city, state))
                    # website
                    website = ''
                    n = ssn.xpath(".//tr[3]/td[1]/a[1]")
                    if len(n) > 0:
                        website = decodeHtmlEntity(n[0].text).strip()
                    if name and len(address) > 0:
                        for addr in address_arr:
                            dining = {}
                            dining['name'] = name
                            if addr[0] == 'Various Locations':
                                dining['address'] = ''
                            else:
                                dining['address'] = addr[0]
                            dining['city'] = addr[1]
                            dining['state'] = addr[2]
                            dining['country'] = country
                            dining['website'] = website
                            # Avoid duplication
                            if not (dining in dining_db):
                                dining_db.append(dining)
                                total = total + 1
                            if debug and total >= 10:
                                break

# Close the database connection
conn.close()

# print and save the final result
import csv
cf = open("scraping_result.csv", "w")
writer = csv.writer(cf)
writer.writerow(['name', 'address', 'city', 'state', 'country', 'website'])
for item in dining_db:
    #print item['name'], item['address'], item['city'], item['state'], item['country'], item['website']
    writer.writerow([item['name'], item['address'], item['city'],
                     item['state'], item['country'], item['website']])
cf.close()
print 'The result has been saved into scraping_result.csv!'
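A note on improvement (3): the program tests "if not (dining in dining_db)", which scans the whole result list for every record. A possible refinement, not part of the program above, is to keep a parallel set of hashable tuples so the membership test is constant-time; add_dining is just an illustrative name:

# Sketch only: assumes these six fields identify a record, which is
# what the dict comparison in the program effectively checks.
seen = set()
dining_db = []

def add_dining(dining):
    key = (dining['name'], dining['address'], dining['city'],
           dining['state'], dining['country'], dining['website'])
    if key not in seen:    # constant-time test instead of a list scan
        seen.add(key)
        dining_db.append(dining)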
Partial results:
Source code download:
File: Click to Download