Python怎么抓取聚劃算商品分析頁面獲取商品信息并以XML格式保存到本地

發布時間：2021-05-22 14:11:04 來源：億速云閱讀：166 作者：小新欄目：開發技術

這篇文章主要介紹Python怎么抓取聚劃算商品分析頁面獲取商品信息并以XML格式保存到本地，文中介紹的非常詳細，具有一定的參考價值，感興趣的小伙伴們一定要看完！

具體內容如下

#!/user/bin/python 
# -*- coding: gbk -*- 
#Spider.py 
 
import urllib2 
import httplib 
import StringIO 
import gzip 
import re 
import chardet 
import sys 
import os 
import datetime 
from xml.dom.minidom import Document 
from BeautifulSoup import BeautifulSoup 
 
## 這段代碼是用于解決控制臺打印漢字報錯的問題 
reload(sys) 
sys.setdefaultencoding("utf8") 
##################################################### 
 
## debug模式開關，開啟后可以看到Http請求的頭部信息以及debug日志 
DEBUG = 1 
NO_DEBUG = 0 
httplib.HTTPConnection.debuglevel = DEBUG 
## 是否顯示爬取網頁源代碼開關 
showSrcCode = False 
## 壓縮方式 
ZIP_TYPE = "gzip" 
 
fileName = "auctions" 
location = "d://spiderData/" 
 
## header 
headerConfig = {"User-Agent":"taobao-yanyuan.qzs", "Accept-encoding":ZIP_TYPE} 
##################################################### 
 
 
#############class SpiderConfig ##################### 
class SpiderConfig: 
 """ 
  configuration for spider name and url 
 """ 
 def __init__(self, name, url): 
  self.name = name 
  self.url = url 
##################################################### 
 
##############class SpiderAuctionDomain############## 
class SpiderAuctionDomain: 
 """ 
  Store information with auctions spidered by python 
 """ 
 title = "" 
 url = "" 
 img = "" 
 price = "" 
 
 def __init__(self): 
  pass 
 
##################################################### 
 
########class SpiderDefaultErrorHandler############## 
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler): 
 def http_error_default(self, req, fp, code, msg, hdrs): 
  """ 
   default error process handler for spider 
  """ 
  result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp) 
  result.status = code 
  result.url = req.get_full_url() 
 
  print "<", result.url, "Exception code :", result.status, ">" 
 
  return result 
##################################################### 
 
#############class SpiderHandler##################### 
class SpiderHandler: 
 """ 
  spider handler 
 """ 
 
 def spider(self, spiderConfig): 
  try: 
   request = urllib2.Request(spiderConfig.url) 
 
   ## configure request hreader 
   for key,val in headerConfig.items(): 
    request.add_header(key, val) 
 
   ## build opener 
   opener = urllib2.build_opener(SpiderDefaultErrorHandler()) 
 
   ## open request 
   openRequest = opener.open(request) 
 
   ## read data 
   spiderData = openRequest.read() 
 
   ## close 
   opener.close() 
 
   if 0 == len(spiderData): 
    return 
 
   if ZIP_TYPE== openRequest.headers.get("Content-Encoding"): 
    spiderData = SpiderHandler.gzipData(self, spiderData) 
 
   if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode: 
    print spiderData 
 
   # parse html 
   SpiderHandler.parse(self, spiderData) 
 
  except Exception,x: 
   print "spider process Exception:", x 
 
 
 
 def parse(self, spiderData): 
  """ 
   parse html content 
  """ 
 
  if httplib.HTTPConnection.debuglevel == DEBUG: 
   charsetAnalyze = chardet.detect(spiderData) 
   print "analyze spider data encode :",charsetAnalyze["encoding"] 
 
  print "執行解析", fileName 
 
  soup = BeautifulSoup(spiderData) 
  encode = soup.originalEncoding 
 
  encoding = lambda x : x.encode(encode) 
 
  if httplib.HTTPConnection.debuglevel == DEBUG: 
   print "識別到編碼：", encode 
   title = soup.head.title.string 
   print encoding(title) 
 
  spiderContents = soup.findAll(name="div", attrs={"class":"main-box avil"}) 
  auctions = ["%s" % s for s in spiderContents] 
 
  if auctions is None: 
   return 
 
  auctionList = [] 
 
  for auc in auctions: 
   auctionDomain = SpiderAuctionDomain() 
   # parse auction link 
   links = re.search(re.compile(r'<a href=[\"|\']http://ju.taobao.com/tg/life_home.htm\?item_id=([^>]*)[\"|\']', re.IGNORECASE), auc) 
   if links is not None : 
    auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + "".join(["%s" % s for s in links.groups() if len(s) > 0])) 
 
   #parse auction title 
   titles = re.search(re.compile(r"([^>]*)</a></h3>", re.IGNORECASE), auc) 
   if titles is not None: 
    auctionDomain.title = encoding("".join(["%s" % t for t in titles.groups() if len(t) > 0])) 
 
   #parse auction price 
   price = re.search(re.compile(r"<strong class=\"J_juPrices\".*</b>([^<]*)</strong>", re.IGNORECASE), auc) 
   if price is not None: 
    auctionDomain.price = "".join(["%s" % p for p in price.groups() if len(p) > 0]) 
 
   #parse image url 
   imgs = re.search(re.compile(r"<img src=[\'\"]([^>]*)[\'\"]", re.IGNORECASE), auc) 
   if imgs is not None: 
    auctionDomain.img = "".join(["%s" % i for i in imgs.groups() if len(i) > 0]) 
 
   auctionList.append(auctionDomain) 
 
  print "成功解析商品信息：" 
  for a in auctionList: 
   print "--->",a.title 
 
  # sort auction list 
  auctionList = SpiderHandler.sortAuctionList(self, auctionList) 
 
  # save in file 
  SpiderHandler.save(self, auctionList) 
 
  print "解析完成" 
 
  pass 
 
 def sortAuctionList(self, auctionList): 
  """ 
   冒泡排序，按照價格排序 
  """ 
  length = len(auctionList) 
  if length < 2: 
   return auctionList 
  else: 
   for i in range(length-1): 
    for j in range(length - i -1): 
     if float(auctionList[j].price) > float(auctionList[j+1].price): 
      auctionList[j], auctionList[j+1] = auctionList[j+1], auctionList[j] 
  return auctionList 
  pass 
 
 def save(self, auctionList): 
  if auctionList is not None: 
   doc = Document() 
 
   auctions = doc.createElement("auctions") 
   doc.appendChild(auctions) 
 
   for auc in auctionList: 
    auction = doc.createElement("auction") 
    auctions.appendChild(auction) 
 
    SpiderHandler.generateXML(self, doc, auction, "title", auc.title) 
    SpiderHandler.generateXML(self, doc, auction, "price", auc.price) 
    SpiderHandler.generateXML(self, doc, auction, "img", auc.img) 
    SpiderHandler.generateXML(self, doc, auction, "link", auc.link) 
 
   if False == os.path.exists(location): 
    os.mkdir(location) 
 
   file = open(location+fileName+".xml", 'w') 
   file.write(doc.toprettyxml()) 
   file.close() 
 
   if httplib.HTTPConnection.debuglevel == DEBUG: 
    print doc.toprettyxml() 
 
 def generateXML(self, doc, f, name, txt): 
  c = doc.createElement(name) 
  f.appendChild(c) 
  c.appendChild(doc.createTextNode(txt)) 
 
 def gzipData(self, spiderData): 
  """ 
   get data from gzip 
  """ 
  if 0 == len(spiderData): 
   return spiderData 
  spiderDataStream = StringIO.StringIO(spiderData) 
  spiderData = gzip.GzipFile(fileobj=spiderDataStream).read() 
  return spiderData 
##################################################### 
 
if __name__ == "__main__": 
 nowtime = lambda:datetime.datetime.strftime(datetime.datetime.now(),"%Y年%m月%d日 %H時%m分%S秒") 
 
 needSpiderUrl = {"suzhou":"http://ju.taobao.com/suzhou", 
      "hangzhou":"http://ju.taobao.com/hangzhou", 
      "shanghai":"http://ju.taobao.com/shanghai", 
      "beijing":"http://ju.taobao.com/beijing", 
      "chengdu":"http://ju.taobao.com/chengdu"} 
 
 configList = [] 
 for k,v in needSpiderUrl.items(): 
  spiderConfig = SpiderConfig(k, v) 
  configList.append(spiderConfig) 
 
 spiderHandler = SpiderHandler() 
 
 print "爬蟲執行開始時間：",nowtime() 
 for spiderConfig in configList: 
  fileName = spiderConfig.name 
  spiderHandler.spider(spiderConfig) 
 
 print "爬蟲執行完畢時間：",nowtime()

python是什么意思

Python是一種跨平臺的、具有解釋性、編譯性、互動性和面向對象的腳本語言，其最初的設計是用于編寫自動化腳本，隨著版本的不斷更新和新功能的添加，常用于用于開發獨立的項目和大型項目。

以上是“Python怎么抓取聚劃算商品分析頁面獲取商品信息并以XML格式保存到本地”這篇文章的所有內容，感謝各位的閱讀！希望分享的內容對大家有幫助，更多相關知識，歡迎關注億速云行業資訊頻道！

向AI問一下細節

中文字幕av专区_日韩电影在线播放_精品国产精品久久一区免费式_av在线免费观看网站

Python怎么抓取聚劃算商品分析頁面獲取商品信息并以XML格式保存到本地

python是什么意思

猜你喜歡

中文字幕av专区_日韩电影在线播放_精品国产精品久久一区免费式_av在线免费观看网站

Python怎么抓取聚劃算商品分析頁面獲取商品信息并以XML格式保存到本地

python是什么意思

猜你喜歡

最新資訊

相關推薦

相關標簽