python--爬虫--获取和解析存储网页内容--以薄荷网为例（16）

论坛元老

Rank: 8 Rank: 8

UID: 1066743

1^#

打印

字体大小: tT

look_w发表于 2019-5-16 15:08 | 只看该作者

python--爬虫--获取和解析存储网页内容--以薄荷网为例（16）

抓取网页原始内容并存储

import urllib
import urllib.request
import json
from mgap_spider.dao.categoryPageLinkDao import *
from bs4 import *
import string
import time
import re

def fetchraw(link, delay=1):
    """Download ``link`` and return the response body decoded as UTF-8 text.

    Args:
        link: URL to request.
        delay: Seconds to pause after the download before returning.
            Defaults to 1, the pause this function has always applied;
            pass 0 to skip it.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    request = urllib.request.Request(link)
    # The context manager closes the response (and its underlying
    # connection) even when read() or decode() raises; previously the
    # response object was never closed.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode("utf8")
    if delay > 0:
        time.sleep(delay)
    return content
#contentjson = json.loads(content)
#print(contentjson)
#print(contentjson['total_pages'])

# Crawl driver: request the pending category-page links in batches of 100
# (offsets 0, 100, ..., 29900), download each page, store its raw content,
# and then flag the link as handled.
# NOTE(review): every handled link is flagged via dealCategoryPagelink while
# the offset keeps advancing; if findNoDealedLimit only returns unflagged
# rows, later offsets may skip entries — confirm against the DAO.
for offset in range(0, 30000, 100):
    pending = findNoDealedLimit(offset, 100)
    for entry in pending:
        url = entry['link']
        page_text = fetchraw(url)
        insertCategoryPageRaw(url, page_text, entry['type'], entry['source'])
        dealCategoryPagelink(url)
        print("dealed %s %s %s" % (entry['source'], entry['type'], url))

解析已抓取的页面内容，提取食物条目链接并存储

from mgap_spider.dao.itemLinkDao import *
from mgap_spider.dao.categoryPageLinkDao import *
import json
from bs4 import *
import string
import time
import re
import _thread

# Fixed prefix and suffix of the per-food detail URL; the food code goes
# in between.
_ITEM_LINK_HEAD = "http://food.boohee.com/fb/v1/foods/"
_ITEM_LINK_TAIL = "/mode_show?token=&user_key=&app_version=2.6.2.1&app_device=Android&os_version=7.1.2&phone_model=M6+Note&channel=meizu"


def _extract_app_items(content):
    """Return ``(code, name)`` pairs from a JSON page with a ``foods`` list."""
    return [(food['code'], food['name']) for food in json.loads(content)['foods']]


def _extract_web_items(content):
    """Return ``(code, name)`` pairs from an HTML food-list page.

    Lookup errors (missing container, list, or anchor) propagate to the
    caller, which treats them as a failure of the whole page.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(content, "html.parser")
    div = soup.find("div", class_="widget-food-list")
    ul = div.find("ul", class_="food-list")
    href_pattern = re.compile(r'/shiwu/\w+')  # compiled once, not per box
    items = []
    for box in ul.find_all("div", class_="text-box"):
        node = box.find('a', href=href_pattern)
        items.append((node['href'].replace("/shiwu/", ""), node['title']))
    return items


def _store_page_items(raw):
    """Insert an item link for every food on one raw page, then flag the page."""
    if raw['source'] == '食物库app':
        items = _extract_app_items(raw['content'])
    else:
        items = _extract_web_items(raw['content'])
    for code, name in items:
        link = _ITEM_LINK_HEAD + code + _ITEM_LINK_TAIL
        print("dealed %s %s" % (code, name))
        insertItemLink(code, name, raw['link'], link, raw['type'], raw['source'])
    # Flag the page once, after all of its items were handed to
    # insertItemLink. Doing this inside the item loop flagged the page after
    # the first item and never flagged a page that had no items at all.
    dealCategoryPageRaw(raw['link'])
    print("dealed %s %s %s" % (raw['source'], raw['type'], raw['link']))


def parserawauto(begin, size):
    """Parse stored raw category pages and record the item links they contain.

    Repeatedly asks ``findNoDealedRawLimit(begin, size)`` for a batch of raw
    pages (presumably an offset/limit window over the unprocessed rows —
    confirm against the DAO) and processes each one. The loop ends when
    ``countNoDealedPageRaw()`` reports 0 or when a whole batch yields no
    successfully processed page.

    Args:
        begin: First argument passed to ``findNoDealedRawLimit``.
        size: Second argument passed to ``findNoDealedRawLimit``.

    Returns:
        A status string containing ``begin`` and the finish timestamp.
    """
    # Local import so the timestamp below works regardless of what the
    # star-imports at the top of the file happen to export.
    from datetime import datetime

    failed = set()  # links of pages that raised during this run
    while True:
        try:
            if countNoDealedPageRaw() == 0:
                break
            raws = findNoDealedRawLimit(begin, size)
        except Exception as e:
            print(e)
            # Pause before retrying instead of spinning at full speed.
            time.sleep(1)
            continue

        progressed = False
        for raw in raws:
            page_link = None
            try:
                page_link = raw['link']
                if page_link not in failed:
                    _store_page_items(raw)
                    progressed = True
            except Exception as e:
                # One bad page must not block the rest of the batch.
                print(e)
                if page_link is not None:
                    failed.add(page_link)

        # Nothing in this batch could be processed (empty batch or only
        # known-bad pages): stop rather than retry the same work forever.
        if not progressed:
            break

    # datetime.now() must be converted explicitly; concatenating it to a str
    # raised TypeError in the original return statement.
    return "begin " + str(begin) + " finish" + str(datetime.now())

def run():
    """Start two parser worker threads and return them.

    The workers call ``parserawauto`` with ``(0, 100)`` and ``(100, 100)``.

    Returns:
        The list of started ``threading.Thread`` objects; it may be shorter
        than two if starting a thread failed.
    """
    # Function-scope import: threading gives joinable threads, which the
    # low-level _thread API does not.
    import threading

    workers = []
    try:
        for begin in (0, 100):
            worker = threading.Thread(target=parserawauto, args=(begin, 100))
            worker.start()
            workers.append(worker)
    except Exception as e:
        print(e)
        print("Error: unable to start thread")
    return workers


# Wait for the workers to finish. This replaces a `while 1: pass` loop that
# kept the main thread alive by spinning at full CPU and never terminated.
for worker in run():
    worker.join()

收藏分享评分

回复引用

订阅 TOP

返回列表