Python Crawler: Fetching, Parsing, and Storing Web Page Content, Using Boohee (薄荷网) as an Example (16)
Fetching the raw page content
The first step walks the queue of category-page links collected earlier in this series, downloads each page, and stores the raw response so it can be parsed later.
import urllib.request
import time
from mgap_spider.dao.categoryPageLinkDao import *

def fetchraw(link):
    # download one page and return its body decoded as UTF-8
    req = urllib.request.Request(link)
    response = urllib.request.urlopen(req)
    the_page = response.read()
    content = the_page.decode("utf8")
    time.sleep(1)  # throttle so we do not hammer the server
    return content

# walk the queue of unfetched category-page links in batches of 100,
# store each raw page, then mark the link as dealt with
for i in range(0, 30000, 100):
    links = findNoDealedLimit(i, 100)
    for x in links:
        content = fetchraw(x['link'])
        insertCategoryPageRaw(x['link'], content, x['type'], x['source'])
        dealCategoryPagelink(x['link'])
        print("dealed %s %s %s" % (x['source'], x['type'], x['link']))
Parsing
The second step parses the stored raw pages. Pages whose source is the food-library app (食物库app) are JSON and can be loaded directly; the rest are HTML category pages, handled with BeautifulSoup. Both branches insert one item link per food, built from the food's code.
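For orientation, the two page shapes the parser expects look roughly like this. The class names, the /shiwu/ href pattern, and the field names are taken from the parsing code itself; the concrete values are placeholders, not real Boohee data:

# App (JSON) page, as consumed by the json.loads branch:
# {
#   "total_pages": <total pages>,
#   "foods": [
#     {"code": "<food code>", "name": "<food name>"},
#     ...
#   ]
# }
#
# Web (HTML) page, as consumed by the BeautifulSoup branch:
# <div class="widget-food-list">
#   <ul class="food-list">
#     <div class="text-box">
#       <a href="/shiwu/<food code>" title="<food name>">...</a>
#     </div>
#     ...
#   </ul>
# </div>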
from mgap_spider.dao.itemLinkDao import *
from mgap_spider.dao.categoryPageLinkDao import *
from bs4 import *
from datetime import datetime
import json
import re
import time
import _thread

def parserawauto(begin, size):
    linkhead = "http://food.boohee.com/fb/v1/foods/"
    linkend = "/mode_show?token=&user_key=&app_version=2.6.2.1&app_device=Android&os_version=7.1.2&phone_model=M6+Note&channel=meizu"
    while 1:
        try:
            # stop once every stored raw page has been parsed
            count = countNoDealedPageRaw()
            if count == 0:
                break
            raws = findNoDealedRawLimit(begin, size)
            for raw in raws:
                if raw['source'] == '食物库app':
                    # pages saved from the app are JSON: read the food list directly
                    contentjson = json.loads(raw['content'])
                    for food in contentjson['foods']:
                        link = linkhead + food['code'] + linkend
                        print("dealed %s %s" % (food['code'], food['name']))
                        insertItemLink(food['code'], food['name'], raw['link'], link, raw['type'], raw['source'])
                    dealCategoryPageRaw(raw['link'])
                    print("dealed %s %s %s" % (raw['source'], raw['type'], raw['link']))
                else:
                    # web pages are HTML: pull each food's code out of its /shiwu/ link
                    soup = BeautifulSoup(raw['content'], "html.parser")
                    div = soup.find("div", class_="widget-food-list")
                    ul = div.find("ul", class_="food-list")
                    boxs = ul.find_all("div", class_="text-box")
                    for box in boxs:
                        node = box.find('a', href=re.compile(r'/shiwu/\w+'))
                        code = node['href'].replace("/shiwu/", "")
                        name = node['title']
                        link = linkhead + code + linkend
                        print("dealed %s %s" % (code, name))
                        insertItemLink(code, name, raw['link'], link, raw['type'], raw['source'])
                    dealCategoryPageRaw(raw['link'])
                    print("dealed %s %s %s" % (raw['source'], raw['type'], raw['link']))
        except Exception as e:
            print(e)  # log and retry the batch
    return "begin " + str(begin) + " finish " + str(datetime.now())

def run():
    # spawn two worker threads over different batch offsets
    try:
        _thread.start_new_thread(parserawauto, (0, 100))
        _thread.start_new_thread(parserawauto, (100, 100))
    except Exception as e:
        print(e)
        print("Error: unable to start thread")

run()
# _thread workers die when the main thread exits, so keep it alive
while 1:
    time.sleep(1)
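One design note: _thread is Python's low-level threading primitive and offers no join, which is why the script ends in a keep-alive loop. The same two workers written with the stdlib threading module need no such loop. A minimal equivalent sketch, not what the original post uses:

import threading

workers = [
    threading.Thread(target=parserawauto, args=(0, 100)),
    threading.Thread(target=parserawauto, args=(100, 100)),
]
for w in workers:
    w.start()  # run both batch ranges concurrently
for w in workers:
    w.join()   # block until both workers drain the queue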