python--爬虫--获取和解析存储网页内容--以薄荷网为例(15)
UID: 1066743
薄荷网获取、解析、存库完整示例：抓取食品分类列表的分页链接。
def fetchcategoryview(value, source):
    """Fetch page 1 of a boohee.com "view_group" food-category listing,
    extract the category name and total record count, compute the number
    of result pages, and store one link per page via insertCategoryPageLink().

    Args:
        value: numeric category id appended to the listing URL.
        source: origin tag stored with each link (e.g. "薄荷web-view").
    """
    headlink = "http://www.boohee.com/food/view_group/"
    pagelink = "?page="
    content = fetchraw(headlink + str(value) + pagelink + str(1))
    # Explicit parser avoids bs4's "no parser specified" warning and keeps
    # parsing behaviour consistent across environments.
    soup = BeautifulSoup(content, "html.parser")
    div = soup.find("div", class_="widget-food-list")
    h3 = div.h3
    # The first stripped string of the <h3> is the category title; strip
    # spaces/colons/newlines.  (Renamed from `type`, which shadowed the
    # builtin; also pre-initialised so it is defined even if <h3> is empty.)
    category = ""
    for inx, val in enumerate(h3.stripped_strings):
        if inx == 0:
            category = val.replace(" ", "").replace(":", "").strip('\n')
            print(category)
    span = div.find("span", class_="pagination-sum")
    nums = span.string
    # First number in the pagination summary is the total record count.
    recordnum = re.findall(r"\d+\.?\d*", nums)[0]
    # 10 records per page; +1 for the partial last page.
    pagelimit = int(recordnum) // 10 + 1
    print(pagelimit)
    pagelimit = pagelimit + 1  # range() upper bound is exclusive
    order = 0
    asc = 0
    # NOTE(review): clamps 12 -> 10, presumably because the site caps
    # browsable listings at 10 pages — confirm against the live site.
    if pagelimit == 12:
        pagelimit = 10
    for page in range(1, pagelimit):
        print("page:%s order_by:%s order_asc%s" % (page, order, asc))
        link = headlink + str(value) + pagelink + str(page)
        print(link)
        insertCategoryPageLink(page, order, asc, link, category, source)
def fetchcategorygroup(value, source):
    """Fetch page 1 of a boohee.com "group" food-category listing,
    extract the category name and total record count, compute the number
    of result pages, and store one link per page via insertCategoryPageLink().

    Args:
        value: numeric category id appended to the listing URL.
        source: origin tag stored with each link (e.g. "薄荷web-group").
    """
    headlink = "http://www.boohee.com/food/group/"
    pagelink = "?page="
    content = fetchraw(headlink + str(value) + pagelink + str(1))
    # Explicit parser avoids bs4's "no parser specified" warning and keeps
    # parsing behaviour consistent across environments.
    soup = BeautifulSoup(content, "html.parser")
    div = soup.find("div", class_="widget-food-list")
    h3 = div.h3
    # The first stripped string of the <h3> is the category title; strip
    # spaces/colons/newlines.  (Renamed from `type`, which shadowed the
    # builtin; also pre-initialised so it is defined even if <h3> is empty.)
    category = ""
    for inx, val in enumerate(h3.stripped_strings):
        if inx == 0:
            category = val.replace(" ", "").replace(":", "").strip('\n')
            print(category)
    span = div.find("span", class_="pagination-sum")
    nums = span.string
    # First number in the pagination summary is the total record count.
    recordnum = re.findall(r"\d+\.?\d*", nums)[0]
    print(recordnum)
    # 10 records per page; +1 for the partial last page.
    pagelimit = int(recordnum) // 10 + 1
    print(pagelimit)
    pagelimit = pagelimit + 1  # range() upper bound is exclusive
    order = 0
    asc = 0
    # NOTE(review): clamps 12 -> 10, presumably because the site caps
    # browsable listings at 10 pages — confirm against the live site.
    if pagelimit == 12:
        pagelimit = 10
    for page in range(1, pagelimit):
        print("page:%s order_by:%s order_asc%s" % (page, order, asc))
        link = headlink + str(value) + pagelink + str(page)
        print(link)
        insertCategoryPageLink(page, order, asc, link, category, source)
# Crawl every category: ids 1-40 for the "group" listings, 1-131 for the
# "view_group" listings.  Each call stores the per-page links.
# (Removed a stray "|" artifact that trailed the last line — it was
# extraction junk and a syntax error.)
for value in range(1, 41):
    fetchcategorygroup(value, "薄荷web-group")
for value in range(1, 132):
    fetchcategoryview(value, "薄荷web-view")