Title: python--crawler--fetching, parsing and storing web page content--Boohee (薄荷网) as an example (17)
Author: look_w
Time: 2019-5-16 15:09
Storage

The helper module below wraps all MongoDB access for the crawl: category page links are queued in one collection, fetched raw pages are stored in another, and a dealed flag (0 = pending, 1 = processed) moves each record through the pipeline.
from pymongo import MongoClient
from mgap_spider.settings import config
from datetime import datetime

def initMongoClient():
    # Build the connection URI from the project settings and authenticate
    # against the admin database.
    uri = ("mongodb://" + config['mongo.username'] + ":" + config['mongo.password']
           + "@" + config['mongo.host'] + ":" + config['mongo.port'] + "/admin")
    print(uri)  # note: this echoes the credentials to stdout
    client = MongoClient(uri)
    return client
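The config object comes from mgap_spider.settings, which this part does not show; the string concatenation above implies a flat mapping of string values. A minimal stand-in for running the module locally might look like the following (the keys match the code above, but every value is a placeholder, not taken from the original project):

# Hypothetical stand-in for mgap_spider.settings.config; the real
# project supplies its own. All values below are placeholders.
config = {
    'mongo.username': 'spider',
    'mongo.password': 'secret',
    'mongo.host': '127.0.0.1',
    'mongo.port': '27017',  # a string, because the URI is built by concatenation
}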
def insertCategoryPageLink(page, order, asc, link, type, source):
    # Queue a category page link, skipping links that are already stored.
    insert_record = {'page': page, 'order': order, 'asc': asc, 'link': link,
                     'dealed': 0, 'type': type, 'source': source, 'date': datetime.now()}
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypagelink']
    queryArgs = {'link': link}
    # collection.count() is deprecated (removed in PyMongo 4);
    # count_documents() is the supported call
    link_count = collection.count_documents(queryArgs)
    if link_count == 0:
        collection.insert_one(insert_record)
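Checking count_documents() before insert_one() deduplicates in the common case, but two workers running concurrently can both see a count of 0 and insert the same link twice. A sketch of a stricter variant (not part of the original module) pushes the uniqueness check into MongoDB with a unique index:

from pymongo.errors import DuplicateKeyError

# Hypothetical variant: assumes a unique index was created once, e.g.
#   db['categorypagelink'].create_index('link', unique=True)
def insertCategoryPageLinkUnique(page, order, asc, link, type, source):
    collection = initMongoClient()['mydb_food']['categorypagelink']
    record = {'page': page, 'order': order, 'asc': asc, 'link': link,
              'dealed': 0, 'type': type, 'source': source, 'date': datetime.now()}
    try:
        collection.insert_one(record)
    except DuplicateKeyError:
        pass  # the link is already queued; nothing to do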
def findNoDealed():
    # Return every link record that has not been processed yet.
    familys = []
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypagelink']
    queryArgs = {'dealed': 0}
    searchRes = collection.find(queryArgs)
    for x in searchRes:
        familys.append(x)
    return familys

def findNoDealedLimit(skip, limit):
    # Same query as findNoDealed(), but paginated with skip/limit.
    familys = []
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypagelink']
    queryArgs = {'dealed': 0}
    searchRes = collection.find(queryArgs).skip(skip).limit(limit)
    for x in searchRes:
        familys.append(x)
    return familys
def findNoDealedRawLimit(skip, limit):
    # Paginated fetch of raw pages that have not been parsed yet.
    familys = []
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypageraw']
    queryArgs = {'dealed': 0}
    searchRes = collection.find(queryArgs).skip(skip).limit(limit)
    for x in searchRes:
        familys.append(x)
    return familys

def countNoDealedPageRaw():
    # Number of raw pages still waiting to be parsed.
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypageraw']
    queryArgs = {'dealed': 0}
    return collection.count_documents(queryArgs)
def findNoDealedLinks():
    # Convenience wrapper: just the URLs of the unprocessed link records.
    barcodes = []
    searchList = findNoDealed()
    for x in searchList:
        barcodes.append(x['link'])
    print(barcodes)
    return barcodes

def insertCategoryPageRaw(link, content, type, source):
    # Store the raw page content fetched from a link.
    insert_record = {'content': content, 'link': link, 'dealed': 0,
                     'type': type, 'source': source, 'date': datetime.now()}
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypageraw']
    collection.insert_one(insert_record)
def dealCategoryPagelink(link):
    # Mark a queued link as processed (dealed = 1).
    filterArgs = {'link': link}
    updateArgs = {'$set': {'dealed': 1}}
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypagelink']
    collection.update_many(filter=filterArgs, update=updateArgs)

def dealCategoryPageRaw(link):
    # Mark a stored raw page as processed (dealed = 1).
    filterArgs = {'link': link}
    updateArgs = {'$set': {'dealed': 1}}
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypageraw']
    collection.update_many(filter=filterArgs, update=updateArgs)
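Taken together, these helpers implement a simple two-stage queue: links go into categorypagelink, fetched pages go into categorypageraw, and the dealed flag advances each record through the pipeline. A minimal sketch of a fetch worker, assuming a fetch_html(url) helper that returns the page source (it is not defined in this part):

def crawl_batch(skip=0, limit=50):
    # Drain one batch of unprocessed links: fetch, store raw HTML, mark done.
    for record in findNoDealedLimit(skip, limit):
        link = record['link']
        html = fetch_html(link)  # assumed helper, e.g. built on requests
        insertCategoryPageRaw(link, html, record['type'], record['source'])
        dealCategoryPagelink(link)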