
Title: python -- crawler -- fetching, parsing and storing web page content, with 薄荷网 (Boohee) as the example (17)

Author: look_w    Time: 2019-5-16 15:09

Storage

This part persists crawl state in MongoDB: category page links go into the categorypagelink collection, and raw page HTML into categorypageraw. Both carry a dealed flag (0 = pending, 1 = processed) so an interrupted crawl can resume where it left off.

from pymongo import MongoClient
from mgap_spider.settings import config
from datetime import datetime


def initMongoClient():
    # Build the connection URI from settings and authenticate against the admin database.
    uri = "mongodb://" + config['mongo.username'] + ":" + config['mongo.password'] \
          + "@" + config['mongo.host'] + ":" + config['mongo.port'] + "/admin"
    client = MongoClient(uri)
    return client
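
The config mapping comes from mgap_spider.settings, which this post never shows. Below is a minimal sketch of the shape initMongoClient assumes -- the values are placeholders, and note the port stays a string because the URI is built by concatenation:

# mgap_spider/settings.py -- assumed shape, not shown in the original post
config = {
    'mongo.username': 'spider',   # placeholder credentials
    'mongo.password': 'secret',
    'mongo.host': '127.0.0.1',
    'mongo.port': '27017',        # must be a str: it is concatenated into the URI above
}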


def insertCategoryPageLink(page, order, asc, link, type, source):
    # Queue a category page URL for crawling; dealed=0 marks it as not yet fetched.
    insert_record = {'page': page, 'order': order, 'asc': asc, 'link': link, 'dealed': 0,
                     'type': type, 'source': source, 'date': datetime.now()}
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypagelink']
    queryArgs = {'link': link}
    # Skip duplicates: only insert when no document with this link exists yet.
    link_count = collection.count_documents(queryArgs)
    if link_count == 0:
        collection.insert_one(insert_record)
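
Checking the count before insert_one keeps duplicate links out, but two crawler processes can race between the check and the insert. A common safeguard, shown here as a one-time setup step that is not part of the original module, is a unique index on link:

# one-time setup (assumption, not in the original code): let MongoDB enforce uniqueness
client = initMongoClient()
client['mydb_food']['categorypagelink'].create_index('link', unique=True)

With the index in place, a racing duplicate insert raises pymongo.errors.DuplicateKeyError instead of silently adding a second document.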

def findNoDealed():
    # Return every queued link document that has not been processed yet.
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypagelink']
    queryArgs = {'dealed': 0}
    return list(collection.find(queryArgs))


def findNoDealedLimit(skip, limit):
    # Return one batch of unprocessed link documents, paged with skip/limit.
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypagelink']
    queryArgs = {'dealed': 0}
    return list(collection.find(queryArgs).skip(skip).limit(limit))


def findNoDealedRawLimit(skip, limit):
    # Return one batch of unparsed raw pages, paged with skip/limit.
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypageraw']
    queryArgs = {'dealed': 0}
    return list(collection.find(queryArgs).skip(skip).limit(limit))
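
A sketch of how these batch readers might be driven. The name iterNoDealedRaw and PAGE_SIZE are illustrative, not from the original; it assumes the caller marks each page with dealCategoryPageRaw after handling it, which is also why it re-queries from skip=0 -- documents flipped to dealed=1 drop out of the filter, and paging with a growing skip would jump over unseen ones:

# illustrative driver (not part of the original module)
PAGE_SIZE = 50

def iterNoDealedRaw():
    # Re-query from the front each round: processed pages leave the dealed==0
    # result set, so the next unprocessed batch is always at skip=0.
    while True:
        batch = findNoDealedRawLimit(0, PAGE_SIZE)
        if not batch:
            break
        for doc in batch:
            yield doc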


def countNoDealedPageRaw():
    # Count how many raw pages are still waiting to be parsed.
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypageraw']
    queryArgs = {'dealed': 0}
    return collection.count_documents(queryArgs)


def findNoDealedLinks():
    # Return just the URLs of the unprocessed links.
    barcodes = [x['link'] for x in findNoDealed()]
    print(barcodes)  # debug output: the pending URLs
    return barcodes

def insertCategoryPageRaw(link, content, type, source):
    # Archive the raw HTML of a fetched page so it can be parsed later, offline.
    insert_record = {'content': content, 'link': link, 'dealed': 0, 'type': type,
                     'source': source, 'date': datetime.now()}
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypageraw']
    collection.insert_one(insert_record)

def dealCategoryPagelink(link):
    # Mark a queued link as processed (dealed: 0 -> 1).
    filterArgs = {'link': link}
    updateArgs = {'$set': {'dealed': 1}}
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypagelink']
    collection.update_many(filter=filterArgs, update=updateArgs)


def dealCategoryPageRaw(link):
    # Mark an archived raw page as parsed (dealed: 0 -> 1).
    filterArgs = {'link': link}
    updateArgs = {'$set': {'dealed': 1}}
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypageraw']
    collection.update_many(filter=filterArgs, update=updateArgs)
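
A minimal end-to-end sketch of how these functions compose into one crawl pass. fetch_html is a hypothetical stand-in for the downloader used elsewhere in this series, and the seed URL plus the page/order/asc/type/source arguments are placeholders:

import requests

def fetch_html(url):
    # hypothetical downloader -- the real request logic lives in earlier parts of the series
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return resp.text

def crawl_pass(seed_link):
    # 1. queue the seed category page (insertCategoryPageLink skips links already queued)
    insertCategoryPageLink(1, 1, 1, seed_link, 'category', 'boohee')
    # 2. download every pending link, archive its raw HTML, then mark the link processed
    for link in findNoDealedLinks():
        html = fetch_html(link)
        insertCategoryPageRaw(link, html, 'category', 'boohee')
        dealCategoryPagelink(link)  # flip dealed 0 -> 1 so the link is not fetched twice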



