
Python Crawler: Fetching, Parsing, and Storing Web Page Content, Using Boohee (薄荷网) as an Example (17)


Storage

The storage layer below wraps MongoDB: the categorypagelink collection serves as the crawl queue, the categorypageraw collection archives fetched HTML, and a dealed flag (0 = pending, 1 = processed) tracks progress in both.

from pymongo import MongoClient
from mgap_spider.settings import config
from datetime import datetime


def initMongoClient():
    # Build the connection URI from the crawler settings and authenticate
    # against the admin database. The URI embeds the password, so avoid
    # printing it.
    uri = ("mongodb://" + config['mongo.username'] + ":" + config['mongo.password']
           + "@" + config['mongo.host'] + ":" + config['mongo.port'] + "/admin")
    return MongoClient(uri)
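The settings module itself is not shown in this post. A minimal sketch of what initMongoClient expects from config, assuming only the key names implied by the code above (every value here is a placeholder):

# mgap_spider/settings.py -- hypothetical sketch. Only the key names are
# implied by initMongoClient(); all values are placeholders.
config = {
    'mongo.username': 'spider',    # placeholder credential
    'mongo.password': 'secret',    # placeholder credential
    'mongo.host': '127.0.0.1',
    'mongo.port': '27017',         # a string, since the URI is built by concatenation
}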


def insertCategoryPageLink(page, order, asc, link, type, source):
    # Queue a category page link for crawling; 'dealed' = 0 marks it as
    # not yet processed. (The parameter name 'type' shadows the built-in;
    # kept as-is from the original signature.)
    insert_record = {'page': page, 'order': order, 'asc': asc, 'link': link,
                     'dealed': 0, 'type': type, 'source': source,
                     'date': datetime.now()}
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypagelink']
    queryArgs = {'link': link}
    # Deduplicate on the link. count() was removed in PyMongo 4.x;
    # count_documents() is the supported replacement.
    link_count = collection.count_documents(queryArgs)
    if link_count == 0:
        collection.insert_one(insert_record)
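As a usage sketch, queuing the first page of a category listing might look like this (the URL and field values are illustrative only, not taken from the original post):

# Hypothetical call: queue page 1 of a Boohee category listing.
insertCategoryPageLink(
    page=1,
    order='id',                                         # illustrative sort field
    asc=1,
    link='http://www.boohee.com/food/group/1?page=1',   # illustrative URL
    type='category',
    source='boohee',
)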

def findNoDealed():
    # Return every category page link that has not been processed yet.
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypagelink']
    queryArgs = {'dealed': 0}
    return list(collection.find(queryArgs))


def findNoDealedLimit(skip, limit):
    # Page through unprocessed category page links in fixed-size batches.
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypagelink']
    queryArgs = {'dealed': 0}
    return list(collection.find(queryArgs).skip(skip).limit(limit))


def findNoDealedRawLimit(skip, limit):
    # Page through unparsed raw pages in fixed-size batches.
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypageraw']
    queryArgs = {'dealed': 0}
    return list(collection.find(queryArgs).skip(skip).limit(limit))


def countNoDealedPageRaw():
    # Count raw pages still waiting to be parsed. count() was removed in
    # PyMongo 4.x; count_documents() is the supported replacement.
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypageraw']
    queryArgs = {'dealed': 0}
    return collection.count_documents(queryArgs)
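countNoDealedPageRaw and findNoDealedRawLimit pair naturally into a batch-processing loop. A minimal sketch, assuming a hypothetical parsePage helper for the actual parsing step; note that skip stays at 0, because dealCategoryPageRaw (defined below) removes each processed document from the 'dealed': 0 filter, and advancing skip would silently jump over pending documents:

BATCH = 100  # arbitrary batch size

def drainRawQueue():
    # Sketch: repeatedly take the first BATCH unparsed raw pages until
    # none remain. skip stays 0 because marking a page as dealt removes
    # it from the filter.
    while countNoDealedPageRaw() > 0:
        for raw in findNoDealedRawLimit(0, BATCH):
            parsePage(raw['content'])         # hypothetical parsing hook
            dealCategoryPageRaw(raw['link'])  # defined below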


def findNoDealedLinks():
    # Collect just the URLs of the unprocessed category page links.
    barcodes = [x['link'] for x in findNoDealed()]
    print(barcodes)
    return barcodes

def insertCategoryPageRaw(link, content, type, source):
    # Archive the raw HTML of a fetched page so parsing can happen later,
    # decoupled from the download step.
    insert_record = {'content': content, 'link': link, 'dealed': 0,
                     'type': type, 'source': source, 'date': datetime.now()}
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypageraw']
    collection.insert_one(insert_record)
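A hedged sketch of the fetch side that feeds this function; requests is an assumption here, as it is not imported anywhere in the post:

import requests  # assumed HTTP client, not part of the original module

def fetchAndStore(link):
    # Download one category page and archive its raw HTML for later parsing.
    resp = requests.get(link, timeout=10)
    resp.raise_for_status()
    insertCategoryPageRaw(link, resp.text, 'category', 'boohee')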

def dealCategoryPagelink(link):
    # Mark a category page link as processed ('dealed' = 1).
    filterArgs = {'link': link}
    updateArgs = {'$set': {'dealed': 1}}
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypagelink']
    collection.update_many(filter=filterArgs, update=updateArgs)


def dealCategoryPageRaw(link):
    # Mark a raw page as parsed ('dealed' = 1).
    filterArgs = {'link': link}
    updateArgs = {'$set': {'dealed': 1}}
    client = initMongoClient()
    db = client['mydb_food']
    collection = db['categorypageraw']
    collection.update_many(filter=filterArgs, update=updateArgs)
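Putting the pieces together, a minimal end-to-end sketch of one crawl pass, assuming the hypothetical fetchAndStore helper above:

def crawlPendingLinks():
    # Sketch: archive the HTML of every queued link, then flag the link
    # so the next run skips it.
    for link in findNoDealedLinks():
        fetchAndStore(link)
        dealCategoryPagelink(link)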