Python语言技术文档

微信小程序技术文档

PHP语言技术文档

JSP语言技术文档

ASP语言技术文档

C#/.NET语言技术文档

HTML5/CSS技术文档

JavaScript

点击排行

您现在的位置:首页 > 技术文档 > Python网络爬虫

下载糗事百科的内容_python版

来源:中文源码网    浏览:170 次    日期:2024-05-10 21:22:24
【下载文档:  下载糗事百科的内容_python版.txt 】


下载糗事百科的内容_python版
# Copy code -- the code is as follows:
# coding: utf-8
import urllib.request
import xml.dom.minidom
import sqlite3
import threading
import time


class logger(object):
    """Minimal console logger shared by every component of the crawler."""

    def log(self, *msg):
        """Print each positional argument on its own line."""
        for item in msg:
            print(item)


Log = logger()
Log.log('测试下')


class downloader(object):
    """Fetch the raw body of a single URL."""

    def __init__(self, url):
        self.url = url

    def download(self):
        """Return the response body as bytes, or None when the fetch fails.

        Failures are logged rather than raised: callers treat a falsy
        result as "nothing to parse".
        """
        Log.log('开始下载', self.url)
        try:
            # The context manager closes the response even if read() fails.
            with urllib.request.urlopen(self.url) as response:
                content = response.read()
        except Exception as err:
            # Best-effort boundary: any fetch error means "no content", but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            Log.log('下载出错', err)
            return None
        Log.log('下载完毕')
        return content


# Characters stripped from both ends of a link's href to obtain the ID.
# The lengths match '/articles/' and '.htm'; presumably hrefs look like
# '/articles/<id>.htm' -- verify against the live markup.
_HREF_PREFIX_LEN = len('/articles/')
_HREF_SUFFIX_LEN = len('.htm')


class parser(object):
    """Extract the text and the neighbouring article IDs from one page."""

    def __init__(self, content):
        # The page is parsed as XML, so it must be well-formed;
        # xml.dom.minidom raises ExpatError otherwise.
        self.html = xml.dom.minidom.parseString(content)

    def parse(self):
        """Return ``{'content': str, 'url': [id, ...]}`` for the page.

        'content' is the leading text of the last ``<div class="content">``
        that has any; 'url' holds one ID per ``<span>`` whose direct parent
        is an ``<a>`` element, in document order.
        """
        Log.log('开始提取数据')
        contents = {'content': '', 'url': []}

        for div in self.html.getElementsByTagName('div'):
            # getAttribute() returns '' when the attribute is absent.
            if div.getAttribute('class') != 'content':
                continue
            # An empty div has no first child and an element child has no
            # .data; both are skipped instead of raising.
            text = getattr(div.firstChild, 'data', None)
            if text is not None:
                contents['content'] = text

        # Spans wrapped in links are the previous / next article links.
        for span in self.html.getElementsByTagName('span'):
            link = span.parentNode
            # The Document node (parent of a root <span>) has no tagName.
            if getattr(link, 'tagName', None) != 'a':
                continue
            href = link.getAttribute('href')
            contents['url'].append(href[_HREF_PREFIX_LEN:-_HREF_SUFFIX_LEN])

        Log.log('提取数据完毕')
        return contents


def downloadPage(qid, db):
    """Download article *qid*, then store its text and linked IDs in *db*.

    The article is marked complete (status 2) only when exactly two
    neighbouring links were found on the page.
    """
    url = 'http://www.qiushibaike.com/articles/' + str(qid) + '.htm'
    content = downloader(url).download()
    if not content:
        return
    contents = parser(content).parse()
    if contents['content']:
        db.updateContent(qid, contents['content'])
    for linked_id in contents['url']:
        db.addQID(linked_id)
    if len(contents['url']) == 2:
        db.updateStatus(qid, 2)


# Download pool: bounds the number of links downloaded at the same time.
class downloaderPool(object):
    """Run up to ``maxLength`` page downloads concurrently."""

    def __init__(self, maxLength=15):
        # One slot per allowed worker; None marks a free slot.
        self.downloaders = [None] * maxLength
        self.downloadList = []
        self.db = None

    def setDownloadList(self, downloadList):
        """Merge *downloadList* into the pending IDs, dropping duplicates."""
        self.downloadList = list(set(self.downloadList + downloadList))

    def setdb(self, db):
        self.db = db

    def daemon(self):
        """Run one scheduling pass: reap, start a batch, wait for it.

        Finished workers are cleared from their slots, every free slot is
        given a pending ID, and the pass then blocks until the workers it
        started have finished.
        """
        Log.log('设置守护进程')
        for index, worker in enumerate(self.downloaders):
            # Thread.isAlive() was removed in Python 3.9; is_alive() is the
            # supported spelling.
            if worker and not worker.is_alive():
                Log.log('将下载器置空', index)
                self.downloaders[index] = None

        started = []
        for index, worker in enumerate(self.downloaders):
            if worker:
                continue
            qid = self.getQID()
            if qid:
                t = threading.Thread(target=downloadPage,
                                     args=(qid, self.db))
                self.downloaders[index] = t
                t.start()
                started.append(t)
                Log.log('设置下载器', index)

        # Join only after the whole batch is running; joining each thread
        # right after start() made the pool strictly sequential.
        for t in started:
            t.join()

        # Pause one second before returning to the caller.
        time.sleep(1)

    def getQID(self):
        """Pop and return the next pending ID, or None when none is left."""
        if not self.downloadList:
            return None
        return self.downloadList.pop(0)

    def beginDownload(self):
        """Run one scheduling pass on a daemon thread and wait for it."""
        daemon = threading.Thread(target=self.daemon)
        daemon.daemon = True  # setDaemon() is deprecated
        daemon.start()
        daemon.join()

    def getDownloader(self):
        """Return the index of the first free slot, or None if all are busy."""
        for index, worker in enumerate(self.downloaders):
            if not worker:
                return index
        return None


ADD_Q_ID = 'insert into qiushibaike(id,success) values(?,?)'
UPDATE_Q_CONTENT = 'update qiushibaike set content=? where id=?'
UPDATE_Q_STATUS = 'update qiushibaike set success=? where id=?'
Q_LIST = 'select id from qiushibaike where success=?'
Q_LIST_BY_ID = 'select count(*) from qiushibaike where id=?'
class dbConnect(object):
    """SQLite-backed store for crawled articles.

    Expected table (as sketched by the original author):

        create table qiushibaike(
            id       Integer,
            content  Varchar,
            success  Integer
        )

    id      -- ID of the article
    content -- text of the article
    success -- download state: 1 = not finished, 2 = finished (the text was
               stored and both the previous and next article IDs were found)

    The table is not created here; it must already exist in ``dbpath``.
    Every call opens its own short-lived connection.
    """

    def __init__(self, dbpath='db.sqlite'):
        self.dbpath = dbpath

    def _run(self, sql, params, fetch=False):
        """Execute one statement on a fresh connection.

        Returns all rows when *fetch* is true; otherwise commits and
        returns None. The connection is closed even if the statement
        raises.
        """
        cn = sqlite3.connect(self.dbpath)
        try:
            c = cn.cursor()
            c.execute(sql, params)
            if fetch:
                return c.fetchall()
            cn.commit()
            return None
        finally:
            cn.close()

    def addQID(self, qid):
        """Insert *qid* as not-yet-downloaded; failures are logged only."""
        Log.log('插入糗事百科', qid)
        try:
            self._run(ADD_Q_ID, (qid, 1))
        except sqlite3.Error:
            Log.log('添加ID出错', qid)
        else:
            # Report success only when the insert really happened.
            Log.log('插入成功')

    def updateContent(self, qid, content):
        """Store *content* as the text of article *qid*."""
        Log.log('更新糗事百科', qid, content)
        self._run(UPDATE_Q_CONTENT, (content, qid))
        Log.log('更新成功')

    def updateStatus(self, qid, flag):
        """Set the download state of article *qid* to *flag*."""
        Log.log('更新状态', qid, flag)
        self._run(UPDATE_Q_STATUS, (flag, qid))
        Log.log('更新状态成功')

    def getList(self, unDonloaded=1):
        """Return the IDs whose download state equals *unDonloaded*."""
        Log.log('获得列表')
        rows = self._run(Q_LIST, (unDonloaded,), fetch=True)
        ids = [row[0] for row in rows]
        Log.log('获得列表成功')
        return ids


class singleDownloader(object):
    """Sequential (single-threaded) counterpart of the download pool."""

    def __init__(self):
        self.downloadList = []
        # Set explicitly so beginDownload() never hits a missing attribute.
        self.db = None

    def setdb(self, db):
        self.db = db

    def setDownloadList(self, downloadList):
        """Merge *downloadList* into the pending IDs, dropping duplicates."""
        self.downloadList = list(set(self.downloadList + downloadList))

    def beginDownload(self):
        """Download every pending ID once, leaving the pending list empty."""
        # Take ownership of the batch first: previously the list was never
        # drained, so finished IDs were merged back in and downloaded again
        # on every later call.
        pending, self.downloadList = self.downloadList, []
        for qid in pending:
            downloadPage(qid, self.db)


def main():
    """Keep downloading until no article is left in the unfinished state."""
    db = dbConnect('db.sqlite')
    # NOTE: the original kept a commented-out downloaderPool() here as the
    # threaded alternative to singleDownloader().
    sp = singleDownloader()
    sp.setdb(db)
    dp = sp
    unDownloadedList = db.getList()
    # Keep going while there are articles that are not downloaded yet.
    while unDownloadedList:
        # Feed the current backlog to the downloader.
        dp.setDownloadList(unDownloadedList)
        dp.beginDownload()
        time.sleep(1)
        # Refresh the backlog from the database.
        unDownloadedList = db.getList()


if __name__ == '__main__':
    main()

# Author's note: the code works and runs as-is, but two improvements are
# wanted: 1) multi-threaded downloading, 2) better separation of concerns
# and a more object-oriented design.

相关内容