正则表达式在线检测工具:
进度:
1.获取网页HTML源代码
def getHtml(url):
    """Download ``url`` and return the raw response body.

    Args:
        url: Address to fetch; passed unchanged to
            ``urllib.request.urlopen``.

    Returns:
        The undecoded response body (``bytes``) on success, or the empty
        string ``''`` when the page could not be fetched.  The ``''``
        sentinel is kept for backward compatibility with existing callers.
    """
    # Imported locally so this function does not depend on a file-level
    # import of http.client.
    import http.client

    try:
        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(url) as page:
            html = page.read()
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/HTTPError and socket failures, ValueError
        # covers malformed URLs, HTTPException covers protocol errors.
        # KeyboardInterrupt and SystemExit are deliberately NOT swallowed.
        print("failed to geturl")
        return ''
    return html
2.爬取书名
def getTitle(html):
    """Extract the book titles from the HTML of one list page.

    Every ``<a href="https..." ... target="_blank">`` anchor is matched and
    its inner text collected.  Anchors whose inner text contains ``"img"``
    (links wrapping an image rather than a text title) are ignored.

    The module-level counter ``topnum`` is advanced once for every text
    anchor seen; an anchor is dropped whenever ``topnum`` is a multiple of
    26 at that moment.

    Args:
        html: Decoded page source (``str``).

    Returns:
        List of title strings exactly as captured.  Surrounding whitespace
        and newlines are NOT stripped.
    """
    global topnum
    anchors = re.findall(
        r'<a href="https.*?".*?target="_blank">(.*?)</a>', html, re.S)
    newNameList = []
    for item in anchors:
        if "img" in item:
            # Image link, not a text title.
            continue
        # NOTE(review): presumably the 26th text link of each page is not a
        # book (the list shows 25 books per page) - TODO confirm.
        if topnum % 26 != 0:
            newNameList.append(item)
        # NOTE(review): the increment is assumed to run once per text
        # anchor, whether or not the anchor was kept - TODO confirm.
        topnum += 1
    return newNameList
3.爬取图片
def getImg(html):
    """Extract the book-cover image URLs from the HTML of one list page.

    Every ``img ... width= ... src="http..."`` occurrence is matched and the
    ``src`` URL captured.  URLs containing any of the substrings ``js``,
    ``css``, ``dale``, ``icon`` or ``png`` are discarded (presumably site
    assets rather than covers - TODO confirm).

    Args:
        html: Decoded page source (``str``).

    Returns:
        List of the remaining image URLs, in document order.
    """
    # A URL containing any of these substrings is not treated as a cover.
    excluded = ("js", "css", "dale", "icon", "png")
    imgList = re.findall(r'img.*?width=.*?src="(http.*?)"', html, re.S)
    return [
        link for link in imgList
        if not any(marker in link for marker in excluded)
    ]
4.翻页
# Walk through the list page by page; the ``start`` offset advances by 25
# (one page of results) on every iteration.
for page in range(0, 450, 25):
    url = "https://www.douban.com/doulist/1264675/?start={}".format(page)
    html = getHtml(url)
    if not html:
        # getHtml() returns '' on failure, which has no .decode(), so the
        # check must happen BEFORE decoding.  append() stores one 'none'
        # placeholder per list; extend('none') would add 'n', 'o', 'n', 'e'
        # as four separate entries.
        namesUrl.append('none')
        imgsUrl.append('none')
        scoresUrl.append('none')
        commentsUrl.append('none')
        introductionsUrl.append('none')
    else:
        html = html.decode("UTF-8")
        namesUrl.extend(getTitle(html))
        imgsUrl.extend(getImg(html))
        scoresUrl.extend(getScore(html))
        commentsUrl.extend(getComment(html))
        introductionsUrl.extend(getDetail(html))
目前已完成以上模块。