夕是四時落,阳是六時起,你之于我,就是最深最美的执念,你有没有一瞬间,心疼过我的执着
作者 时间 分类 学习 浏览 2175 评论

同学说,学爬虫不爬点小黄图,哪来的动力。
第二版明显要比第一版的优质很多。

----- 代码 ------
#coding:utf8

import re
import urllib

#读取页面的html
def getHtml(url):
    """Fetch ``url`` and return the raw response body.

    Relies on ``urllib.urlopen``, i.e. the Python 2 ``urllib`` API that this
    script is written against.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``read()`` yields for the opened URL.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Always release the connection, even if read() raises.
        page.close()

#获取页面每个图库的打开链接
def getTukuAllUrl(hmtl):
    """Return the gallery page links found in a listing page.

    Args:
        hmtl: HTML text of the listing page (the misspelled parameter name is
            kept so existing callers keep working).

    Returns:
        A list with the ``href`` value of every ``.html`` link whose tag is
        directly followed by the label "查看大图", in document order. The
        list is empty when nothing matches.
    """
    # [^"]* keeps the capture inside a single href attribute. The previous
    # `.*?` could start at an earlier, unrelated href on the same line and
    # swallow everything up to the real link.
    pattern = re.compile(r'href="([^"]*\.html)">查看大图')
    return pattern.findall(hmtl)

#打开每一个图库的链接
def openTukuAllUrl(TuKuUrllist):
    """Process every gallery link in order, numbering the galleries from 1.

    Args:
        TuKuUrllist: Iterable of gallery page URLs.
    """
    # enumerate(..., 1) replaces the hand-maintained counter; the number is
    # passed on to openTukuUrl as the gallery's sequence number.
    for a, tukuUrl in enumerate(TuKuUrllist, 1):
        openTukuUrl(tukuUrl, a)

#打开每个图库的链接
def openTukuUrl(tukuUrl, a):
    """Fetch one gallery's first page, report it, and download its images.

    Prints the sequence number, the gallery title and the image count, then
    hands over to ``getTukuImg``.

    Args:
        tukuUrl: URL of the gallery's first page.
        a: Sequence number of the gallery; printed and forwarded to
            ``getTukuImg``.

    Raises:
        ValueError: If the text extracted by ``getImgNum`` is not an integer.
    """
    tukuHtml = getHtml(tukuUrl)
    tukuName = getTukuName(tukuHtml)
    # Single-argument print(...) produces the same output whether print is
    # the Python 2 statement or the Python 3 function.
    print(a)
    print(tukuName)
    tukuNum = int(getImgNum(tukuHtml))
    print(tukuNum)
    getTukuImg(tukuHtml, tukuName, tukuNum, a)

#获取页面每个图库的名字
def getTukuName(tukuHtml):
    """Extract the gallery title from a gallery page.

    Args:
        tukuHtml: HTML text of a gallery page.

    Returns:
        The text between ``<h1>`` and the first following ``</h1></div>``.

    Raises:
        AttributeError: If the page contains no such heading, because
            ``re.search`` then returns ``None``.
    """
    # Non-greedy capture stops at the first closing "</h1></div>". The
    # greedy form ran on to the last one on the same line.
    pattern = re.compile(r'<h1>(.+?)</h1></div>')
    return pattern.search(tukuHtml).group(1)

#获取图片的数量
def getImgNum(tukuHtml):
    """Extract the image count of a gallery from its page.

    Args:
        tukuHtml: HTML text of a gallery page.

    Returns:
        The text between ``</span>/`` and the first following ``</strong>``,
        as a string. The caller converts it with ``int``.

    Raises:
        AttributeError: If the counter markup is not present, because
            ``re.search`` then returns ``None``.
    """
    # Non-greedy capture: with the greedy form, any later "</strong>" on the
    # same line was pulled into the result, which then failed int().
    pattern = re.compile(r'</span>/(.+?)</strong>')
    return pattern.search(tukuHtml).group(1)

#获取每个图库的图片
def getTukuImg(tukuHtml, tukuName, tukuNum, a):
    """Walk a gallery page by page, downloading the image on each page.

    Args:
        tukuHtml: HTML of the gallery's first image page.
        tukuName: Gallery title, forwarded to ``getTuku_ImgUrl``.
        tukuNum: Number of image pages to visit (an integer).
        a: Gallery sequence number, forwarded to ``getTuku_ImgUrl``.
    """
    for i in range(tukuNum):
        imgUrl = getTuku_ImgUrl(tukuHtml, tukuName, i, a)
        print(imgUrl)
        # Only follow the "next" link while images remain. Doing so after
        # the last image wastes a request and raises if the final page has
        # no such link.
        if i + 1 < tukuNum:
            tukuHtml = getHtml(getNextHtmlUrl(tukuHtml))

#获取页面的原图链接且下载
def getTuku_ImgUrl(imgHmtl, imgName, imgI, a):
    """Find the "view original" image link(s) on a page and download them.

    Args:
        imgHmtl: HTML text of one image page.
        imgName: Gallery title; accepted for interface compatibility but not
            used.
        imgI: Index of the image within the gallery, used in the file name.
        a: Gallery sequence number, used in the file name.

    Returns:
        The list of matched ``.jpg`` URLs (empty when the page has none).
    """
    # [^"]* keeps the capture inside one href attribute instead of letting
    # it run across earlier attributes or tags on the same line.
    pattern = re.compile(
        r'<a href="([^"]*\.jpg)" target="_blank" class="original">查看原图</a>'
    )
    imgUrl = pattern.findall(imgHmtl)
    for imgs in imgUrl:
        # `down` is the module-level download prefix. The target name is
        # "<gallery>_<index>.jpg", so several matches on one page would all
        # be written to the same file.
        urllib.urlretrieve(imgs, down + str(a) + '_' + str(imgI) + '.jpg')
    return imgUrl

#获取下一张图片的html链接
def getNextHtmlUrl(TukuImgHtml):
    """Extract the URL of the next image page from the current one.

    Args:
        TukuImgHtml: HTML text of an image page.

    Returns:
        The ``href`` of the ``.html`` link marked ``"num-next"`` and
        labelled "下一张".

    Raises:
        AttributeError: If the page has no such link, because ``re.search``
            then returns ``None``.
    """
    # [^"]* confines the capture to one href value. The previous `.*?` could
    # start at an earlier "num-next" element and run on into the real link.
    pattern = re.compile(r'"num-next" href="([^"]*\.html)">下一张')
    return pattern.search(TukuImgHtml).group(1)

def limif(url):
    """Crawl the listing page at ``url`` and process every gallery it links to.

    Args:
        url: Address of the listing page.
    """
    listing_html = getHtml(url)
    gallery_urls = getTukuAllUrl(listing_html)
    openTukuAllUrl(gallery_urls)

# Listing page whose galleries are crawled.
url = "http://www.win4000.com/meitu.html"
# Download prefix; file names are appended to it directly, so it must end
# with a path separator. Backslashes are escaped: the unescaped form
# "D:\a\" never terminates the string (the last backslash escapes the quote)
# and "\a" is the BEL control character.
down = "D:\\a\\"
limif(url)

-------- 效果 ----------

pachong1.jpg




微信公众号:八一四


上一篇: Lost control     |     下一篇: 深圳,揪心的痛



添加新评论