A Python web crawler with BeautifulSoup that scrapes jokes

BeautifulSoup download: http://www.crummy.com/software/BeautifulSoup/
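Before running the full crawler, it is worth confirming that bs4 parses markup the way you expect. The minimal sketch below is my own illustration (the sample HTML and its output are made up; it assumes beautifulsoup4 is installed, e.g. via pip install beautifulsoup4):

# Minimal parsing sketch; the sample markup below is hypothetical
from bs4 import BeautifulSoup

sample = '<div id="text110"><p>joke one</p><p>joke two</p></div>'
soup = BeautifulSoup(sample)
# find(id=...) returns the first matching tag, findAll('p') every <p> inside it
for p in soup.find(id='text110').findAll('p'):
        print p.get_text()  # prints "joke one", then "joke two", with tags stripped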

# -*- coding: utf-8 -*-

import urllib2
import re
from bs4 import BeautifulSoup

homeUrl = 'http://www.jokeji.cn/'
startUrl = 'http://www.jokeji.cn/JokeHtml/bxnn/2014113023321188.htm' # starting page
nextPageUrlReg = r'(<div class=zw_page1>[\s\S]*<a href="../../)([\w\d\/\.]+)("[\s\S]*class=zw_page2>)' # used to check whether there is a next page
page = 1          # current page counter
searchCount = 10  # maximum number of pages to crawl
lists = []        # collected joke texts

def main(url):
        searchByUrl(url)
        for v in lists:
                print v

def searchByUrl(url):
        global page
        # fetch the page's HTML
        response = urllib2.urlopen(url)
        html = response.read()

        searchItems(html)

        # if there is a next page and we are still under the page limit, crawl it recursively
        nextPage = hasNextPage(html)
        if nextPage is not None and page < searchCount:
                page = page + 1
                searchByUrl(nextPage)

def searchItems(html):
        # from_encoding="gb18030" prevents garbled Chinese text
        soup = BeautifulSoup(html, from_encoding="gb18030")
        # find the element with id "text110" and read its content
        content = soup.find(id='text110')
        if content is not None:
                # iterate over all <p> elements; content.contents() cannot be used to fetch and traverse them
                for v in content.findAll('p'):
                        # print v.get_text()
                        # get_text() strips the HTML tags and returns only the text
                        lists.append(v.get_text())

        return lists

def hasNextPage(html):
        match = re.search(nextPageUrlReg, html)
        if match is not None:
                # group(2) holds the next page's path relative to the site root
                return homeUrl + match.group(2)
        else:
                return None
        
main(startUrl)
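The script above targets Python 2 (urllib2 and the print statement). On Python 3 the same fetch step would go through urllib.request instead; the sketch below is a rough, untested port of just that step, with fetch_page as a hypothetical helper name:

# Rough Python 3 sketch of the fetch-and-parse step (urllib2 became urllib.request in Python 3)
from urllib.request import urlopen
from bs4 import BeautifulSoup

def fetch_page(url):
        html = urlopen(url).read()  # raw bytes from the server
        # decode the GB-encoded page the same way the Python 2 script does
        return BeautifulSoup(html, 'html.parser', from_encoding='gb18030')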
