Python爬取新闻语料并用bs4库解析DOM树

Planeswalker23 2018年05月01日 共5,307字 124次浏览

背景

在做推荐系统的时候需要训练集,所以就自己写了一个爬虫,然后用 bs4 库对爬取的 html 数据处理得到新闻的 URL 链接。

get_url() 函数是用来爬取数据并处理 html 内容的,其余的函数是针对特定类别的新闻数据而写的。

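以下是源代码(基于 Python 2 编写;若在 Python 3 下运行,需将 `from urllib import urlopen` 改为 `from urllib.request import urlopen`):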

# coding=utf-8
from urllib import urlopen
from bs4 import BeautifulSoup

# Host prefixes for site-relative people.com.cn links, selected by a keyword
# contained in the output file name. Checked in order; the first match wins.
_PEOPLE_PREFIXES = (
    ('sports', 'http://sports.people.com.cn'),
    ('science', 'http://scitech.people.com.cn'),
    ('culture', 'http://culture.people.com.cn'),
    ('film', 'http://ent.people.com.cn'),
    ('game', 'http://game.people.com.cn'),
    ('education', 'http://edu.people.com.cn/GB'),
)


def _resolve_link(address_url, doc):
    """Return the absolute URL to record for address_url, or None to skip it."""
    if 'http' in address_url:
        # Absolute links are only kept when they point at ifeng or sina.
        if 'ifeng' in address_url or 'sina' in address_url:
            return address_url
        return None
    if 'n1' in address_url:
        for keyword, prefix in _PEOPLE_PREFIXES:
            if keyword in doc:
                return prefix + address_url
    # Neither an accepted absolute link nor a relative link with a known
    # category: nothing sensible to write.
    return None


def get_url(url, doc, label, name):
    """Fetch a news list page and append the article links it holds to doc.

    The page at ``url`` is parsed with BeautifulSoup. For every element
    returned by ``find_all(label, name)`` the ``href`` of its first ``<a>``
    is examined and, if accepted, written to ``doc`` on its own line:

    * a link containing ``'http'`` is written unchanged, but only when it
      also contains ``'ifeng'`` or ``'sina'``;
    * otherwise a link containing ``'n1'`` is prefixed with the
      people.com.cn host selected by the category keyword found in ``doc``;
    * every other link is ignored.

    Entries without an ``<a>`` tag or without an ``href`` are skipped, and a
    relative link is skipped when ``doc`` matches no known category.

    Args:
        url: Address of the list page to download.
        doc: Path of the output file, opened in append mode. Its name also
            selects the host prefix for relative links.
        label: Tag name passed to ``find_all``.
        name: Second positional argument passed to ``find_all``; the callers
            in this file pass a CSS class string or ``''``.

    Returns:
        None.
    """
    html = urlopen(url)
    try:
        bs_obj = BeautifulSoup(html.read(), 'html.parser')
    finally:
        # Release the connection even if reading or parsing fails.
        html.close()

    # Append mode: new links go to the end of doc.
    with open(doc, 'a') as f:
        for address in bs_obj.find_all(label, name):
            anchor = address.find('a')
            if anchor is None:
                continue
            address_url = anchor.get('href')
            if not address_url:
                continue
            link = _resolve_link(address_url, doc)
            if link is not None:
                f.write(link)
                f.write('\n')
    return

#教育
def education():
    """Crawl the edu.people.com.cn list pages into education.txt.

    For each section id, pages index1.html through index7.html are passed to
    get_url, which appends the links found in the page's <li> elements.
    """
    section_ids = ('208610', '227065', '227057', '226718', '367001')
    last_page = 7
    for section_id in section_ids:
        prefix = 'http://edu.people.com.cn/GB/' + section_id + '/index'
        for page_no in range(1, last_page + 1):
            url = prefix + str(page_no) + '.html'
            print(url)
            get_url(url, 'education.txt', 'li', '')

#游戏
def game():
    """Crawl the game.people.com.cn list pages into game.txt.

    For each section id, pages index1.html through index10.html are passed to
    get_url, which appends the links found in the page's <li> elements.
    """
    section_ids = ('163384', '48661', '48662')
    last_page = 10
    for section_id in section_ids:
        prefix = 'http://game.people.com.cn/GB/' + section_id + '/index'
        for page_no in range(1, last_page + 1):
            url = prefix + str(page_no) + '.html'
            print(url)
            get_url(url, 'game.txt', 'li', '')

#影视1
def film1():
    """Crawl the ent.ifeng.com list pages for film news into film.txt.

    Pages 1 through 20 of each list id are passed to get_url, which reads the
    links from the 'box_list clearfix' <div> elements.
    """
    list_ids = ('6', '1370')
    # ifeng only allows access to the first 20 pages of a list.
    # Original note "15*20": presumably 15 links per page x 20 pages.
    page_numbers = range(1, 21)
    for list_id in list_ids:
        base = 'http://ent.ifeng.com/listpage/' + list_id + '/'
        for page_no in page_numbers:
            url = base + str(page_no) + '/list.shtml'
            print(url)
            get_url(url, 'film.txt', 'div', 'box_list clearfix')

#影视2
def film2():
    """Crawl the ent.people.com.cn list pages into film.txt.

    For each section id, pages index1.html through index7.html are passed to
    get_url, which appends the links found in the page's <li> elements.
    """
    section_ids = ('81372', '81374')
    last_page = 7
    for section_id in section_ids:
        prefix = 'http://ent.people.com.cn/GB/' + section_id + '/index'
        for page_no in range(1, last_page + 1):
            url = prefix + str(page_no) + '.html'
            print(url)
            get_url(url, 'film.txt', 'li', '')

#文化1
def culture1():
    """Crawl the culture.people.com.cn list pages into culture.txt.

    For each section path, pages index1.html through index7.html are passed
    to get_url, which appends the links found in the page's <li> elements.
    """
    section_paths = ('87423', '40473/40476', '22219', '27296')
    last_page = 7
    for section_path in section_paths:
        prefix = 'http://culture.people.com.cn/GB/' + section_path + '/index'
        for page_no in range(1, last_page + 1):
            url = prefix + str(page_no) + '.html'
            print(url)
            get_url(url, 'culture.txt', 'li', '')

#文化2
def culture2():
    """Crawl the art.people.com.cn list pages into culture.txt.

    Each section has its own number of index pages; pages index1.html up to
    that count are passed to get_url, which reads the links from the
    'hdNews clearfix' <div> elements.

    The sections are kept in an ordered tuple so the crawl order, and with it
    the order of lines appended to culture.txt, is the same on every run.
    """
    # (section path, number of index pages to crawl for that section)
    sections = (
        ('206244/356129', 6),
        ('206244/356132', 3),
        ('206254/206256', 2),
        ('206246/408046', 2),
        ('206246/408047', 5),
        ('206261', 2),
        ('206244/356130', 3),
    )
    for section_path, page_count in sections:
        prefix = 'http://art.people.com.cn/GB/' + section_path + '/index'
        for page_no in range(1, page_count + 1):
            url = prefix + str(page_no) + '.html'
            print(url)
            get_url(url, 'culture.txt', 'div', 'hdNews clearfix')

# Script entry point: runs the culture1 crawl as soon as the module is
# executed. In this listing it is the only category function that is called.
culture1()
#科技
def science():
    """Crawl the scitech.people.com.cn list pages into science.txt.

    For each section id, pages index1.html through index7.html are passed to
    get_url, which appends the links found in the page's <li> elements.
    """
    section_ids = ('25895', '1056', '41163', '25893')
    last_page = 7
    for section_id in section_ids:
        prefix = 'http://scitech.people.com.cn/GB/' + section_id + '/index'
        for page_no in range(1, last_page + 1):
            url = prefix + str(page_no) + '.html'
            print(url)
            get_url(url, 'science.txt', 'li', '')

#体育1
def sports1():
    """Crawl sports.people.com.cn list pages laid out as <div> blocks.

    For each section id, pages index1.html through index7.html are passed to
    get_url, which reads the links from the 'hdNews clearfix' <div> elements
    and appends them to sports.txt.
    """
    section_ids = ('22155', '22134', '22141', '22149')
    last_page = 7
    for section_id in section_ids:
        prefix = 'http://sports.people.com.cn/GB/' + section_id + '/index'
        for page_no in range(1, last_page + 1):
            url = prefix + str(page_no) + '.html'
            print(url)
            get_url(url, 'sports.txt', 'div', 'hdNews clearfix')

#体育2
def sports2():
    """Crawl sports.people.com.cn list pages laid out as <li> items.

    For each section id, pages index1.html through index4.html are passed to
    get_url, which appends the links found in the page's <li> elements to
    sports.txt.
    """
    section_ids = ('22176', '35862', '22172')
    last_page = 4
    for section_id in section_ids:
        prefix = 'http://sports.people.com.cn/GB/' + section_id + '/index'
        for page_no in range(1, last_page + 1):
            url = prefix + str(page_no) + '.html'
            print(url)
            get_url(url, 'sports.txt', 'li', '')

#娱乐
def entertainment():
    """Crawl the ent.ifeng.com list pages into entertainment.txt.

    Pages 1 through 20 of each list id are passed to get_url, which reads the
    links from the 'box_list clearfix' <div> elements.
    """
    list_ids = ('3', '1370', '30741', '44169')
    # Original note "15*20*4": presumably 15 links per page x 20 pages x 4 lists.
    page_numbers = range(1, 21)
    for list_id in list_ids:
        base = 'http://ent.ifeng.com/listpage/' + list_id + '/'
        for page_no in page_numbers:
            url = base + str(page_no) + '/list.shtml'
            print(url)
            get_url(url, 'entertainment.txt', 'div', 'box_list clearfix')

#社会
def society():
    """Crawl the roll.news.sina.com.cn society news lists into society.txt.

    Pages index_1.shtml through index_10.shtml of each channel are passed to
    get_url, which appends the links found in the page's <li> elements.
    """
    channels = ('zqsk', 'fz-shyf', 'qwys', 'shwx')
    # Original note "40*10*4": presumably 40 links per page x 10 pages x 4 channels.
    page_numbers = range(1, 11)
    for channel in channels:
        base = 'http://roll.news.sina.com.cn/news/shxw/' + channel + '/index_'
        for page_no in page_numbers:
            url = base + str(page_no) + '.shtml'
            print(url)
            get_url(url, 'society.txt', 'li', '')