Python 爬虫采集谜语

编程记录 · 2019-02-18
import re

import requests  # used throughout but was missing from the original imports
from bs4 import BeautifulSoup
def get_html(urls):
    """Fetch one riddle detail page and format its fields as a SQL row tail.

    Parameters
    ----------
    urls : str
        Absolute URL of a riddle detail page on cmiyu.com.

    Returns
    -------
    str
        ``"category', 'summary', 'body'),\\n"`` — the tail of one SQL VALUES
        row; the caller in ``list`` prepends ``"(NULL,'name', '"``.
    """
    # timeout so one dead page cannot hang the whole crawl; raise_for_status
    # so HTTP error pages are not parsed as riddles.
    req = requests.get(url=urls, timeout=10)
    req.raise_for_status()
    req.encoding = 'gbk'  # the site serves GBK-encoded pages
    bf = BeautifulSoup(req.text, 'html.parser')
    fl = bf.find_all('div', class_='weizhi')   # breadcrumb / category line
    title = bf.find_all('div', class_='zy')    # riddle summary
    texts = bf.find_all('div', class_='md')    # riddle body
    # NOTE(review): field text is interpolated into SQL unescaped — a riddle
    # containing a single quote will produce broken SQL. Consider doubling
    # quotes or switching to parameterized inserts.
    # '\xa0' * 8 is the run of non-breaking spaces the site uses as a separator.
    return (fl[0].text.replace('\xa0' * 8, '\n\n')
            + '\', \'' + title[0].text
            + '\', \'' + texts[0].text + '\'),\n')
def list(server, target):
    """Scrape one riddle list page and append each riddle to the SQL file.

    NOTE(review): the name shadows the builtin ``list``; kept unchanged
    because it is the public entry point called from ``__main__`` — consider
    renaming to ``scrape_list`` in a coordinated change.

    Parameters
    ----------
    server : str
        Site root (e.g. ``'http://www.cmiyu.com/'``), joined with each
        item's relative href.
    target : str
        Absolute URL of one list page.
    """
    req = requests.get(url=target, timeout=10)  # timeout: don't hang on a dead page
    req.encoding = 'gbk'  # the site serves GBK-encoded pages
    div_bf = BeautifulSoup(req.text, 'html.parser')
    div = div_bf.find_all('div', class_='list')
    uls = BeautifulSoup(str(div[0]), 'html.parser').find('ul')
    # Fix: the original reopened the file once per riddle and never closed
    # any handle. Open once per page and close deterministically.
    with open('F:\\python\\my3.sql', 'a+', encoding='utf-8') as f:
        for link in uls.find_all('li'):
            try:
                name = link.a.string
                text = get_html(server + link.a.get('href'))
                f.write('(NULL,\'' + name + '\', \'' + text)
                print('ok')
            except Exception as e:
                # Narrowed from a bare except (which also swallowed
                # KeyboardInterrupt); keep the best-effort behavior but
                # surface what failed.
                print('采集失败', e)
    print('采集完成')
if __name__ == "__main__":
    # Walk the riddle listing, pages 495..1889 inclusive.
    for page_no in range(495, 1890):
        page_url = 'http://www.cmiyu.com/new/list_81_' + str(page_no) + '.html'
        list('http://www.cmiyu.com/', page_url)
        print('采集第', page_no, '页完成')
Theme Jasmine by Kent Liao
辽ICP备2021009421号-2