from bs4 import BeautifulSoup
import re
def get_html(urls, timeout=10):
    """Fetch one detail page and return its fields as the tail of a SQL VALUES row.

    The page is downloaded, decoded as GBK and parsed with ``html.parser``.
    The text of the first ``div.weizhi``, ``div.zy`` and ``div.md`` elements
    is extracted and joined into a fragment of the form::

        <weizhi>', '<zy>', '<md>'),\\n

    The fragment deliberately starts *inside* an already opened single-quoted
    literal: the caller is expected to have written the opening quote.

    Args:
        urls: URL of the detail page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a single stalled connection blocks forever.

    Returns:
        The SQL fragment described above, as a ``str``.

    Raises:
        requests.RequestException: If the download fails or times out.
        IndexError: If any of the three expected ``div`` elements is missing.
    """
    req = requests.get(url=urls, timeout=timeout)
    # The response encoding is forced rather than trusted from the headers.
    req.encoding = 'gbk'
    bf = BeautifulSoup(req.text, 'html.parser')

    fl = bf.find_all('div', class_='weizhi')
    title = bf.find_all('div', class_='zy')
    texts = bf.find_all('div', class_='md')

    def _sql_escape(value):
        # Double embedded single quotes so scraped text cannot terminate the
        # surrounding SQL string literal early.
        return value.replace("'", "''")

    fields = (
        # A run of eight non-breaking spaces is turned into a blank line.
        _sql_escape(fl[0].text.replace('\xa0' * 8, '\n\n')),
        _sql_escape(title[0].text),
        _sql_escape(texts[0].text),
    )
    return "', '".join(fields) + "'),\n"
def list(server, target, timeout=10, out_path='F:\\python\\my3.sql'):
    """Scrape one listing page and append a SQL row for each linked entry.

    The listing page at ``target`` is downloaded, decoded as GBK and parsed.
    For every ``<li>`` inside the ``<ul>`` of the first ``div.list`` element,
    the link text and the result of ``get_html(server + href)`` are combined
    into one ``(NULL,'<name>', '...`` row fragment and appended to
    ``out_path``. Entries that fail for any reason are reported on stdout and
    skipped, so one bad entry does not abort the rest of the page.

    NOTE: the function name shadows the ``list`` builtin; it is kept
    unchanged so existing callers continue to work.

    Args:
        server: Prefix prepended to each entry's ``href`` to form its URL.
        target: URL of the listing page.
        timeout: Seconds to wait for the listing-page request before
            ``requests`` gives up.
        out_path: File the SQL rows are appended to (UTF-8).

    Raises:
        requests.RequestException: If the listing page cannot be downloaded.
        IndexError: If the page has no ``div.list`` element.
        AttributeError: If that element contains no ``<ul>``.
    """
    req = requests.get(url=target, timeout=timeout)
    req.encoding = 'gbk'
    div_bf = BeautifulSoup(req.text, 'html.parser')
    div = div_bf.find_all('div', class_='list')
    # Search the tag directly; serialising and re-parsing it is unnecessary.
    uls = div[0].find('ul')

    for link in uls.find_all('li'):
        try:
            name = link.a.string
            text = get_html(server + link.a.get('href'))
            # Double single quotes so the name cannot break the SQL literal.
            record = '(NULL,\'' + name.replace("'", "''") + '\', \'' + text
            # Open per record inside a context manager: the handle is always
            # closed and each row is flushed to disk before the next download.
            with open(out_path, 'a+', encoding='utf-8') as f:
                f.write(record)
            print('ok')
        except Exception:
            # Best-effort per entry, but unlike a bare ``except`` this lets
            # KeyboardInterrupt / SystemExit stop the crawl.
            print('采集失败')
    print('采集完成')
if __name__ == "__main__":
    # Walk listing pages 495 through 1889 (the range end is exclusive),
    # scraping each one in turn and reporting progress after every page.
    site_root = 'http://www.cmiyu.com/'
    for num in range(495, 1890):
        page_url = ''.join(('http://www.cmiyu.com/new/list_81_', str(num), '.html'))
        list(site_root, page_url)
        print('采集第',num,'页完成')