Python 简单爬虫

编程记录 · 2023-04-25

import re
import requests
from bs4 import BeautifulSoup
import lxml

# Listing page whose entries are scraped below.
url = 'https://www.zxcs.info/sort/3/page/2'

# NOTE: despite its name, `html` holds the Response object; the markup is
# html.text. A timeout is set because requests waits indefinitely without one.
html = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
html.raise_for_status()

soup = BeautifulSoup(html.text, 'lxml')

# <a> elements that are direct children of a <dt> directly under #plist.
data = soup.select('#plist>dt>a')

print(data)

# One dict per matched link. The original loop rebound a single (misspelled)
# `resuls` dict on every iteration, so only the last entry survived.
results = [
    {
        'title': item.get_text(),
        'link': item.get('href'),
    }
    for item in data
]

def pages(link):
    """Fetch a detail page and print its name, description and download links.

    The page at ``link`` is downloaded and parsed with the lxml parser. Three
    groups of elements are selected (the ``h1`` title, the
    ``div.book-info-detail`` block, and the anchors under ``#download``) and
    each group is printed with its HTML tags removed by ``html2text``.

    Args:
        link: URL of the detail page to fetch.

    Returns:
        None. The extracted text is written to stdout.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # requests never times out on its own, so bound the wait explicitly.
    info = requests.get(link, timeout=10)
    info.raise_for_status()
    soups = BeautifulSoup(info.text, 'lxml')

    name = soups.select('body > div.wrap > div.book-detail-wrap.center990 > div.book-information.cf > div.book-info > h1')
    # "jieshao" appears to be pinyin for "introduction" (the description block).
    jieshao = soups.select('body > div.wrap > div.book-detail-wrap.center990 > div.book-content-wrap.cf > div > div.book-info-detail')
    down = soups.select('#download > a')

    # select() returns a list of tags, but html2text works on a string:
    # serialise every match to markup first, one match per line.
    for matches in (name, jieshao, down):
        markup = '\n'.join(str(tag) for tag in matches)
        print(html2text(markup))

# Compiled once at import time; matches any single HTML/XML tag.
_TAG_PATTERN = re.compile(r'<[^>]+>', re.S)


def html2text(text):
    """Return ``text`` with every HTML tag removed.

    Args:
        text: Markup to clean. A ``str`` is used as is. A list or tuple
            (for example the result of a ``select()`` call) has each item
            converted with ``str()`` and joined with newlines. Any other
            object is converted with ``str()``.

    Returns:
        The input markup as a string with all ``<...>`` tags stripped.
    """
    if isinstance(text, (list, tuple)):
        # re.sub only accepts strings, so flatten the items into markup first.
        text = '\n'.join(str(item) for item in text)
    elif not isinstance(text, str):
        text = str(text)
    return _TAG_PATTERN.sub('', text)


# The demo call lives after html2text is defined (pages() looks it up at call
# time) and is not wrapped in print(): pages() prints its own output and
# returns None, so the wrapper only emitted a stray "None".
if __name__ == '__main__':
    pages('https://www.zxcs.info/post/14929')
Theme Jasmine by Kent Liao
辽ICP备2021009421号-2