1. 用 requests 库和 BeautifulSoup 库,爬取校园新闻首页每条新闻的标题、链接、正文和 show-info。
2. 分析 show-info 字符串,获取每篇新闻的发布时间、作者、来源、摄影等信息。
"""Scrape the campus-news index page and print each article's show-info fields.

For every list item on the index page that carries a ``.news-list-title``
element, the linked article is downloaded and the text of its ``.show-info``
element is split into labelled fields, which are printed one per line.
"""
import re
from urllib.parse import urljoin

NEWS_URL = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
REQUEST_TIMEOUT = 10  # seconds; keeps a stalled server from hanging the script

# Field labels looked for inside the show-info text. Each label must be
# followed by a colon (ASCII or full-width) to count as a field start.
_INFO_LABEL_RE = re.compile(r'(发布时间|作者|审核|来源|摄影|点击)\s*[::]')


def parse_info(info):
    """Split a show-info string into a ``{label: value}`` dict.

    Fields are located by their labels rather than by fixed character
    offsets, so the result does not depend on how long each value is.

    Args:
        info: Raw text of the ``.show-info`` element.

    Returns:
        A dict mapping every label found in *info* to its value with
        surrounding whitespace removed. Labels that do not occur are absent;
        an input without any label yields an empty dict.
    """
    matches = list(_INFO_LABEL_RE.finditer(info))
    fields = {}
    # A value runs from the end of its own label up to the start of the next
    # label, or to the end of the string for the last field.
    for current, following in zip(matches, matches[1:] + [None]):
        value_end = following.start() if following is not None else len(info)
        fields[current.group(1)] = info[current.end():value_end].strip()
    return fields


def fetch_soup(url):
    """Download *url* and return its HTML parsed with ``html.parser``.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    # Imported here so that parse_info stays importable on machines that do
    # not have the scraping dependencies installed.
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def main():
    """Print the URL and the parsed show-info fields of every listed article."""
    index_page = fetch_soup(NEWS_URL)
    for item in index_page.select('li'):
        links = item.select('a')
        # Only list items carrying a news title (and a link) are articles.
        if not item.select('.news-list-title') or not links:
            continue

        # urljoin leaves absolute links untouched and resolves relative ones.
        url = urljoin(NEWS_URL, links[0].attrs['href'])
        print(url)

        info_nodes = fetch_soup(url).select('.show-info')
        if not info_nodes:
            # Article page without a show-info element: nothing to report.
            continue
        for label, value in parse_info(info_nodes[0].text).items():
            print(f'{label}:{value}')


if __name__ == '__main__':
    main()