tags and collects them into a list.
soup = BeautifulSoup(driver.page_source, 'html.parser')
articles = soup.find_all('div', class_='kr-shadow-content')
We then loop over the list and use the find method to pull out each piece of information we need. The get_text() method extracts the text inside a tag, so we first locate the title and summary tags with find, then call get_text() to extract the text.
Getting the link to an article is a little more roundabout, because the tag does not carry the detail-page URL directly. Opening an article and inspecting it shows that the detail page's address is simply the 36kr domain with a string of digits taken from the card appended, so we use article.find("a")['href'] to get that path and prepend the 36kr address, 36kr.com/.
for article in articles:
    title = article.find("a", class_="article-item-title").get_text()
    link = "https://www.36kr.com" + str(article.find("a", class_="article-item-pic").get("href"))
    summary = article.find("a", class_="article-item-description ellipsis-2").get_text()
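String concatenation works as long as the href is a relative path; urllib.parse.urljoin is a slightly more forgiving way to build the absolute link. A minimal sketch, assuming the card's <a class="article-item-pic"> carries a relative href such as /p/<digits>:

from urllib.parse import urljoin

base = "https://www.36kr.com"
href = article.find("a", class_="article-item-pic").get("href")  # assumed to look like "/p/1234567"
link = urljoin(base, href)  # handles both relative paths and already-absolute URLs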
Now we can scrape the information we want. The next step is to store it; we put the storage logic and the data extraction into the same function.
def save_page(driver):
    n = 1  # current row
    wb = xlwt.Workbook(encoding="utf-8", style_compression=0)  # create the workbook
    sheet = wb.add_sheet("新闻", cell_overwrite_ok=True)  # create the news sheet
    # write the header row
    sheet.write(0, 0, "序号")
    sheet.write(0, 1, "名称")
    sheet.write(0, 2, "链接")
    sheet.write(0, 3, "简介")
    # scrape the data
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    articles = soup.find_all('div', class_='kr-shadow-content')
    for article in articles:
        title = article.find("a", class_="article-item-title").get_text()
        link = "https://www.36kr.com" + article.find("a")['href']
        summary = article.find("a", class_="article-item-description ellipsis-2").get_text()
        # write one row per article
        sheet.write(n, 0, n)
        sheet.write(n, 1, title)
        sheet.write(n, 2, link)
        sheet.write(n, 3, summary)
        n = n + 1
    wb.save("36k_news.xls")  # save the workbook
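To sanity-check the export, the workbook can be read back with xlrd, which still supports the legacy .xls format. A hypothetical check, not part of the original script:

import xlrd

book = xlrd.open_workbook("36k_news.xls")
sheet = book.sheet_by_index(0)
print(sheet.nrows)          # header row plus one row per article
print(sheet.row_values(1))  # first data row: [1.0, title, link, summary]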
Finally, here is the complete code. A version with a search feature is also available on GitHub: github.com/zhangaynami…
import time
import xlwt
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

page_num = 3  # how many pages of results to load


def get_article_info(url):
    driver.get(url)
    time.sleep(2)  # wait for the page to load
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # scroll to the bottom
    time.sleep(2)  # wait for the page to load
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # scroll to the bottom
    time.sleep(2)  # wait for the page to load
    pages(driver, page_num)  # load the remaining pages
    save_page(driver)  # store the data


def save_page(driver):
    n = 1  # current row
    wb = xlwt.Workbook(encoding="utf-8", style_compression=0)  # create the workbook
    sheet = wb.add_sheet("新闻", cell_overwrite_ok=True)  # create the news sheet
    # write the header row
    sheet.write(0, 0, "序号")
    sheet.write(0, 1, "名称")
    sheet.write(0, 2, "链接")
    sheet.write(0, 3, "简介")
    # scrape the data from the fully loaded page
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    articles = soup.find_all('div', class_='kr-shadow-content')
    for article in articles:
        title = article.find("a", class_="article-item-title").get_text()
        link = "https://www.36kr.com" + str(article.find("a", class_="article-item-pic").get("href"))
        summary = article.find("a", class_="article-item-description ellipsis-2").get_text()
        # write one row per article
        sheet.write(n, 0, n)
        sheet.write(n, 1, title)
        sheet.write(n, 2, link)
        sheet.write(n, 3, summary)
        n = n + 1
    wb.save("36k_news.xls")  # save the workbook


def pages(driver, page_num):
    i = 1
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # scroll to the bottom
        time.sleep(2)  # wait for the page to load
        if i == page_num:
            break
        else:
            if_continue = driver.find_element_by_class_name('kr-loading-more-button')
            if if_continue.text == '查看更多':
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # scroll to the bottom
                next_page = driver.find_element_by_class_name('kr-loading-more-button')
                next_page.click()  # click "查看更多" to load the next page
            else:
                print("没有更多了")
        i += 1


if __name__ == "__main__":
    driver = webdriver.Chrome(ChromeDriverManager().install())
    url = 'https://www.36kr.com/information/web_news/latest'
    get_article_info(url)
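The listing uses the Selenium 3 style API. Under Selenium 4, find_element_by_class_name has been removed and the driver path is passed through a Service object, so the affected lines would look roughly like this (a sketch, assuming Selenium 4 and webdriver_manager are installed):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
if_continue = driver.find_element(By.CLASS_NAME, 'kr-loading-more-button')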