from httpx import Client
from bs4 import BeautifulSoup
client = Client(http2=True, base_url="https://www.yanglao.com.cn/")
The regular expression used to extract the numeric ID from a URL:
from re import compile
get_id_reg = compile(r"\d+")
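For example, applied to a relative article URL of the form used further below (a quick illustration; the ID is the one from the example models):
print(get_id_reg.findall("/article/521502.html")[0])  # "521502", the article ID as a string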
Fetching the article list
We can just use the site's built-in pagination:
def get_articles(page: int = 1):
    """Articles under https://www.yanglao.com.cn/article"""
    dom = BeautifulSoup(client.get(f"/article_{page}").text, "lxml")
    news_list = dom.select_one("ul.news-list")
    return [
        {"articleId": int(get_id_reg.findall(li.a["href"])[0]), "title": li.a.string, "date": li.span.string}
        for li in news_list.find_all("li")
    ]
Note: for now this doesn't handle the case where the list can't be fetched.
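As a sketch of what such a guard could look like (the FastAPI version further down raises a 404 instead), a hypothetical get_articles_safe might simply return an empty list when the expected ul.news-list element is missing:
def get_articles_safe(page: int = 1):
    """Sketch only: like get_articles, but tolerates a missing article list."""
    dom = BeautifulSoup(client.get(f"/article_{page}").text, "lxml")
    news_list = dom.select_one("ul.news-list")
    if news_list is None:  # e.g. the page number is out of range
        return []
    return [
        {"articleId": int(get_id_reg.findall(li.a["href"])[0]), "title": li.a.string, "date": li.span.string}
        for li in news_list.find_all("li")
    ]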
Fetching article details
For the body, let's just keep the original HTML. As far as I remember, WeChat Mini Programs allow embedding HTML fragments.
def get_article_info(article_id: int):
    dom = BeautifulSoup(client.get(f"/article/{article_id}.html").text, "lxml")
    news_view = dom.select_one("div.news-view")
    li_source, li_hits, li_datetime = news_view.select("ul.info > li")[:3]
    return {
        "title": news_view.h1.string,
        "source": li_source.string.strip()[3:],
        "hits": int(li_hits.string.strip()[3:]),
        "datetime": li_datetime.string.strip(),
        "html": news_view.select_one("div.news-content").prettify(),
        "related": [
            {"articleId": int(get_id_reg.findall(li.a["href"])[0]), "title": li.a["title"]}
            for li in news_view.select("div.related-read li")
        ]
    }
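A minimal usage sketch (the ID comes from the example models below; network access and the site's current markup are assumed):
info = get_article_info(521502)
print(info["title"], info["datetime"], info["hits"])
print(len(info["related"]), "related articles")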
Turning it into an API
For the sake of the generated docs, I decided to switch all the return values to Pydantic models:
from pydantic import BaseModel, Field


class Article(BaseModel):
    articleId: int = Field(alias="article_id", title="文章唯一标识,用于请求文章详情")
    title: str = Field(title="文章标题")

    class Config:
        schema_extra = {"example": {"articleId": 521502, "title": "大兴区精心康复托养中心十二月康复展示"}}


class ArticleWithDate(Article):
    date: str = Field(title="文章发布日期")

    class Config:
        schema_extra = {
            "example": {"articleId": 521433, "title": "地方立法密集落地优化养老服务升级", "date": "2022-12-12"},
        }
class ArticleDetails(BaseModel):
    title: str = Field(title="文章标题")
    source: str = Field(title="文章来源", description="可能为空字符串")
    hits: int = Field(title="浏览量", description="经测试就是请求次数")
    datetime: str = Field(title="文章发布日期和时间", description="按照 yyyy-mm-dd hh:mm:ss 格式")
    html: str = Field(title="正文html", description="可能很长,可能包含图片文字甚至富文本")
    related: list[Article] = Field(title="相关文章", description="可以搞成一个列表")

    class Config:
        schema_extra = {
            "example": {
                "title": "2022天津河西区老年痴呆养老院哪家好?2022河西区老年痴呆养老院多少钱?",
                "source": "国家卫健委门户网站",
                "hits": 27,
                "datetime": "2022-12-27 15:10:26",
                "html": """\
<div class="news-content">
 位于上海市静安区保德路545号的**护理院也是<a href="https://www.yanglao.com.cn/resthome">收阿尔兹海默老人的养老机构</a>。
</div>
""",
                "related": [
                    {'articleId': 521504, 'title': '仙栖谷精神障碍托养中心告诉您精神障碍患者“阳”了怎么办?'},
                    {'articleId': 521501, 'title': '椿萱茂日间照料|家门口的健康养老很幸福'},
                    {'articleId': 521497, 'title': '2022成都青羊区老年痴呆养老院有哪些,2022青羊区认知症养老院地址'}
                ]
            }
        }
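A quick note on the alias="article_id" trick, shown as a sketch of Pydantic v1 behaviour: the model is populated through the snake_case alias, while a plain .dict() uses the camelCase field name:
article = Article(article_id=521502, title="大兴区精心康复托养中心十二月康复展示")
print(article.dict())               # {'articleId': 521502, 'title': '...'}
print(article.dict(by_alias=True))  # {'article_id': 521502, 'title': '...'}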
The documentation is now baked into the models, which admittedly looks a bit cluttered.
The initialization is similar to before:
from fastapi import APIRouter, Path, Query, HTTPException
from pydantic import BaseModel, Field
from httpx import AsyncClient
from bs4 import BeautifulSoup
import re
router = APIRouter(tags=["news"])
get_id_reg = re.compile(r"\d+")
client = AsyncClient(http2=True, base_url="https://www.yanglao.com.cn/", headers={
    "user-agent": "gp-scraper / Guard Pine (https://gp.muspimerol.site/)",
    "x-scraper-contact-email": "admin@muspimerol.site",
    "x-gp-repo": "https://jihulab.com/CNSeniorious000/gp-backend"
})
So that 养老网's administrators can identify us, I also added a custom user-agent request header (plus contact-info headers).
Next, define the endpoints:
@router.get("/articles", response_model=list[ArticleWithDate], responses={404: {"description": "分页超出范围"}})
async def get_articles(page: int | None = Query(1, description="分页(从1开始)", ge=1)) -> list[ArticleWithDate]:
"""### fetch new articles directly from the web"""
dom = BeautifulSoup((await client.get(f"/article_{page}")).text, "lxml")
news_list = dom.select_one("ul.news-list")
if news_list is None:
raise HTTPException(404, "page not found")
return [ArticleWithDate(
article_id=int(get_id_reg.findall(li.a["href"])[0]), title=li.a.string, date=li.span.string
) for li in news_list.find_all("li")]
@router.get("/article/{articleId}", response_model=ArticleDetails, responses={404: {"description": "不存在该文章"}})
async def get_article_info(article_id: int = Path(alias="articleId", description="文章唯一标识")) -> ArticleDetails:
"""### get an article's details and its related articles"""
dom = BeautifulSoup((await client.get(f"/article/{article_id}.html")).text, "lxml")
news_view = dom.select_one("div.news-view")
if news_view is None:
raise HTTPException(404, "article not found")
li_source, li_hits, li_datetime = news_view.select("ul.info > li")[:3]
return ArticleDetails(
title=news_view.h1.string,
source=li_source.string.strip()[3:],
hits=int(li_hits.string.strip()[3:]),
datetime=li_datetime.string.strip(),
html=news_view.select_one("div.news-content").prettify(),
related=[
Article(article_id=int(get_id_reg.findall(li.a["href"])[0]), title=li.a["title"])
for li in news_view.select("div.related-read li")
]
)
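As a rough smoke test (a sketch assuming the models and router above live in one module and that the site is reachable), the endpoints can be exercised with FastAPI's TestClient:
from fastapi import FastAPI
from fastapi.testclient import TestClient

app = FastAPI()
app.include_router(router)

with TestClient(app) as test_client:
    resp = test_client.get("/articles", params={"page": 1})
    print(resp.status_code)  # expected 200 with a JSON list of articles
    resp = test_client.get("/article/521502")
    print(resp.status_code)  # 200 if that article still exists, otherwise 404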
Finally, I opened the PR: add news-fetching APIs.
The rendered docs turned out looking pretty good:
Unfortunately, the site's terms of use state that scraping it is not allowed:
Regarding site content:
The "other rights" mentioned above cover all content this site has published or will publish (including but not limited to text, pictures, images, audio, and video). No organization or individual may, in any manner (including but not limited to reprinting, excerpting, copying, editing, or mirroring on servers not owned by 养老网), use the above content without authorization, or use it to create related derivative products.
Its robots.txt also shows:
User-agent: *
Disallow:
Disallow: /assets/
Disallow: /css/
Disallow: /js/