
Last updated: October 12, 2023

gp: Scraping aggregated news from 养老网 (yanglao.com.cn)


from httpx import Client
from bs4 import BeautifulSoup
client = Client(http2=True, base_url="https://www.yanglao.com.cn/")

The regex used to extract the numeric ID from a URL:

from re import compile
get_id_reg = compile(r"\d+")
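
A quick sanity check (the href here is hypothetical, following the /article/<id>.html pattern used below):

get_id_reg.findall("/article/521502.html")  # -> ['521502']
int(get_id_reg.findall("/article/521502.html")[0])  # -> 521502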

Fetching the article list

We simply reuse the site's own pagination.

def get_articles(page: int = 1):
    """https://www.yanglao.com.cn/article下的文章"""
    dom = BeautifulSoup(client.get(f"/article_{page}").text, "lxml")
    news_list = dom.select_one("ul.news-list")
    return [
        {"articleId": int(get_id_reg.findall(li.a["href"])[0]), "title": li.a.string, "date": li.span.string}
        for li in news_list.find_all("li")
    ]

Note: for now this doesn't handle the case where the page can't be fetched or parsed.
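
For reference, a call like get_articles(1) returns dicts of this shape (the values are borrowed from the Pydantic example further down; the live list will differ):

get_articles(1)
# [{'articleId': 521433, 'title': '地方立法密集落地优化养老服务升级', 'date': '2022-12-12'},
#  ...]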

Fetching article details

For the body, let's just keep the original HTML. As I recall, WeChat mini-programs allow embedding HTML fragments.

def get_article_info(article_id: int):
    dom = BeautifulSoup(client.get(f"/article/{article_id}.html").text, "lxml")
    news_view = dom.select_one("div.news-view")
    li_source, li_hits, li_datetime = news_view.select("ul.info > li")[:3]
    return {
        "title": news_view.h1.string,
        "source": li_source.string.strip()[3:],
        "hits": int(li_hits.string.strip()[3:]),
        "datetime": li_datetime.string.strip(),
        "html": news_view.select_one("div.news-content").prettify(),
        "related": [
            {"articleId": int(get_id_reg.findall(li.a["href"])[0]), "title": li.a["title"]}
            for li in news_view.select("div.related-read li")
        ]
    }
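
The [3:] slices assume each line starts with a three-character label such as "来源:" (an assumption about the page markup, inferred from the slice width); the source string from the example further down illustrates it:

"来源:国家卫健委门户网站".strip()[3:]  # -> '国家卫健委门户网站'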

Turning it into an API

For the sake of the generated docs, I decided to switch all return values to Pydantic models.

from pydantic import BaseModel, Field

class Article(BaseModel):
    article_id: int = Field(alias="articleId", title="Unique article ID, used to request the article's details")
    title: str = Field(title="Article title")

    class Config:
        allow_population_by_field_name = True  # allow constructing with snake_case field names
        schema_extra = {"example": {"articleId": 521502, "title": "大兴区精心康复托养中心十二月康复展示"}}


class ArticleWithDate(Article):
    date: str = Field(title="Article publication date")

    class Config:
        schema_extra = {
            "example": {"articleId": 521433, "title": "地方立法密集落地优化养老服务升级", "date": "2022-12-12"},
        }


class ArticleDetails(BaseModel):
    title: str = Field(title="Article title")
    source: str = Field(title="Article source", description="May be an empty string")
    hits: int = Field(title="View count", description="From testing, this is simply the number of requests made")
    datetime: str = Field(title="Publication date and time", description="In yyyy-mm-dd hh:mm:ss format")
    html: str = Field(title="Body HTML", description="Can be long; may contain images, text, even rich-text markup")
    related: list[Article] = Field(title="Related articles", description="Can be rendered as a list")

    class Config:
        schema_extra = {
            "example": {
                "title": "2022天津河西区老年痴呆养老院哪家好?2022河西区老年痴呆养老院多少钱?",
                "source": "国家卫健委门户网站",
                "hits": 27,
                "datetime": "2022-12-27 15:10:26",
                "html": """\
                    <div class="news-content">
                     位于上海市静安区保德路545号的**护理院也是<a href="https://www.yanglao.com.cn/resthome">收阿尔兹海默老人的养老机构</a>。
                    </div>
                """,
                "related": [
                    {'articleId': 521504, 'title': '仙栖谷精神障碍托养中心告诉您精神障碍患者“阳”了怎么办?'},
                    {'articleId': 521501, 'title': '椿萱茂日间照料|家门口的健康养老很幸福'},
                    {'articleId': 521497, 'title': '2022成都青羊区老年痴呆养老院有哪些,2022青羊区认知症养老院地址'}
                ]
            }
        }

The docs are now baked into the models, which does make the code a bit cluttered.
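
As a quick sanity check of the alias setup (a REPL sketch, assuming Pydantic v1, which the Config/schema_extra style implies): thanks to allow_population_by_field_name the endpoints can construct models with snake_case names, while FastAPI serializes responses with by_alias=True, producing the camelCase keys shown in the examples.

Article(article_id=521502, title="大兴区精心康复托养中心十二月康复展示").dict(by_alias=True)
# {'articleId': 521502, 'title': '大兴区精心康复托养中心十二月康复展示'}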

The initialization is similar to before:

from fastapi import APIRouter, Path, Query, HTTPException
from pydantic import BaseModel, Field
from httpx import AsyncClient
from bs4 import BeautifulSoup
import re

router = APIRouter(tags=["news"])
get_id_reg = re.compile(r"\d+")
client = AsyncClient(http2=True, base_url="https://www.yanglao.com.cn/", headers={
    "user-agent": "gp-scraper / Guard Pine (https://gp.muspimerol.site/)",
    "x-scraper-contact-email": "admin@muspimerol.site",
    "x-gp-repo": "https://jihulab.com/CNSeniorious000/gp-backend"
})

To let 养老网's administrators identify us, I also added a custom user-agent and contact headers.

Next, define the endpoints:

@router.get("/articles", response_model=list[ArticleWithDate], responses={404: {"description": "分页超出范围"}})
async def get_articles(page: int | None = Query(1, description="分页(从1开始)", ge=1)) -> list[ArticleWithDate]:
    """### fetch new articles directly from the web"""

    dom = BeautifulSoup((await client.get(f"/article_{page}")).text, "lxml")
    news_list = dom.select_one("ul.news-list")
    if news_list is None:
        raise HTTPException(404, "page not found")
    return [ArticleWithDate(
        article_id=int(get_id_reg.findall(li.a["href"])[0]), title=li.a.string, date=li.span.string
    ) for li in news_list.find_all("li")]


@router.get("/article/{articleId}", response_model=ArticleDetails, responses={404: {"description": "不存在该文章"}})
async def get_article_info(article_id: int = Path(alias="articleId", description="文章唯一标识")) -> ArticleDetails:
    """### get an article's details and its related articles"""

    dom = BeautifulSoup((await client.get(f"/article/{article_id}.html")).text, "lxml")
    news_view = dom.select_one("div.news-view")
    if news_view is None:
        raise HTTPException(404, "article not found")
    li_source, li_hits, li_datetime = news_view.select("ul.info > li")[:3]
    return ArticleDetails(
        title=news_view.h1.string,
        source=li_source.string.strip()[3:],
        hits=int(li_hits.string.strip()[3:]),
        datetime=li_datetime.string.strip(),
        html=news_view.select_one("div.news-content").prettify(),
        related=[
            Article(article_id=int(get_id_reg.findall(li.a["href"])[0]), title=li.a["title"])
            for li in news_view.select("div.related-read li")
        ]
    )
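
To try the endpoints locally, mount the router on an app. A minimal sketch (the main module name and the uvicorn invocation are assumptions; the real wiring lives in the gp-backend repo):

from fastapi import FastAPI

app = FastAPI()
app.include_router(router)

# uvicorn main:app --reload
# GET /articles?page=1  -> list of ArticleWithDate
# GET /article/521502   -> ArticleDetails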

Finally, I opened a PR: add news-fetching APIs


In the end, the rendered docs turned out pretty nice.

Unfortunately, the site's terms state that scraping it is not allowed:

Regarding site content: the "other rights" mentioned above cover all content this site has published or will publish (including but not limited to text, pictures, images, audio, and video). No organization or individual may, without authorization, use this content in any way (including but not limited to reprinting, excerpting, copying, editing, or mirroring it on servers not owned by 养老网), or use it to create derivative products.

The robots.txt also shows:

User-agent: *
Disallow: 
Disallow: /assets/
Disallow: /css/
Disallow: /js/