基于 Playwright 的爬虫,让 LLM 来做标注。
结果包含 LLM 生成的选择器(经过一个 reflection 步骤,确保选择器一定能被解析)、选择到的元素,以及这些元素 computedStyle 的交集。
核心代码:
@main_loop.callback
class _(Callback):  # noqa: N801
    """Check the selectors produced by a step and decide whether to loop again.

    After each step the result is parsed into a ``{key: selector}`` mapping
    and every non-empty selector is run against the DOM stored in the
    context. Selectors that match nothing or fail to parse are collected in
    ``context["problems"]``. The loop is left via ``Jump`` once there are no
    problems or the retry budget is used up.
    """

    # Number of additional rounds still allowed. Decremented through
    # ``self``, so the first decrement creates an instance attribute that
    # shadows this class-level default.
    # NOTE(review): whether the budget resets between runs depends on how
    # ``main_loop.callback`` instantiates this class -- confirm.
    max_retries = 4

    async def end_process(self, context: ChainContext):
        """Validate the selectors in ``context.result``.

        Reads ``context["dom"]`` and ``context.result``; overwrites
        ``context["problems"]`` with a fresh list of ``(key, detail)`` tuples,
        where ``detail`` is ``None`` for a selector that matched nothing and
        the formatted exception for a selector that failed to parse.

        Raises:
            Jump: out of ``main_loop`` when no problems were found or no
                retries remain.
        """
        dom: Selector = context["dom"]
        res: dict[str, str | None] = loads(context.result)
        # Same list object is bound locally and stored in the context, so
        # appends below are visible to whoever reads context["problems"].
        problems = context["problems"] = []

        for key, selector in res.items():
            if selector:
                print_label(f"{key} - {selector}")
                try:
                    if results := dom.css(selector):
                        show(results)
                    elif self.max_retries:
                        # Nothing matched, but another round is available:
                        # report the key so it can be retried.
                        problems.append((key, None))
                    else:
                        # Out of retries: give up on this key.
                        # NOTE(review): ``res`` is not written back to
                        # ``context.result`` in the visible code -- confirm
                        # this assignment has an effect elsewhere.
                        res[key] = None
                except SelectorSyntaxError as e:
                    # NOTE(review): unlike the no-match branch, an invalid
                    # selector is still recorded (and not nulled) on the
                    # final round -- confirm this asymmetry is intended.
                    print_exception_only(e)
                    problems.append((key, format_exception_only(e)))

        if not problems or self.max_retries <= 0:
            raise Jump(out_of=main_loop)
        self.max_retries -= 1