- 使用 pdfplumber
import pdfplumber
from itertools import pairwise
class Document:
def __init__(self, filename):
self.pdf = pdfplumber.open(filename)
def get_all_words(self):
for page in self.pdf.pages:
for word in page.extract_words():
yield word["text"]
def get_joined_tokens(self):
for i, j in pairwise(self.get_all_words()):
yield i + j
def get_critical(self):
for token in self.get_joined_tokens():
if "信息技术" in token and "投入" in token:
print(f"\n {token} \n")