# Download a PDF over HTTP and print the text of its first page.
import io

import PyPDF2
import requests

# NOTE(review): "<servicesname>" looks like a placeholder for the real storage
# account name -- confirm and substitute before running.
url = 'https://<servicesname>.blob.core.windows.net/dataset/Coursera - The Essential Skills Playbook.pdf'

# Bound the wait and fail fast on HTTP errors so an error page is never handed
# to the PDF parser as if it were a document.
response = requests.get(url, timeout=30)
response.raise_for_status()
my_raw_data = response.content

# Parse the downloaded bytes in memory. Opening a same-named file on local
# disk would ignore the download entirely and fail if no such file exists.
open_pdf_file = io.BytesIO(my_raw_data)
read_pdf = PyPDF2.PdfFileReader(open_pdf_file)

# Try the empty password on encrypted documents before reading pages.
if read_pdf.isEncrypted:
    read_pdf.decrypt("")

print(read_pdf.getPage(0).extractText())
import io
import os

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage


def convert_pdf_to_txt(path, *, password="", maxpages=0, pagenos=None,
                       caching=True):
    """Extract the text of a PDF file and return it as one string.

    Every selected page is run through a pdfminer ``TextConverter`` that
    writes into an in-memory text buffer; the buffer contents are returned.

    :param path: path of the PDF file to read (opened in binary mode).
    :param password: password passed to pdfminer when opening the document;
        the default is the empty string.
    :param maxpages: forwarded to ``PDFPage.get_pages``; the default ``0``
        places no limit on the number of pages.
    :param pagenos: iterable of page indices to extract, forwarded to
        ``PDFPage.get_pages`` as a set; ``None`` or an empty iterable selects
        every page.
    :param caching: forwarded to ``PDFPage.get_pages``.
    :returns: the extracted text as ``str``.
    :raises OSError: if ``path`` cannot be opened.

    NOTE(review): ``check_extractable=True`` is passed to pdfminer, which may
    raise or warn for documents that forbid text extraction depending on the
    installed pdfminer version -- confirm against the pinned release.
    """
    rsrcmgr = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()
    selected_pages = set() if pagenos is None else set(pagenos)

    with io.StringIO() as retstr:
        with TextConverter(rsrcmgr, retstr, codec=codec,
                           laparams=laparams) as device:
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                pages = PDFPage.get_pages(
                    fp,
                    selected_pages,
                    maxpages=maxpages,
                    password=password,
                    caching=caching,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)

                # Read the buffer before the surrounding ``with`` closes it;
                # getvalue() on a closed StringIO raises ValueError.
                return retstr.getvalue()
var
This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)