Click here to Skip to main content
15,909,896 members
Please Sign up or sign in to vote.
1.00/5 (1 vote)
See more:
How can I extract all the text from an online PDF, but I was only able to present the first page as posted to present the text of all pages:

<pre>
import requests, PyPDF2

url = 'https://<servicesname>.blob.core.windows.net/dataset/Coursera - The Essential Skills Playbook.pdf'
response = requests.get(url)
my_raw_data = response.content

open_pdf_file = open("Coursera - The Essential Skills Playbook.pdf", 'rb')
read_pdf = PyPDF2.PdfFileReader(open_pdf_file)
if read_pdf.isEncrypted:
    read_pdf.decrypt("")
    print(read_pdf.getPage(0).extractText())
else:
    print(read_pdf.getPage(0).extractText())


What I have tried:

import os
import io
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

def convert_pdf_to_txt(path):
    '''Convert pdf content from a file path to text

    :path the file path
    '''
    rsrcmgr = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()

    with io.StringIO() as retstr:
        with TextConverter(rsrcmgr, retstr, codec=codec,
                           laparams=laparams) as device:
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                password = ""
                maxpages = 0
                caching = True
                pagenos = set()

                for page in PDFPage.get_pages(fp,
                                              pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)

                return retstr.getvalue()
Posted
Updated 30-Oct-21 11:07am
Comments
Richard MacCutchan 29-Oct-21 4:13am    
You set maxpages to 0, is that correct?
José Madureira 30-Oct-21 17:17pm    
It displays the following error message:

PdfReadWarning: Xref table not zero-indexed. ID numbers for objects will be corrected. [pdf.py:1736]

and how can i find the last page dynamically?
José Madureira 30-Oct-21 17:19pm    
I try:

import requests, PyPDF2

import os
import io
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

url = 'https://servicestorageblob.blob.core.windows.net/dataset/Coursera - The Essential Skills Playbook.pdf'
response = requests.get(url)
my_raw_data = response.content

open_pdf_file = open("Coursera - The Essential Skills Playbook.pdf", 'rb')
read_pdf = PyPDF2.PdfFileReader(open_pdf_file)
if read_pdf.isEncrypted:
read_pdf.decrypt("")
print(read_pdf.getPage(5).extractText())

else:
print(read_pdf.getPage(5).extractText())


def convert_pdf_to_txt(path):
'''Convert pdf content from a file path to text

:path the file path
'''
rsrcmgr = PDFResourceManager()
codec = 'utf-8'
laparams = LAParams()

with io.StringIO() as retstr:
with TextConverter(rsrcmgr, retstr, codec=codec,
laparams=laparams) as device:
with open(path, 'rb') as fp:
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos = set()

for page in PDFPage.get_pages(fp,
pagenos,
maxpages=maxpages,
password=password,
caching=caching,
check_extractable=True):
interpreter.process_page(page)

return retstr.getvalue()

text_combined = convert_pdf_to_txt(open_pdf_file)
print(text_combined)

but the error message is displayed:

The Essential Skills Map
for digital transformation
6
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-6-2cb100754238> in <module>
51 return retstr.getvalue()
52
---> 53 text_combined = convert_pdf_to_txt(open_pdf_file)
54 print(text_combined)

<ipython-input-6-2cb100754238> in convert_pdf_to_txt(path)
34 with TextConverter(rsrcmgr, retstr, codec=codec,
35 laparams=laparams) as device:
---> 36 with open(path, 'rb') as fp:
37 interpreter = PDFPageInterpreter(rsrcmgr, device)
38 password = ""

TypeError: expected str, bytes or os.PathLike object, not _io.BufferedReader

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900