Gecko [ www.ctm-euromeuble.fr ]

Name	Size	Permission	Date
__pycache__	[ DIR ]	drwxr-sr-x	2025-04-10 17:08
cmap	[ DIR ]	drwxr-sr-x	2025-04-10 17:08
__init__.py	76 B	-rw-r--r--	2025-04-10 17:07
arcfour.py	886 B	-rw-r--r--	2025-04-10 17:07
ascii85.py	1.98 KB	-rw-r--r--	2025-04-10 17:07
ccitt.py	19.53 KB	-rw-r--r--	2025-04-10 17:07
cmapdb.py	11.79 KB	-rw-r--r--	2025-04-10 17:07
converter.py	21.07 KB	-rw-r--r--	2025-04-10 17:07
encodingdb.py	3.47 KB	-rw-r--r--	2025-04-10 17:07
fontmetrics.py	56.62 KB	-rw-r--r--	2025-04-10 17:07
glyphlist.py	114.46 KB	-rw-r--r--	2025-04-10 17:07
high_level.py	6.17 KB	-rw-r--r--	2025-04-10 17:07
image.py	5.38 KB	-rw-r--r--	2025-04-10 17:07
jbig2.py	9.48 KB	-rw-r--r--	2025-04-10 17:07
latin_enc.py	7.73 KB	-rw-r--r--	2025-04-10 17:07
layout.py	28.65 KB	-rw-r--r--	2025-04-10 17:07
lzw.py	2.74 KB	-rw-r--r--	2025-04-10 17:07
pdfcolor.py	779 B	-rw-r--r--	2025-04-10 17:07
pdfdevice.py	5.84 KB	-rw-r--r--	2025-04-10 17:07
pdfdocument.py	26.63 KB	-rw-r--r--	2025-04-10 17:07
pdffont.py	29.31 KB	-rw-r--r--	2025-04-10 17:07
pdfinterp.py	28.35 KB	-rw-r--r--	2025-04-10 17:07
pdfpage.py	5.18 KB	-rw-r--r--	2025-04-10 17:07
pdfparser.py	5.18 KB	-rw-r--r--	2025-04-10 17:07
pdftypes.py	9.36 KB	-rw-r--r--	2025-04-10 17:07
psparser.py	16.76 KB	-rw-r--r--	2025-04-10 17:07
rijndael.py	45.34 KB	-rw-r--r--	2025-04-10 17:07
runlength.py	1.29 KB	-rw-r--r--	2025-04-10 17:07
settings.py	15 B	-rw-r--r--	2025-04-10 17:07
utils.py	11.97 KB	-rw-r--r--	2025-04-10 17:07

Rename

"""Functions that can be used for the most common use-cases for pdfminer.six"""

import logging
import sys
from io import StringIO

from .converter import XMLConverter, HTMLConverter, TextConverter, \
    PDFPageAggregator
from .image import ImageWriter
from .layout import LAParams
from .pdfdevice import TagExtractor
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
from .pdfpage import PDFPage
from .utils import open_filename

def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
                       laparams=None, maxpages=0, page_numbers=None,
                       password="", scale=1.0, rotation=0, layoutmode='normal',
                       output_dir=None, strip_control=False, debug=False,
                       disable_caching=False, **kwargs):
    """Parses text from inf-file and writes to outfp file-like object.

    Takes loads of optional arguments but the defaults are somewhat sane.
    Beware laparams: Including an empty LAParams is not the same as passing
    None!

    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to. If it is
        `sys.stdout` and output_type is not 'text', `sys.stdout.buffer` is
        used instead.
    :param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works
        properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly. Not used for output_type 'tag'.
    :param maxpages: How many pages to stop parsing after
    :param page_numbers: zero-indexed page numbers to operate on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param scale: Scale factor. Only passed on for output_type 'html'.
    :param rotation: Rotation factor; added to every page's own rotation
        (modulo 360) before the page is processed.
    :param layoutmode: Default is 'normal', see
        pdfminer.converter.HTMLConverter. Only passed on for output_type
        'html'.
    :param output_dir: If given, creates an ImageWriter for extracted images.
        Not used for output_type 'tag'.
    :param strip_control: Passed to the XMLConverter as `stripcontrol`. Only
        used for output_type 'xml'.
    :param debug: Output more logging data (sets the root logger to DEBUG).
    :param disable_caching: If True, both the resource manager and the page
        iterator are created with caching switched off.
    :param kwargs: Any further keyword arguments are accepted and ignored.
    :return: nothing, acting as it does on two streams. Use StringIO to get
        strings.
    :raises ValueError: if output_type is not one of the supported values.
    """
    # Fail fast on an unsupported output type. Without this check no device
    # would be created and the function would die later with an
    # UnboundLocalError that says nothing about the real problem.
    supported_types = ('text', 'xml', 'html', 'tag')
    if output_type not in supported_types:
        raise ValueError(
            "Output type can be text, xml, html or tag but is {!r}".format(
                output_type))

    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    imagewriter = ImageWriter(output_dir) if output_dir else None

    rsrcmgr = PDFResourceManager(caching=not disable_caching)

    if output_type == 'text':
        # The text converter receives outfp exactly as the caller passed it.
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    else:
        # Every other converter gets the binary layer of stdout rather than
        # the text-mode stream.
        # NOTE(review): presumably these converters write encoded bytes --
        # confirm against pdfminer.converter / pdfminer.pdfdevice.
        if outfp == sys.stdout:
            outfp = sys.stdout.buffer

        if output_type == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=strip_control)
        elif output_type == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:
            # Only 'tag' is left after the validation above.
            device = TagExtractor(rsrcmgr, outfp, codec=codec)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(inf,
                                  page_numbers,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=not disable_caching,
                                  check_extractable=True):
        # Apply the caller's extra rotation on top of the page's own value.
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)

    device.close()

def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
                 caching=True, codec='utf-8', laparams=None):
    """Parse and return the text contained in a PDF file.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached. Applied to both the
        resource manager and the page iterator.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a string containing all of the text extracted.
    """
    if laparams is None:
        laparams = LAParams()

    with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
        # Honour the caller's caching choice here as well; previously it only
        # reached PDFPage.get_pages and the resource manager ignored it.
        rsrcmgr = PDFResourceManager(caching=caching)
        device = TextConverter(rsrcmgr, output_string, codec=codec,
                               laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.get_pages(
                fp,
                page_numbers,
                maxpages=maxpages,
                password=password,
                caching=caching,
                check_extractable=True,
        ):
            interpreter.process_page(page)

        # Read the buffer while still inside the ``with`` block: the StringIO
        # is closed on exit and getvalue() would then raise ValueError.
        return output_string.getvalue()

def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
                  caching=True, laparams=None):
    """Extract and yield LTPage objects

    This is a generator: nothing is opened or parsed until iteration starts,
    and the PDF file stays open until the generator is exhausted or closed.

    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached. Applied to both the
        resource manager and the page iterator.
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a generator yielding, for each processed page, the result
        returned by the PDFPageAggregator.
    """
    if laparams is None:
        laparams = LAParams()

    with open_filename(pdf_file, "rb") as fp:
        # Honour the caller's caching choice here as well; previously it only
        # reached PDFPage.get_pages and the resource manager ignored it.
        resource_manager = PDFResourceManager(caching=caching)
        device = PDFPageAggregator(resource_manager, laparams=laparams)
        interpreter = PDFPageInterpreter(resource_manager, device)
        for page in PDFPage.get_pages(fp, page_numbers, maxpages=maxpages,
                                      password=password, caching=caching):
            interpreter.process_page(page)
            # The aggregator holds the layout of the page just processed.
            layout = device.get_result()
            yield layout