Gecko [ www.ctm-euromeuble.fr ]

Name	Size	Permission	Date
__pycache__	[ DIR ]	drwxr-sr-x	2025-04-10 17:08
cmap	[ DIR ]	drwxr-sr-x	2025-04-10 17:08
__init__.py	76 B	-rw-r--r--	2025-04-10 17:07
arcfour.py	886 B	-rw-r--r--	2025-04-10 17:07
ascii85.py	1.98 KB	-rw-r--r--	2025-04-10 17:07
ccitt.py	19.53 KB	-rw-r--r--	2025-04-10 17:07
cmapdb.py	11.79 KB	-rw-r--r--	2025-04-10 17:07
converter.py	21.07 KB	-rw-r--r--	2025-04-10 17:07
encodingdb.py	3.47 KB	-rw-r--r--	2025-04-10 17:07
fontmetrics.py	56.62 KB	-rw-r--r--	2025-04-10 17:07
glyphlist.py	114.46 KB	-rw-r--r--	2025-04-10 17:07
high_level.py	6.17 KB	-rw-r--r--	2025-04-10 17:07
image.py	5.38 KB	-rw-r--r--	2025-04-10 17:07
jbig2.py	9.48 KB	-rw-r--r--	2025-04-10 17:07
latin_enc.py	7.73 KB	-rw-r--r--	2025-04-10 17:07
layout.py	28.65 KB	-rw-r--r--	2025-04-10 17:07
lzw.py	2.74 KB	-rw-r--r--	2025-04-10 17:07
pdfcolor.py	779 B	-rw-r--r--	2025-04-10 17:07
pdfdevice.py	5.84 KB	-rw-r--r--	2025-04-10 17:07
pdfdocument.py	26.63 KB	-rw-r--r--	2025-04-10 17:07
pdffont.py	29.31 KB	-rw-r--r--	2025-04-10 17:07
pdfinterp.py	28.35 KB	-rw-r--r--	2025-04-10 17:07
pdfpage.py	5.18 KB	-rw-r--r--	2025-04-10 17:07
pdfparser.py	5.18 KB	-rw-r--r--	2025-04-10 17:07
pdftypes.py	9.36 KB	-rw-r--r--	2025-04-10 17:07
psparser.py	16.76 KB	-rw-r--r--	2025-04-10 17:07
rijndael.py	45.34 KB	-rw-r--r--	2025-04-10 17:07
runlength.py	1.29 KB	-rw-r--r--	2025-04-10 17:07
settings.py	15 B	-rw-r--r--	2025-04-10 17:07
utils.py	11.97 KB	-rw-r--r--	2025-04-10 17:07

Rename

import logging
from . import settings
from .psparser import LIT
from .pdftypes import PDFObjectNotFound
from .pdftypes import resolve1
from .pdftypes import int_value
from .pdftypes import list_value
from .pdftypes import dict_value
from .pdfparser import PDFParser
from .pdfdocument import PDFDocument
from .pdfdocument import PDFTextExtractionNotAllowed

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Predefined PDF name literals. PDFPage.create_pages compares a node's
# 'Type' entry against these with `is` (identity), so they must be the
# objects returned by LIT().
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')

class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes:
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: the resources used by the page (an empty dict when the
        page has no 'Resources' entry).
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page (same as mediabox when the
        page has no 'CropBox' entry).
      rotate: the page rotation (in degree), normalized to 0..359.
      annots: the page annotations.
      beads: a chain that represents natural reading order.
    """

    # Attributes that a page tree node takes over from its parent node
    # when it does not define them itself (see create_pages).
    INHERITABLE_ATTRS = {'Resources', 'MediaBox', 'CropBox', 'Rotate'}

    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.

        Raises KeyError when attrs has no 'MediaBox' entry.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs.get('Resources', dict()))
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            # No explicit crop rectangle: fall back to the media box.
            self.cropbox = self.mediabox
        # Adding 360 before the modulo keeps the result in 0..359.
        self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        # 'Contents' may resolve to a single object; always expose a list.
        if not isinstance(contents, list):
            contents = [contents]
        self.contents = contents

    def __repr__(self):
        return '<PDFPage: Resources={!r}, MediaBox={!r}>'\
            .format(self.resources, self.mediabox)

    @classmethod
    def create_pages(cls, document):
        """Yield a page object for every page found in the document.

        The page tree referenced by the catalog's 'Pages' entry is walked
        depth-first; inheritable attributes missing on a node are copied
        from its parent. If that walk yields nothing, every object listed
        in the document's xrefs is scanned for dictionaries whose 'Type'
        is the Page literal.

        document: the document object providing catalog, getobj and xrefs.
        """
        def search(obj, parent, ancestors=frozenset()):
            # obj is either a bare object id or something carrying .objid.
            if isinstance(obj, int):
                objid = obj
            else:
                objid = obj.objid
            # A node that is its own ancestor would recurse forever on a
            # malformed page tree; skip it. Only the current path is
            # tracked, so a page listed twice is still yielded twice.
            if objid in ancestors:
                log.warning('Pages: circular reference to %r ignored', objid)
                return
            if isinstance(obj, int):
                tree = dict_value(document.getobj(objid)).copy()
            else:
                tree = dict_value(obj).copy()
            for (k, v) in parent.items():
                if k in cls.INHERITABLE_ATTRS and k not in tree:
                    tree[k] = v

            tree_type = tree.get('Type')
            if tree_type is None and not settings.STRICT:  # See #64
                tree_type = tree.get('type')

            if tree_type is LITERAL_PAGES and 'Kids' in tree:
                log.info('Pages: Kids=%r', tree['Kids'])
                path = ancestors | {objid}
                for c in list_value(tree['Kids']):
                    yield from search(c, tree, path)
            elif tree_type is LITERAL_PAGE:
                log.info('Page: %r', tree)
                yield (objid, tree)

        pages = False
        if 'Pages' in document.catalog:
            objects = search(document.catalog['Pages'], document.catalog)
            for (objid, tree) in objects:
                yield cls(document, objid, tree)
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) \
                                and obj.get('Type') is LITERAL_PAGE:
                            yield cls(document, objid, obj)
                    except PDFObjectNotFound:
                        # Best effort: ignore ids that cannot be resolved.
                        pass

    @classmethod
    def get_pages(cls, fp,
                  pagenos=None, maxpages=0, password='',
                  caching=True, check_extractable=True):
        """Parse the file object and yield its pages.

        fp: the file object handed to PDFParser.
        pagenos: container of zero-based page indices to yield; a falsy
          value yields every page.
        maxpages: stop once the page at this one-based position in the
          document has been yielded; 0 means no limit. Pages skipped
          because of pagenos do not trigger the stop.
        password: passed through to PDFDocument.
        caching: passed through to PDFDocument.
        check_extractable: when true, raise PDFTextExtractionNotAllowed
          if the document reports that it is not extractable.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction. If not, abort.
        if check_extractable and not doc.is_extractable:
            error_msg = 'Text extraction is not allowed: %r' % fp
            raise PDFTextExtractionNotAllowed(error_msg)
        # Process each page contained in the document.
        for (pageno, page) in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno+1:
                break