# Source code for kvalikirstu2.odt_reader

"""A module for reading .odt files line by line.

Uses odfpy instead of LibreOffice.
"""
import logging
from odf.text import H, P
from odf.opendocument import load
from kvalikirstu2.localization import _

logger = logging.getLogger(__name__)


class ODTException(Exception):
    """Error occurred while reading odt file.

    :param Exception inner_exception: The underlying exception that caused this error.
    :param args: Remaining positional arguments (typically the message), forwarded
        unchanged to :class:`Exception`.

    :var inner_exception: The underlying exception, kept so callers can inspect it.
    """
    def __init__(self, inner_exception, *args):
        # Only *args reach the base class, so str(exc) is the message alone and the
        # wrapped exception does not leak into ``exc.args``.
        super().__init__(*args)
        self.inner_exception = inner_exception


def get_text_nodes_from_odt(odt):
    """Gets the text nodes from an OpenDocument instance.

    Only the direct children of ``odt.text`` are considered. A child is kept when it
    is one of the elements returned by ``odt.getElementsByType`` for ``P``
    (paragraphs) or ``H`` (headings). The result preserves the order of
    ``odt.text.childNodes``.

    :param OpenDocument odt: The OpenDocument instance.
    :rtype: list
    :return: The paragraph and heading nodes that are direct children of ``odt.text``.
    """
    # Keep both lists bound to local names for the whole function so the ids
    # collected below cannot be reused by other objects while we filter.
    paragraphs = odt.getElementsByType(P)
    headings = odt.getElementsByType(H)

    # Membership by object identity in a set is O(1) per child, instead of scanning
    # both lists for every child node.
    # NOTE(review): this assumes the element objects compare by identity (no custom
    # __eq__), which makes it equivalent to the former ``node in list`` test.
    wanted_ids = {id(element) for element in paragraphs}
    wanted_ids.update(id(element) for element in headings)

    return [node for node in odt.text.childNodes if id(node) in wanted_ids]


def get_text_from_node(node):
    """Gets text from the ODT node.

    :param node: The node to convert; its text is obtained with ``str(node)``.
    :rtype: str
    :return: ``str(node)``, or an empty string when that text consists only of
        whitespace.
    """
    text = str(node)
    # Whitespace-only text is normalised to ''; an already-empty string passes through
    # unchanged because ''.isspace() is False.
    # NOTE(review): the previous comment here said that str(node) "might return some
    # number" for a paragraph node without text -- this cannot be confirmed from this
    # file; verify against odfpy before relying on it.
    return '' if text.isspace() else text


class OdtReader:
    """Reads .odt files line by line

    Each "line" is the text of one node returned by ``get_text_nodes_from_odt``
    for the loaded document.

    :var doc: The .odt document
    :var int node_index: Index of the next text node to be read.
    :var nodes: All the text nodes from the document.

    """
    def __init__(self, path: str):
        """Constructor

        :param str path: The path for the .odt file
        :raises ODTException: If the document could not be loaded from ``path``.

        """
        try:
            self.doc = load(path)
        except Exception as exc:
            # Any failure while loading is reported as ODTException. Chaining with
            # ``from exc`` keeps the root cause in the traceback; it also stays
            # available as ``inner_exception``.
            raise ODTException(exc, _(".odt-file %s could not be opened.") % path) from exc

        self.node_index = 0
        self.nodes = get_text_nodes_from_odt(self.doc)
        logger.debug('Got %s paragraph nodes for %s', len(self.nodes), path)

    def can_read(self):
        """Can read from file?

        :rtype: bool
        :return: True if not at the end of the file, False otherwise.

        """
        return self.node_index < len(self.nodes)

    def read_line(self):
        """Read a line from the document

        Advances ``node_index`` by one when a node is available.

        :rtype: str or None
        :return: The text of the next node, or None when every node has already
            been read.

        """
        if not self.can_read():
            return None

        node = self.nodes[self.node_index]
        self.node_index += 1
        return get_text_from_node(node)
# Source code for kvalikirstu2.odt_reader
#
# kvalikirstu2
#
# Navigation
#
# Related Topics