# Source code for kvalikirstu2.odt_reader

""" A module for reading .odt files line by line. Does not use Libreoffice, instead uses odfpy.
"""
import logging
from odf.text import H, P
from odf.opendocument import load
from kvalikirstu2.localization import _

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class ODTException(Exception):
    """Error raised when an .odt file cannot be read.

    :var inner_exception: The exception object passed in as the first
        constructor argument.
    """

    def __init__(self, inner_exception, *args):
        """Constructor

        :param inner_exception: The underlying exception to keep on the instance.
        :param args: Remaining positional arguments, forwarded to ``Exception``.
        """
        super().__init__(*args)
        self.inner_exception = inner_exception
def get_text_nodes_from_odt(odt):
    """Gets the top-level paragraph and heading nodes from an OpenDocument instance.

    Only direct children of ``odt.text`` are returned, in the order they appear
    in ``odt.text.childNodes``. A node is kept when it is also present in the
    result of ``odt.getElementsByType(P)`` or ``odt.getElementsByType(H)``.

    :param OpenDocument odt: The OpenDocument instance.
    :rtype: list
    :return: The children of ``odt.text`` that are paragraphs or headings.
    """
    # Keep the lists referenced for the whole function so the ids stay valid.
    paragraphs = odt.getElementsByType(P)
    headings = odt.getElementsByType(H)

    # Build the identity set once: each membership test is then O(1) instead
    # of a linear scan over every paragraph and heading in the document.
    # NOTE(review): assumes odfpy nodes compare by identity (no custom
    # __eq__), so this matches the previous ``node in list`` test -- confirm.
    wanted_ids = {id(node) for node in paragraphs}
    wanted_ids.update(id(node) for node in headings)

    return [node for node in odt.text.childNodes if id(node) in wanted_ids]
def get_text_from_node(node):
    """Gets text from the ODT node.

    :param node: The node to convert; its text is obtained with ``str(node)``.
    :rtype: str
    :return: ``str(node)``, or an empty string when that text consists solely
        of whitespace.
    """
    text = str(node)
    # NOTE(review): the original author remarked that for a paragraph node
    # without text, str(node) might return some number instead; such a value
    # is not filtered out here.
    return '' if text.isspace() else text
class OdtReader:
    """Reads .odt files line by line

    :var doc: The .odt document
    :var int node_index: Index into ``nodes`` of the next node to be read.
    :var nodes: All the text nodes from the document.
    """

    def __init__(self, path: str):
        """Constructor

        :param str path: The path for the .odt file
        :raises ODTException: If loading the document raises any exception;
            the original exception is kept as ``inner_exception`` and as the
            explicit cause.
        """
        try:
            self.doc = load(path)
        except Exception as exc:
            # Chain explicitly so the traceback reports the loader failure as
            # the direct cause of the ODTException.
            raise ODTException(exc, _(".odt-file %s could not be opened.") % path) from exc
        self.node_index = 0
        self.nodes = get_text_nodes_from_odt(self.doc)
        logger.debug('Got %s paragraph nodes for %s', len(self.nodes), path)

    def can_read(self):
        """Can read from file?

        :rtype: bool
        :return: True if not at the end of the file, False otherwise.
        """
        return self.node_index < len(self.nodes)

    def read_line(self):
        """Read a line from the document

        Advances ``node_index`` by one on every successful read.

        :rtype: str or None
        :return: The text of the next node, or None when there is nothing
            left to read.
        """
        if not self.can_read():
            return None
        node = self.nodes[self.node_index]
        self.node_index += 1
        return get_text_from_node(node)