# Source code for kvalikirstu2.odt_reader
"""A module for reading .odt files line by line. Uses odfpy instead of LibreOffice.
"""
import logging
from odf.text import H, P
from odf.opendocument import load
from kvalikirstu2.localization import _
logger = logging.getLogger(__name__)
class ODTException(Exception):
    """Error occurred while reading odt file.

    :var inner_exception: The exception that was wrapped by this error.
    """

    def __init__(self, inner_exception, *args):
        """Constructor

        :param inner_exception: The underlying exception being wrapped.
        :param args: Remaining positional arguments, forwarded unchanged to :class:`Exception`.
        """
        super().__init__(*args)
        self.inner_exception = inner_exception
def get_text_nodes_from_odt(odt):
    """Gets the text nodes from an OpenDocument instance.

    Keeps the direct children of ``odt.text`` that also appear in the results of
    ``odt.getElementsByType`` for ``P`` or ``H``, preserving the order of the children.

    :param OpenDocument odt: The OpenDocument instance.
    :rtype: list
    :return: The matching child nodes of ``odt.text``.
    """
    text_nodes = odt.text.childNodes
    # Keep both result lists referenced while their ids are in use.
    paragraphs = odt.getElementsByType(P)
    headings = odt.getElementsByType(H)
    # One identity set gives O(1) membership tests instead of scanning two lists per node.
    # NOTE(review): assumes odfpy nodes compare by identity (no custom __eq__) -- confirm.
    wanted_ids = {id(node) for node in paragraphs}
    wanted_ids.update(id(node) for node in headings)
    return [node for node in text_nodes if id(node) in wanted_ids]
def get_text_from_node(node):
    """Gets text from the ODT node.

    :param node: The node to convert; its text is obtained with ``str(node)``.
    :rtype: str
    :return: The text of the node, or an empty string if that text is whitespace only.
    """
    text = str(node)
    # NOTE(review): the original comment said str(node) "might return some number" for a
    # paragraph without text; only whitespace-only strings are replaced here -- confirm.
    return '' if text.isspace() else text
class OdtReader:
    """Reads .odt files line by line

    :var doc: The .odt document
    :var int node_index: Current text node in document
    :var nodes: All the text nodes from the document.
    """

    def __init__(self, path: str):
        """Constructor

        :param str path: The path for the .odt file
        :raises ODTException: If the document could not be loaded from ``path``.
        """
        try:
            self.doc = load(path)
        except Exception as exc:
            # Chain explicitly so the traceback marks the load failure as the direct cause.
            raise ODTException(exc, _(".odt-file %s could not be opened.") % path) from exc
        self.node_index = 0
        self.nodes = get_text_nodes_from_odt(self.doc)
        logger.debug('Got %s paragraph nodes for %s', len(self.nodes), path)

    def can_read(self) -> bool:
        """Can read from file?

        :rtype: bool
        :return: True if not at the end of the file, False otherwise.
        """
        return self.node_index < len(self.nodes)

    def read_line(self):
        """Read a line from the document

        Advances ``node_index`` by one on every successful read.

        :rtype: str or None
        :return: The text of the next node, or None if there is nothing left to read.
        """
        if not self.can_read():
            return None
        node = self.nodes[self.node_index]
        self.node_index += 1
        return get_text_from_node(node)