Source code for kvalikirstu2.converter

""" A module for converting arbitrary text to UTF-8 and converting other files to .odt with Libreoffice. All Libreoffice
code is contained inside this module."""
import os
import logging
import shutil
import subprocess
import chardet
from kvalikirstu2.localization import _
from kvalikirstu2 import utils
from kvalikirstu2.argument_parser import get_args

if utils.is_windows():
    import winreg

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Shared module state:
#   'reg_keys'         - registry paths that are probed when looking for a LibreOffice installation on Windows.
#   'libreoffice_path' - installation path read from the registry; None until it has been found.
_DATA = {
    'reg_keys': [
        'SOFTWARE\\LibreOffice\\LibreOffice',
        'SOFTWARE\\WOW6432Node\\LibreOffice\\LibreOffice',
    ],
    'libreoffice_path': None,
}


class ConversionException(Exception):
    """Error type used to signal that a data conversion step has failed."""
def _get_encoding(content):
    """ Gets the best guess of the encoding used in the text.

    The guess made by chardet is only accepted when it is listed in the ``detection_encodings`` argument and
    the content can actually be decoded with it. In every other case the ``default_detection_encoding``
    argument is returned.

    :param bytes content: The text in binary form.
    :returns: Encoding type.
    """
    args = get_args()
    # Treat a missing list of accepted encodings the same way as an empty one.
    encodings = args.detection_encodings or ()

    # chardet reports None as the encoding when it cannot make a guess (e.g. for empty content),
    # so the value has to be checked before it is lower-cased.
    detected = chardet.detect(content)['encoding']
    if not detected:
        return args.default_detection_encoding

    encoding = detected.lower()
    if encoding not in encodings:
        return args.default_detection_encoding

    # Try if the encoding is valid
    try:
        content.decode(encoding=encoding)
    except (UnicodeError, LookupError):
        # UnicodeError: the content is not valid in the guessed encoding.
        # LookupError: Python has no codec registered under the guessed name.
        return args.default_detection_encoding
    return encoding
def convert(content):
    """ Decodes content that is in an arbitrary encoding into text.

    The encoding is guessed from the content itself and logged at info level.

    :param bytes content: The content to be converted.
    :returns: The decoded string.
    """
    detected = _get_encoding(content)
    logger.info('Encoding detected as %s', detected)
    text = content.decode(encoding=detected)
    return text
def convert_file(filepath, out_encoding):
    """ Converts a file to the given encoding in place.

    The file is read as bytes, decoded with the detected encoding and written back to the same path using
    ``out_encoding``. Characters that cannot be encoded with ``out_encoding`` are dropped (``errors='ignore'``).

    :param filepath: The path of the file to be converted.
    :param out_encoding: The encoding to be used.
    :raises ConversionException: If reading, decoding or writing the file fails.
    """
    try:
        with open(filepath, "rb") as stream:
            content = stream.read()

        decoded = convert(content)

        # newline='' disables newline translation, so line endings are written as they are in the decoded text.
        with open(filepath, "w", encoding=out_encoding, errors='ignore', newline='') as stream:
            stream.write(decoded)
    except Exception as error:
        # Only ordinary errors are wrapped; KeyboardInterrupt and SystemExit are allowed to propagate.
        logger.exception("Failed to convert file")
        raise ConversionException(_('Failed to convert encoding in file %s') % filepath) from error
def convert_file_encoding_in_folder(folder_path, out_encoding, file_extension):
    """Converts the files with the given extension to the given encoding.

    This is a generator: the work is done while it is iterated, and the path of each file is yielded
    right after that file has been converted.

    :param folder_path: The path of the folder.
    :param out_encoding: The encoding to be used.
    :param file_extension: The file extension.
    """
    utils.check_if_any_file_in_use(folder_path, [file_extension])

    pattern = os.path.join(folder_path, '**')
    for candidate in utils.natsorted_glob(pattern):
        # Skip everything that does not carry the requested extension.
        if not candidate.endswith(file_extension):
            continue
        convert_file(candidate, out_encoding)
        logger.debug("Converted encoding for %s", candidate)
        yield candidate
def _libreoffice_installed_windows():
    """Checks that registry keys added during the installation of Libreoffice exist.

    When a matching key is found, the value named "Path" of its first subkey is stored in
    ``_DATA['libreoffice_path']`` so that later calls can return immediately.

    :returns: True if the path is already stored or could be read from the registry, otherwise False.
    """
    if _DATA['libreoffice_path']:
        return True

    for reg_key in _DATA['reg_keys']:
        try:
            # Registry handles are used as context managers so that they are closed again after use.
            with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, reg_key) as key:
                # WARNING: Assumes that this registry path's first subkey is the one that
                # contains the installation path. Can break easily if new registry keys are added into Libreoffice.
                version = winreg.EnumKey(key, 0)
            with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, "%s\\%s" % (reg_key, version)) as version_key:
                _DATA['libreoffice_path'] = winreg.QueryValueEx(version_key, "Path")[0]
            return True
        except OSError:
            # FileNotFoundError (missing key or value) is a subclass of OSError; a key without any
            # subkeys makes EnumKey raise a plain OSError. Either way, try the next candidate key.
            continue
    return False


def _libreoffice_installed_linux():
    """Test if Libreoffice is installed(Linux) by looking up the ``soffice`` executable."""
    return shutil.which('soffice') is not None
def libreoffice_installed():
    """Test if Libreoffice is installed on the current platform.

    :returns: The result of the Windows check when running on Windows, otherwise the result of the Linux check.
    """
    check = _libreoffice_installed_windows if utils.is_windows() else _libreoffice_installed_linux
    return check()
def _get_soffice_executable():
    """Gets the executable name that should be used to invoke Libreoffice.

    soffice on linux, on Windows this is the installation path of Libreoffice.

    :returns: The executable to run, or None (after logging a warning) when Libreoffice is not installed.
    """
    if not libreoffice_installed():
        logger.warning(_('Libreoffice is not installed on this system, Kvalikirstu cannot'
                         ' convert pdf/docx/rtf etc. files to odt.'))
        return None

    if utils.is_windows():
        return _DATA['libreoffice_path']
    return "soffice"


def _get_file_encoding(filepath):
    """Reads the file as bytes and returns the encoding guessed for its content.

    :param filepath: The path of the file to inspect.
    :returns: Encoding type.
    """
    with open(filepath, "rb") as stream:
        content = stream.read()
    return _get_encoding(content)


def _convert_file(filepath, executable, output_filter, timeout):
    """Converts one file with Libreoffice and deletes the original when the conversion succeeds.

    The converted file is written into the directory of the original file.

    :param filepath: The path of the file to be converted.
    :param executable: The Libreoffice executable to run.
    :param output_filter: The value given to Libreoffice's ``--convert-to`` option.
    :param timeout: The timeout passed to ``subprocess.call``.
    """
    # The command is built as an argument list: every item reaches Libreoffice as one argument, so paths
    # and the filter option need no manual quoting, and the call does not depend on how a hand-joined
    # command string would be split (a plain string is not executable on POSIX without a shell).
    command = [executable]
    if utils.get_path_extension(filepath) == '.txt' and 'utf-8' in _get_file_encoding(filepath):
        # Use UTF-8 in filter for UTF-8 text(matches both UTF-8 BOM and UTF-8)
        command.append('--infilter=Text (encoded):UTF8,LF,,')
    command += ['--headless', '--convert-to', output_filter, filepath, '--outdir', os.path.dirname(filepath)]

    result = subprocess.call(command, timeout=timeout)
    if result == 0:
        # Exit code 0 is treated as success; the original file is then removed.
        os.unlink(filepath)


def _convert_with_libreoffice(path, output_filter, extensions, timeout):
    """Converts the files with the given extensions found under the path, yielding each path after conversion.

    Nothing is converted when no Libreoffice executable is available or when no files were found.

    :param path: The path of the files to be converted.
    :param output_filter: The value given to Libreoffice's ``--convert-to`` option.
    :param extensions: The file extensions to look for.
    :param timeout: The timeout used for Libreoffice.
    """
    executable = _get_soffice_executable()
    files = utils.get_files_with_extensions(path, extensions)
    if executable and files:
        for filepath in files:
            _convert_file(filepath, executable, output_filter, timeout)
            yield filepath
def convert_files_with_libreoffice_to_odt(path, timeout):
    """Converts suitable files with LibreOffice from the path to odt.

    This is a generator that yields the path of each file after it has been handed to LibreOffice.

    :param path: The path of the files to be converted.
    :param timeout: The timeout used for Libreoffice.
    """
    checked_formats = utils.SUPPORTED_FORMATS + utils.CONVERTABLE_FORMATS
    utils.check_if_any_file_in_use(path, checked_formats)
    yield from _convert_with_libreoffice(path, output_filter="odt", extensions=utils.CONVERTABLE_TO_ODT,
                                         timeout=timeout)
def convert_files_with_libreoffice_to_txt(path, timeout):
    """Converts suitable files with LibreOffice from the path to txt.

    This is a generator that yields the path of each file after it has been handed to LibreOffice.

    :param path: The path of the files to be converted.
    :param timeout: The timeout used for Libreoffice.
    """
    checked_formats = utils.SUPPORTED_FORMATS + utils.CONVERTABLE_FORMATS
    utils.check_if_any_file_in_use(path, checked_formats)
    yield from _convert_with_libreoffice(path, output_filter="txt", extensions=utils.CONVERTABLE_TO_TXT,
                                         timeout=timeout)