""" A module for converting arbitrary text to UTF-8 and converting other files to .odt with Libreoffice. All Libreoffice
code is contained inside this module."""
import os
import logging
import shutil
import subprocess
import chardet
from kvalikirstu2.localization import _
from kvalikirstu2 import utils
from kvalikirstu2.argument_parser import get_args
if utils.is_windows():
import winreg
logger = logging.getLogger(__name__)

# Module-level state shared by the Libreoffice helpers in this module:
#   'reg_keys'         - registry paths (under HKEY_LOCAL_MACHINE) that are probed for a Libreoffice installation.
#   'libreoffice_path' - cached "Path" value read from the registry; None until a lookup has succeeded.
_DATA = {
    'reg_keys': [
        'SOFTWARE\\LibreOffice\\LibreOffice',
        'SOFTWARE\\WOW6432Node\\LibreOffice\\LibreOffice',
    ],
    'libreoffice_path': None,
}
class ConversionException(Exception):
    """Signals that a data conversion step (for example re-encoding a file) has failed."""
def _get_encoding(content):
    """Gets the best guess of the encoding used in the text.

    The guess made by chardet is accepted only when it is listed in the ``detection_encodings`` argument and
    the content really decodes with it. In every other case, including when chardet cannot name an encoding
    at all, the ``default_detection_encoding`` argument is returned.

    :param bytes content: The text in binary form.
    :returns: Encoding type.
    """
    args = get_args()
    guess = chardet.detect(content)['encoding']
    # chardet reports None when it cannot identify an encoding (e.g. for empty input); calling .lower() on
    # that would raise AttributeError, so fall back to the default right away.
    if guess is None:
        return args.default_detection_encoding

    encoding = guess.lower()
    allowed = args.detection_encodings
    if not allowed or encoding not in allowed:
        return args.default_detection_encoding

    # Try if the encoding is valid
    try:
        content.decode(encoding=encoding)
    except (LookupError, UnicodeError):
        # LookupError: Python has no codec with that name. UnicodeError: the content is not valid in it.
        return args.default_detection_encoding
    return encoding
def convert(content):
    """Decodes content of an arbitrary encoding into text.

    The encoding is guessed with ``_get_encoding`` and logged at info level before decoding.

    :param bytes content: The content to be converted.
    :returns: The decoded string.
    """
    detected = _get_encoding(content)
    logger.info('Encoding detected as %s', detected)
    text = content.decode(encoding=detected)
    return text
def convert_file(filepath, out_encoding):
    """Converts a file to the given encoding in place.

    The file is read as bytes, decoded with the detected encoding and written back to the same path using
    ``out_encoding``. Characters that cannot be represented in ``out_encoding`` are dropped
    (``errors='ignore'``), and line endings are written unchanged (``newline=''``).

    :param filepath: The path of the file to be converted.
    :param out_encoding: The encoding to be used.
    :raises ConversionException: If reading, decoding or writing the file fails.
    """
    try:
        if out_encoding is not None:
            # open(..., "w") truncates the file before the codec name is resolved, so an unknown encoding
            # would wipe the file. Encoding an empty string raises LookupError for a bad name up front.
            "".encode(out_encoding)
        with open(filepath, "rb") as stream:
            content = stream.read()
        decoded = convert(content)
        with open(filepath, "w", encoding=out_encoding, errors='ignore', newline='') as stream:
            stream.write(decoded)
    except Exception as err:
        # 'except Exception' lets KeyboardInterrupt/SystemExit propagate instead of being wrapped.
        logger.exception("Failed to convert file")
        raise ConversionException(_('Failed to convert encoding in file %s') % filepath) from err
def convert_file_encoding_in_folder(folder_path, out_encoding, file_extension):
    """Re-encodes every file in the folder whose path ends with the given extension.

    This is a generator: nothing happens (including the file-in-use check) until it is iterated, and each
    file is converted just before its path is yielded.

    :param folder_path: The path of the folder.
    :param out_encoding: The encoding to be used.
    :param file_extension: The file extension.
    :returns: Yields the path of each converted file.
    """
    utils.check_if_any_file_in_use(folder_path, [file_extension])
    pattern = os.path.join(folder_path, '**')
    for candidate in utils.natsorted_glob(pattern):
        if not candidate.endswith(file_extension):
            continue
        convert_file(candidate, out_encoding)
        logger.debug("Converted encoding for %s", candidate)
        yield candidate
def _libreoffice_installed_windows():
    """Checks that registry keys added during the installation of Libreoffice exist.

    On success the ``Path`` value found in the registry is cached in ``_DATA['libreoffice_path']``, so later
    calls return without touching the registry again.

    :returns: True if a path is cached or could be read from the registry, False otherwise.
    """
    if _DATA['libreoffice_path']:
        return True
    for reg_key in _DATA['reg_keys']:
        try:
            # Registry handles are context managers; 'with' closes them instead of leaking them.
            with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, reg_key) as key:
                # WARNING: Assumes that this registry path's first subkey is the one that contains the
                # installation path. Can break easily if new registry keys are added into Libreoffice.
                version = winreg.EnumKey(key, 0)
            with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, "%s\\%s" % (reg_key, version)) as version_key:
                _DATA['libreoffice_path'] = winreg.QueryValueEx(version_key, "Path")[0]
            return True
        except OSError:
            # FileNotFoundError (missing key/value) is an OSError; EnumKey raises a plain OSError when the
            # key has no subkeys. Either way this registry location is unusable, so try the next one.
            continue
    return False
def _libreoffice_installed_linux():
    """Tells whether a ``soffice`` executable can be found on the PATH (Linux)."""
    soffice_location = shutil.which('soffice')
    return soffice_location is not None
def libreoffice_installed():
    """Test if Libreoffice is installed, using the check that matches the current platform.

    :returns: The result of the Windows registry check on Windows, otherwise of the Linux PATH check.
    """
    platform_check = _libreoffice_installed_windows if utils.is_windows() else _libreoffice_installed_linux
    return platform_check()
def _get_soffice_executable():
    """Gets the executable that should be used to invoke Libreoffice.

    :returns: "soffice" on Linux, the path cached in ``_DATA['libreoffice_path']`` on Windows, or None (after
        logging a warning) when Libreoffice is not installed.
    """
    if libreoffice_installed():
        return _DATA['libreoffice_path'] if utils.is_windows() else "soffice"

    logger.warning(_('Libreoffice is not installed on this system, Kvalikirstu cannot'
                     ' convert pdf/docx/rtf etc. files to odt.'))
    return None
def _get_file_encoding(filepath):
    """Reads the whole file as bytes and returns the encoding guessed for that content.

    :param filepath: The path of the file to inspect.
    :returns: Encoding type, as returned by ``_get_encoding``.
    """
    with open(filepath, "rb") as binary_file:
        raw_bytes = binary_file.read()
    guessed = _get_encoding(raw_bytes)
    return guessed
def _convert_file(filepath, executable, output_filter, timeout):
    """Converts a single file with Libreoffice and deletes the source file when Libreoffice exits with 0.

    The converted file is written next to the source file (``--outdir`` is the source file's directory).

    :param filepath: The path of the file to be converted.
    :param executable: The Libreoffice executable to invoke.
    :param output_filter: The value passed to Libreoffice's ``--convert-to`` option.
    :param timeout: The timeout passed to ``subprocess.call``.
    """
    command = [executable]
    if utils.get_path_extension(filepath) == '.txt' and 'utf-8' in _get_file_encoding(filepath):
        # Use UTF-8 in filter for UTF-8 text(matches both UTF-8 BOM and UTF-8)
        # The filter is passed as ONE argument without embedded quote characters: in a shell the quotes
        # only group the words, so in an argument list they would become part of the filter name. Using a
        # list also avoids handing a command string to subprocess without shell=True, which POSIX treats
        # as the program name, and it quotes an executable path containing spaces correctly on Windows.
        command.append('--infilter=Text (encoded):UTF8,LF,,')
    command += ['--headless', '--convert-to', output_filter, filepath, '--outdir', os.path.dirname(filepath)]

    result = subprocess.call(command, timeout=timeout)
    if result == 0:
        # Conversion reported success, the original is no longer needed.
        os.unlink(filepath)
def _convert_with_libreoffice(path, output_filter, extensions, timeout):
    """Converts every file with one of the given extensions found under the path.

    This is a generator; each file is converted just before its path is yielded. Nothing is yielded when
    no Libreoffice executable is available or no matching files are found.

    :param path: The path of the files to be converted.
    :param output_filter: The value passed to Libreoffice's ``--convert-to`` option.
    :param extensions: The file extensions to convert.
    :param timeout: The timeout used for Libreoffice.
    :returns: Yields the path of each source file after it has been handed to Libreoffice.
    """
    soffice = _get_soffice_executable()
    candidates = utils.get_files_with_extensions(path, extensions)
    if not (soffice and candidates):
        return
    for source_file in candidates:
        _convert_file(source_file, soffice, output_filter, timeout)
        yield source_file
def convert_files_with_libreoffice_to_odt(path, timeout):
    """Runs Libreoffice on the files under the path that can be converted to odt.

    This is a generator: the file-in-use check and the conversions run as it is iterated.

    :param path: The path of the files to be converted.
    :param timeout: The timeout used for Libreoffice.
    :returns: Yields the path of each processed source file.
    """
    checked_formats = utils.SUPPORTED_FORMATS + utils.CONVERTABLE_FORMATS
    utils.check_if_any_file_in_use(path, checked_formats)
    conversions = _convert_with_libreoffice(path, "odt", utils.CONVERTABLE_TO_ODT, timeout)
    yield from conversions
def convert_files_with_libreoffice_to_txt(path, timeout):
    """Runs Libreoffice on the files under the path that can be converted to txt.

    This is a generator: the file-in-use check and the conversions run as it is iterated.

    :param path: The path of the files to be converted.
    :param timeout: The timeout used for Libreoffice.
    :returns: Yields the path of each processed source file.
    """
    checked_formats = utils.SUPPORTED_FORMATS + utils.CONVERTABLE_FORMATS
    utils.check_if_any_file_in_use(path, checked_formats)
    conversions = _convert_with_libreoffice(path, "txt", utils.CONVERTABLE_TO_TXT, timeout)
    yield from conversions