# Source code for kvalikirstu2.header_scanner

""" A module for scanning headers from data files.
"""
import logging
from collections import OrderedDict
from kvalikirstu2 import study
from kvalikirstu2 import utils
from kvalikirstu2 import paragraph_analyzer
from kvalikirstu2 import exceptions
from kvalikirstu2.localization import _

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


def get_header_info_for_study(path: str):
    """Inits HeaderInfo with headers and values scanned from files.

    Every scanned header is added to a fresh ``study.HeaderInfo`` and each
    of its values is stored in ``value_mapping``. A warning is logged when
    the scan finds no headers. An ``exceptions.EncodingError`` raised while
    scanning is logged as a warning instead of being propagated. Built-in
    headers are initialised in every case before the object is returned.

    :param str path: The path containing data files for the study.
    :returns: The populated ``study.HeaderInfo`` instance.
    """
    logger.debug('Scanning headers from path %s', path)
    header_info = study.HeaderInfo()
    try:
        headers = _scan_files(path)

        for header, values in headers.items():
            header_info.add_header(header)
            for value in values:
                # The mapping is keyed by the string form of the
                # (header, value) tuple.
                value_map_key = str((header, value))
                header_info.value_mapping[value_map_key] = value

        if not headers:
            logger.warning(_("No headers detected for the selected data!\n\n"
                             "If the data is not in textual form, you need to create a header list.\n"
                             "Rename data files (if required) before creating the header list.\n\n"
                             "If the data is in textual form, try converting the files."))
    except exceptions.EncodingError as decode_ex:
        # Best effort: report the problem and continue to the built-in headers.
        logger.warning(str(decode_ex))

    header_info.init_builtin_headers()
    return header_info
def _add_line(headers, line):
    """Store the header/value pair parsed from ``line`` in ``headers``.

    Lines that ``utils.is_header_line`` rejects are ignored.

    :param headers: Dictionary of headers and values; modified in place.
    :param line: The line to be parsed.
    """
    if not utils.is_header_line(line, None):
        return

    name, parsed_value = utils.parse_header(line)
    # Create the value list on first sight of a header, then append to it.
    headers.setdefault(name, []).append(parsed_value)


def _scan_files(path):
    """Collect headers and their values from the study files under ``path``.

    Only lines of paragraphs typed ``ParagraphType.HEADER`` are considered.

    :param str path: The directory path to be scanned.
    :returns: OrderedDict mapping each found header to a list of its values.
    """
    found = OrderedDict()
    for data_file in utils.get_supported_files(path):
        reader = paragraph_analyzer.ParagraphReader(data_file, use_temp=False)
        for par in reader.read_paragraphs():
            if par.par_type != paragraph_analyzer.ParagraphType.HEADER:
                continue
            for text_line in par.lines:
                _add_line(found, text_line)
    return found