Source code for kvalikirstu2.header_scanner
""" A module for scanning headers from data files.
"""
import logging
from collections import OrderedDict
from kvalikirstu2 import study
from kvalikirstu2 import utils
from kvalikirstu2 import paragraph_analyzer
from kvalikirstu2 import exceptions
from kvalikirstu2.localization import _
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def get_header_info_for_study(path: str):
    """Build a HeaderInfo from the header lines found in a study's data files.

    Each scanned header is registered with ``add_header`` and each of its
    values is stored in ``value_mapping`` under the key
    ``str((header, value))``. A warning is logged when the scan yields no
    headers. An ``EncodingError`` raised during scanning or registration is
    logged as a warning and not re-raised.

    :param str path: The path containing data files for the study.
    :returns: The HeaderInfo object, after ``init_builtin_headers`` has been
        called on it.
    """
    logger.debug('Scanning headers from path %s', path)
    info = study.HeaderInfo()

    try:
        scanned = _scan_files(path)

        for name, found_values in scanned.items():
            info.add_header(name)
            for item in found_values:
                # The mapping key is the string form of the (header, value) tuple.
                info.value_mapping[str((name, item))] = item

        if not scanned:
            no_headers_message = _(
                "No headers detected for the selected data!\n\n"
                "If the data is not in textual form, you need to create a header list.\n"
                "Rename data files (if required) before creating the header list.\n\n"
                "If the data is in textual form, try converting the files.")
            logger.warning(no_headers_message)
    except exceptions.EncodingError as encoding_error:
        # Best effort: report the problem and continue with what was collected.
        logger.warning(str(encoding_error))

    info.init_builtin_headers()
    return info
def _add_line(headers, line):
    """Record the header and value parsed from ``line`` if it is a header line.

    Lines rejected by ``utils.is_header_line`` are ignored. Otherwise the
    parsed value is appended to the list stored under its header; the list is
    created the first time a header is seen.

    :param headers: Dictionary of headers and their value lists; modified in place.
    :param line: The line to be parsed.
    """
    if not utils.is_header_line(line, None):
        return

    name, content = utils.parse_header(line)
    headers.setdefault(name, []).append(content)
def _scan_files(path):
    """Collect header lines from the supported files found under ``path``.

    Every file returned by ``utils.get_supported_files`` is read as
    paragraphs; the lines of each paragraph typed as HEADER are passed to
    ``_add_line``.

    :param str path: The directory path to be scanned.
    :returns: OrderedDict mapping each found header to the list of its values.
    """
    found = OrderedDict()

    for data_file in utils.get_supported_files(path):
        par_reader = paragraph_analyzer.ParagraphReader(data_file, use_temp=False)
        for par in par_reader.read_paragraphs():
            # Only header paragraphs can contribute header lines.
            if par.par_type != paragraph_analyzer.ParagraphType.HEADER:
                continue
            for text_line in par.lines:
                _add_line(found, text_line)

    return found