# Source code for delphi.translators.for2py.preprocessor

"""
This module implements functions to preprocess Fortran source files prior to
parsing to fix up some constructs (such as continuation lines) that are
problematic for the OpenFortranParser front end.

Author:
    Saumya Debray
"""

import os
import re
import sys
from collections import OrderedDict
from typing import Dict, List, Optional, Tuple

from delphi.translators.for2py.syntax import (
    line_is_comment,
    line_is_continuation,
    line_is_continued,
    line_is_include,
)


def separate_trailing_comments(lines: List[str]) -> List[str]:
    """Strip partial-line (trailing) comments from Fortran source lines.

    Lines that are whole-line comments (as reported by line_is_comment())
    are left untouched.  Every other line is passed through
    split_trailing_comment(); when a trailing comment is found, the line is
    replaced by its code part only.

    Args:
        lines: A list of Fortran source code lines.  The list is modified
            in place.

    Returns:
        The same list object, with trailing comments removed.
    """
    for i, code_line in enumerate(lines):
        if line_is_comment(code_line):
            continue

        code_part, comment_part = split_trailing_comment(code_line)
        if comment_part is not None:
            lines[i] = code_part

    return lines


def merge_continued_lines(lines, f_ext):
    """Merge continuation lines into the statement they continue.

    Two continuation styles are handled:

    * a line that line_is_continuation() recognizes (given f_ext) is
      appended to the line before it, after dropping its first
      non-blank character (the continuation marker);
    * a line that line_is_continued() recognizes has its last non-blank
      character (the continuation marker) dropped and the following
      line appended to it.

    Args:
        lines: A list of Fortran source code lines.  The list is modified
            in place.
        f_ext: The file extension of the input file; it is forwarded to
            line_is_continuation(), which uses it to decide how
            continuation lines are written (fixed-form vs. free-form).

    Returns:
        The same list object with continuation lines merged.  A line
        produced by merging a continuation line ends in exactly one
        newline.

    Raises:
        AssertionError: if the first line is a continuation line, or if
            the last line is marked as continued.
    """
    chg = True
    while chg:
        chg = False
        i = 0
        while i < len(lines):
            line = lines[i]

            if line_is_continuation(line, f_ext):
                # Raised explicitly (rather than via `assert`) so the
                # check is not stripped when Python runs with -O.
                if i == 0:
                    raise AssertionError(
                        "Weird continuation line (line {}): {}".format(
                            i + 1, line
                        )
                    )
                # Drop the continuation character and surrounding
                # whitespace, including the line's own newline, so the
                # merged line gets exactly one newline.
                curr_line_code = line.lstrip()[1:].strip()
                lines[i - 1] = (
                    lines[i - 1].rstrip() + " " + curr_line_code + "\n"
                )
                del lines[i]
                chg = True
                # Do not advance: the line that followed now sits at
                # index i and must be examined too.
                continue

            if line_is_continued(line):
                if i >= len(lines) - 1:
                    raise AssertionError(
                        "Continued line has no following line "
                        "(line {}): {}".format(i + 1, line)
                    )
                # Drop the trailing continuation character.
                curr_line_code = line.rstrip()[:-1].rstrip()
                lines[i] = curr_line_code + " " + lines[i + 1].lstrip()
                del lines[i + 1]
                chg = True
                # Do not advance: the merged line may itself be continued.
                continue

            i += 1
    return lines


def discard_comments(lines):
    """Return a new list holding only the lines that carry code.

    A line is dropped when line_is_comment() reports it as a comment or
    when it contains nothing but whitespace.  The input list is not
    modified.
    """
    kept = []
    for text in lines:
        if line_is_comment(text) or text.strip() == "":
            continue
        kept.append(text)
    return kept


def split_trailing_comment(line: str) -> Tuple[str, Optional[str]]:
    """Split a line into (code_part, comment_part).

    code_part is the line up to but not including any trailing comment
    (the '!' comment character and everything after it), right-stripped
    and terminated with a newline.  comment_part is the trailing comment
    itself, from the '!' to the end of the line.

    A '!' inside a single- or double-quoted string is not a comment
    character.  A '!' at index 5 of the line (column 6) is never treated
    as the start of a comment.

    Args:
        line: A line of Fortran source code.

    Returns:
        A pair (code_part, comment_part).  If the line has no trailing
        comment, or contains an unbalanced quote (a warning is then
        written to stderr), the pair is (line, None).
    """
    if "!" not in line:
        return (line, None)

    i = 0
    while i < len(line):
        ch = line[i]
        if ch in ("'", '"'):
            # Skip over the quoted string: a '!' inside it is literal text.
            closing = line.find(ch, i + 1)
            if closing == -1:
                sys.stderr.write(
                    f"WEIRD: unbalanced quote {ch}: line = {line}"
                )
                return (line, None)
            i = closing + 1
        elif ch == "!" and i != 5:  # partial-line comment
            return (line[:i].rstrip() + "\n", line[i:])
        else:
            i += 1

    return (line, None)


def path_to_target(infile, target):
    """Resolve the path of `target` relative to the directory of `infile`.

    Args:
        infile: Path of the file that refers to `target`, using '/' as the
            directory separator.
        target: Path of the referenced file.

    Returns:
        `target` unchanged if it starts with '/' (absolute path) or if
        `infile` has no directory component; otherwise `target` prefixed
        with the directory part of `infile`.
    """
    # startswith() rather than target[0], so an empty target does not
    # raise IndexError.
    if target.startswith("/"):
        return target

    # If infile has a path specified, specify target relative to that path.
    directory, separator, _ = infile.rpartition("/")
    if separator:
        return "{}/{}".format(directory, target)

    # Otherwise simply return target.
    return target


def process_includes(lines, infile):
    """Expand INCLUDE statements, which behave like the #include
    preprocessor directive in C.

    Each line for which line_is_include() returns a file name is replaced
    by the preprocessed contents of that file.  The file is located with
    path_to_target(), i.e. relative to the directory of `infile` unless
    the name is absolute.  Scanning is repeated until no INCLUDE
    statement remains.

    Args:
        lines: A list of Fortran source code lines.
        infile: Path of the file the lines came from.

    Returns:
        The list of lines with every INCLUDE statement expanded.  When
        there is nothing to expand, the input list itself is returned.
    """
    while True:
        include_positions = [
            pos
            for pos, text in enumerate(lines)
            if line_is_include(text) is not None
        ]
        if not include_positions:
            return lines

        # Work from the last INCLUDE statement backwards, so that splicing
        # in a file's contents does not shift the index of any statement
        # still waiting to be processed.
        for pos in reversed(include_positions):
            include_f = line_is_include(lines[pos])
            assert include_f is not None
            included = get_preprocessed_lines_from_file(
                path_to_target(infile, include_f)
            )
            lines = lines[:pos] + included + lines[pos + 1 :]


def refactor_select_case(lines):
    """Rewrite open-ended ranges so that they are always in i:j form.

    A range with no lower bound, such as <:3>, becomes <'-Inf':3>, and a
    range with no upper bound, such as <3:>, becomes <3:'Inf'>.  This is
    done so that the Fortran front end (OFP) recognizes the <:3> and <3:>
    forms used in CASE statements.

    NOTE(review): the patterns are applied to every line, not only to
    lines that are CASE statements, so an array section such as a(:n)
    is rewritten as well -- confirm that this is intended.

    Args:
        lines: A list of Fortran source code lines.  The list is modified
            in place.

    Returns:
        The same list object with open-ended ranges rewritten.
    """
    # An opening '(' or ',' followed by ':' and the start of a bound.
    prefix_regex = re.compile(r"([(,])\s*:\s*(-?[\d\w+])", re.I)
    # The end of a bound followed by ':' and a closing ')' or ','.
    suffix_regex = re.compile(r"(-?[\d\w+])\s*:\s*([),])", re.I)

    for i, code_line in enumerate(lines):
        # Group backreferences make each occurrence keep its own
        # delimiter and bound when a line holds several open ranges.
        code_line = prefix_regex.sub(r"\g<1>'-Inf':\g<2>", code_line)
        code_line = suffix_regex.sub(r"\g<1>:'Inf'\g<2>", code_line)
        lines[i] = code_line

    return lines



# The regular expressions defined below are used for processing implicit array
# declarations, which the preprocessor converts into explicit array declarations.

# A line starting with a base type name followed by whitespace.
# Groups: (1) leading indentation, (2) the type, (3) the rest of the line.
BASE_TYPES = r"^(\s*)(integer|real|double\s+precision|complex|character|logical)\s+(.*)"
RE_BASE_TYPES = re.compile(BASE_TYPES, re.I)

# Text beginning with the keyword DIMENSION or FUNCTION.  The \b keeps an
# identifier that merely starts with one of these keywords (for example
# DIMENSIONLESS) from being mistaken for the keyword itself.
KWDS = r"\s*(DIMENSION|FUNCTION)\b.*"
RE_KWDS = re.compile(KWDS, re.I)

# An array written as name(size).  Groups: (1) the name, (2) the size.
IMPLICIT_ARRAY = r"(\w+)\((\w+)\)"
RE_IMPLICIT_ARRAY = re.compile(IMPLICIT_ARRAY, re.I)

# A name optionally followed by (size).  Groups: (1) the name,
# (2) the parenthesized size if present, (3) the size without parentheses.
VAR_OR_ARRAY = r"\s*(\w+)(\((\w+)\))?"
RE_VAR_OR_ARRAY = re.compile(VAR_OR_ARRAY, re.I)

# The comma separating two entries of a declaration list.
DECL_CONTINUATION = r"\s*,\s*"
RE_DECL_CONTINUATION = re.compile(DECL_CONTINUATION, re.I)


def implicit_array_decl_parameters(line):
    """Extract the parts of a line holding an implicit array declaration.

    Returns a triple (indentation, type, rest) -- the initial indentation,
    the type of the array, and the rest of the line after the type -- if
    the line contains an implicit array declaration; otherwise returns
    None.  For a CHARACTER type, a length specification of the form
    (len=N) or *N directly after the type name is made part of the type.
    """
    base_match = RE_BASE_TYPES.match(line)
    if base_match is None:
        return None

    indentation, type_name, remainder = base_match.group(1, 2, 3)

    if type_name.lower() == "character":
        # Fold a length specification into the type itself.
        length_spec = re.match(
            r"\s*(\(\s*len\s*=\s*\d+\s*\)|\*\s*\d+)", remainder
        )
        if length_spec is not None:
            type_name += length_spec.group(1)
            remainder = remainder[length_spec.end():]

    # Text that begins with a keyword such as DIMENSION or FUNCTION is not
    # an implicit declaration, and neither is text that holds nothing of
    # the form name(size).
    starts_with_keyword = RE_KWDS.match(remainder) is not None
    if starts_with_keyword or RE_IMPLICIT_ARRAY.search(remainder) is None:
        return None

    return (indentation, type_name, remainder)



def fix_implicit_array_decls(lines):
    """Replace implicit array declarations by explicit ones.

    A declaration such as

        integer a(10), b(10), n

    is rewritten as one line per distinct array size,

        integer, DIMENSION(10) :: a, b
        integer :: n

    Lines that implicit_array_decl_parameters() does not recognize are
    copied through unchanged.  So is a recognized line whose declaration
    list cannot be parsed completely (for example one that uses '::', a
    multi-dimensional array or an initializer): rewriting such a line
    would drop or corrupt part of the declaration.

    Args:
        lines: A list of Fortran source code lines.

    Returns:
        A new list of lines with implicit array declarations made
        explicit.
    """
    out_lines = []
    for line in lines:
        implicit_decl_parms = implicit_array_decl_parameters(line)
        if implicit_decl_parms is None:
            out_lines.append(line)
            continue

        indentation, base_type, rest = implicit_decl_parms

        # Maps an array size (None for a scalar) to the names declared
        # with that size, in order of appearance.
        decls = {}
        entry = RE_VAR_OR_ARRAY.match(rest)
        while entry is not None:
            name, size = entry.group(1), entry.group(3)
            decls.setdefault(size, []).append(name)

            # Move past this entry and any comma separator after it.
            rest = rest[entry.end():]
            separator = RE_DECL_CONTINUATION.match(rest)
            if separator is not None:
                rest = rest[separator.end():]

            entry = RE_VAR_OR_ARRAY.match(rest)

        # Rewrite only if the whole declaration list was understood;
        # otherwise keep the source line as it is.
        if not decls or rest.strip() != "":
            out_lines.append(line)
            continue

        # Finally, construct the output lines with implicit declarations
        # replaced by explicit ones.
        for size, names in decls.items():
            name_list = ", ".join(names)
            if size is not None:
                out_lines.append(
                    "{}{}, DIMENSION({}) :: {}\n".format(
                        indentation, base_type, size, name_list
                    )
                )
            else:
                out_lines.append(
                    "{}{} :: {}\n".format(indentation, base_type, name_list)
                )

    return out_lines


def preprocess_lines(lines, infile, forModLogGen=False):
    """Run the preprocessing steps over a list of Fortran source lines.

    In order: blank lines are dropped, trailing comments are stripped,
    comment lines are discarded, continuation lines are merged, implicit
    array declarations are made explicit, INCLUDE statements are expanded
    (unless forModLogGen is set) and CASE ranges are refactored.

    Args:
        lines: The source lines to preprocess.
        infile: Path of the file the lines came from; its extension is
            passed to merge_continued_lines() and the path itself to
            process_includes().
        forModLogGen: When true, INCLUDE statements are left unexpanded.

    Returns:
        The preprocessed list of lines.
    """
    f_ext = os.path.splitext(infile)[1]

    result = [text for text in lines if text.rstrip() != ""]
    result = separate_trailing_comments(result)
    result = discard_comments(result)
    result = merge_continued_lines(result, f_ext)
    result = fix_implicit_array_decls(result)

    # Module log file generation does not need the contents of included
    # external files, so INCLUDE processing is skipped in that case.
    if not forModLogGen:
        result = process_includes(result, infile)

    return refactor_select_case(result)


def get_preprocessed_lines_from_file(infile, forModLogGen=False):
    """Read `infile` (decoded as latin-1) and return its preprocessed lines.

    The lines read from the file are handed to preprocess_lines() together
    with `infile` and `forModLogGen`.
    """
    with open(infile, mode="r", encoding="latin-1") as source:
        raw_lines = source.readlines()

    preprocessed = preprocess_lines(raw_lines, infile, forModLogGen)
    return preprocessed