#!/usr/bin/env python3
# description: extract *Log options from all apache *.conf to create .log.fmt files
# 
# This is a simpler version of the modseccfg vhost reader.
#

import os, re, sys, random
import subprocess
import traceback
import json
from pprint import pprint
import logfmt1


# extraction patterns
class rx:
    """
        Precompiled regular expressions used by the config reader.

        All patterns are written as raw strings, so each backslash reaches
        the regex engine exactly as typed (no reliance on Python passing
        unknown string escapes through).
    """

    # one entry of the include dump, such as '(*) /etc/apache2/main.conf'
    dump_includes = re.compile(r"^\s*\([\d*]+\)\s+(.+)$", re.M)
    # directives we care about (to detect relevant .conf files)
    interesting = re.compile(
        r"""
        ^ \s*
         ( (Error|Custom|Global|Forensic|Transfer)Log | (Error)?LogFormat )           # log directives
        """,
        re.M | re.I | re.X
    )
    # extract directive line including line continuations (<\><NL>);
    # group 1 = directive name or opening <Section, group 2 = raw argument text
    configline = re.compile(
        r""" ^
        [\ \t]*                           # whitespace \h*
        # (?:Use \s{1,4})?                # optional: `Use␣` to find custom macros like `Use SecRuleRemoveByPath…`
        (
          \w+ |                           # alphanumeric directive
          </?(?:File|Loc|Dir|If\b)\w*     # or <Wrap> section
        )\b
          [\ \t]*                         # whitespace \h+
        (
          (?: [^\n\\]+ | [\\]. )*         # literals, or backslash + anything
        )
        (?: $ | >.*$ )                    # account for line end or closing >
        """,
        re.M | re.S | re.X
    )
    # to strip <\><NL> (escaped linebreaks plus the indentation that follows)
    escnewline = re.compile(r"[\\][\n]\s*")
    # handle quoted/unquoted directive arguments (not entirely sure if Apache does \" escaped quotes within);
    # group 1 = content of a quoted argument, group 2 = plain argument,
    # whitespace and comments match with both groups empty
    split_args = re.compile(
        r"""
        (?:\s+)   |                       # skip whitespace (\K not supported in python re, so removing empty matches in postprocessing)
        \#.*$  |                          # skip trailing comment (which isn't technically allowed, but)
        " ((?:[^\\"]+|\\ .)+) "  |        # quoted arguments
        (?!\#) ([^"\s]+)                  # plain arguments (no quotes, no spaces)
        """,
        re.X
    )
    # envvars: `NAME=value` shell assignments, optionally prefixed by `export` and optionally quoted
    shell_vars = re.compile(
        r"""
        ^\s* (?:export\s+)?  ([A-Z_]+)  =  ["']?  ([\w/\-.]+)  ["']?
        """,
        re.M | re.X
    )
    

# temporary state variables
class tmp:
    """
        Shared, mutable lookup tables for one scan run (used as a plain
        namespace, never instantiated).
    """

    # values substituted for ${NAME} references in log file paths;
    # read_env_vars() merges further assignments into this dict
    # (alternative log dir seen on other systems: /var/log/httpd/)
    env = dict(APACHE_LOG_DIR="/var/log/apache2")

    # shell files that read_env_vars() scans for NAME=value assignments
    env_locations = ["/etc/apache2/envvars", "/etc/default/httpd"]

    # format nickname → format string; vhost.logformat() adds to / overrides these
    # (unused candidate entry: "default" = %h %l %u %t "%r" %>s %b)
    # (note: %t == [%02d/%s/%d:%02d:%02d:%02d %c%.2d%.2d])
    log_formats = dict(
        error="[%t] [%l] [pid %P] %F: %E: [client %a] %M",
        common='%h %l %u %t "%r" %>s %O',
        forensic='+%{forensic-id}n|%r|Host:%H|%{UA}|%{H*}\n-%{forensic-id}n',
    )

    # log file name → format nickname, e.g. "../fn.log" → "combined";
    # filled by vhost.customlog(), consumed by mk_fmt()
    log_map = dict()


# encapsulate properties of config file (either vhosts, SecCfg*, or secrule collections)
class vhost:
    """
        Collects the log directives found in one Apache *.conf file.

        Constructing an instance immediately scans `src`. Log file names are
        appended to `self.logs`; the "file -> format nickname" and
        "nickname -> format string" associations are stored in the shared
        `tmp.log_map` and `tmp.log_formats` tables.

        Parameters
        ----------
        fn : str
            *.conf filename
        src : str
            config file source
        cfg_only : bool
            if true, no directive handler is invoked

    """

    # lowercased directive names which extract() forwards to the same-named method;
    # an explicit list keeps helper methods/attributes (logs, extract, split_args, …)
    # from being invoked by a config line that happens to start with their name
    _directives = frozenset((
        "customlog", "errorlog", "forensiclog", "globallog", "transferlog",
        "logformat", "errorlogformat",
    ))

    # split *.conf directives, dispatch onto assignment/extract methods
    def __init__(self, fn, src, cfg_only=False):
        self.fn = fn
        self.logs = []
        self.extract(src, cfg_only=cfg_only)

    # extract directive lines
    def extract(self, src, cfg_only=False):
        """
            Find all directive lines in `src` and call the handler method
            named after the lowercased directive with the split argument list.
            With `cfg_only` set nothing is dispatched (flag kept from the
            modseccfg reader this class was derived from).
        """
        if cfg_only:
            return
        for name, args in rx.configline.findall(src):
            name = name.lower()
            if name in self._directives:
                getattr(self, name)(self.split_args(args))

    # strip \\ \n line continuations, split all "args"
    def split_args(self, args):
        """
            Turn the raw argument text of a directive into a list of strings.
            Quoted arguments lose their surrounding quotes, but backslashes
            inside them are kept. Example: '/x.log "%h %l"' → ['/x.log', '%h %l']
        """
        args = rx.escnewline.sub(" ", args)
        # each match is (quoted, plain); whitespace/comment matches leave both empty
        found = [plain or quoted for quoted, plain in rx.split_args.findall(args)]
        return [s for s in found if s]

    # apply ${ENV} vars
    def var_sub(self, s):
        """ Replace ${NAME} with tmp.env[NAME]; unknown names become "". """
        return re.sub(r'\$\{(\w+)\}', lambda m: tmp.env.get(m.group(1), ""), s)

    # apache: log directives
    def customlog(self, args):
        """
            CustomLog <file> <format|nickname> […]: record the log file and
            which format it uses. Further arguments (env=/expr=) are ignored.
        """
        if len(args) < 2:
            return  # incomplete directive, nothing to associate
        fn, ty = self.var_sub(args[0]), args[1]
        self.logs.append(fn)
        if "%" in ty:  # turn literal placeholder format into temporary name
            # NOTE: hash() of a str differs between interpreter runs, so the
            # generated nickname is only meaningful within the current run
            ty, fmt = hex(hash(ty))[4:], ty
            self.logformat([fmt, ty])
        tmp.log_map[fn] = ty

    def errorlog(self, args):
        """ ErrorLog <file>: uses the "error" format. """
        if args:
            self.customlog([args[0], "error"])

    def forensiclog(self, args):
        """ ForensicLog <file>: uses the "forensic" format. """
        if args:
            self.customlog([args[0], "forensic"])

    def globallog(self, args):
        """ GlobalLog <file> [<format|nickname>]: falls back to "combined". """
        if args:
            ty = args[1] if len(args) > 1 and args[1] else "combined"
            self.customlog([args[0], ty])

    def transferlog(self, args):
        """ TransferLog <file>: uses the "transfer" format (see logformat). """
        if args:
            self.customlog([args[0], "transfer"])

    def logformat(self, args):
        """
            LogFormat <format> [<nickname>]: store the format string; without
            a nickname it is kept under "transfer". Escaped quotes (\\") in the
            format are turned into plain quotes.
        """
        if not args:
            return
        name = args[1] if len(args) > 1 else "transfer"
        tmp.log_formats[name] = args[0].replace('\\"', '"')

    def errorlogformat(self, args):
        """
            ErrorLogFormat [connection|request] <format>: only the form without
            modifier replaces the "error" format; the connection/request
            variants describe supplementary lines and are skipped.
        """
        if not args:
            return
        if len(args) > 1 and args[0].lower() in ("connection", "request"):
            return
        self.logformat([args[0], "error"])

    # could look into LoadModule directives to determine errorlogformat
    # from e.g. mpm_prefork being present


# scan for APACHE_ENV= vars
def read_env_vars():
    """
        Merge NAME=value assignments from the files in tmp.env_locations
        into tmp.env. Files that do not exist are skipped; later files
        override values from earlier ones.
    """
    for fn in tmp.env_locations:
        try:
            # open directly instead of exists()+open(): no gap between check and use
            with open(fn, "r", encoding="utf-8") as f:
                src = f.read()
        except FileNotFoundError:
            continue
        tmp.env.update(dict(rx.shell_vars.findall(src)))

# iterate over all Apache config files, visit relevant ones (vhosts/mod_security configs)
def scan_all():
    """
        Load the environment variables, then read every config file reported
        by apache_dump_includes() and run the vhost reader over those that
        contain at least one log directive.
    """
    read_env_vars()
    for fn in apache_dump_includes():
        with open(fn, "r", encoding="utf-8") as f:
            src = f.read()
        if rx.interesting.search(src):
            # instantiating is enough: vhost() records its findings in the tmp tables
            vhost(fn, src)

# get *.conf list from apache2ctl
def apache_dump_includes():
    """
        Run `apache2ctl -t -D DUMP_INCLUDES` and return the list of config
        file names found in its standard output. The exit status is not
        evaluated; a missing apache2ctl binary raises FileNotFoundError.
    """
    cmd = ["apache2ctl", "-t", "-D", "DUMP_INCLUDES"]
    # run() waits for the child and closes the pipe, unlike a bare Popen(...).stdout
    proc = subprocess.run(cmd, stdout=subprocess.PIPE)
    return rx.dump_includes.findall(proc.stdout.decode("utf-8"))



# traverse log files, create .fmt descriptor with current format string
def mk_fmt():
    """
        Write a `<logfile>.fmt` JSON descriptor next to every log file in
        tmp.log_map whose format nickname has a known format string.

        An existing descriptor is loaded and updated: "class" is kept if
        present, "record" is set to the current format string, and "regex"
        is always regenerated through logfmt1.regex(). Read and write
        problems are reported on stdout and do not abort the loop.
    """
    for fn, ty in tmp.log_map.items():

        fn_fmt = f"{fn}.fmt"
        fmt_record = tmp.log_formats.get(ty)
        if not fmt_record:
            continue  # nickname without a known format string

        j = {}
        if os.path.exists(fn_fmt):
            try:
                with open(fn_fmt, "r", encoding="utf-8") as f:
                    j = json.load(f)
            except (OSError, ValueError) as e:  # ValueError covers JSON and decoding errors
                j = {}
                print(f"WARN: {fn_fmt} contained invalid json: {str(e)}")
            if not isinstance(j, dict):
                # valid JSON, but not an object we could add keys to
                print(f"WARN: {fn_fmt} did not contain a json object")
                j = {}
        j.setdefault("class", f"apache {ty}")
        j["record"] = fmt_record

        # add descriptors for known placeholders (always regenerated)
        j["regex"] = logfmt1.regex(j)

        print(f"→ {fn_fmt}")
        try:
            # serialize first, so a failure here cannot leave a truncated file behind
            text = json.dumps(j, indent=4)
            with open(fn_fmt, "w", encoding="utf-8") as f:
                f.write(text)
        except (OSError, TypeError, ValueError) as e:
            print("ERR: " + str(e))



# Script body: executed whenever this file is run (there is no __main__ guard,
# so importing it has the same effect).
# Order matters: scan_all() fills tmp.log_map / tmp.log_formats,
# which mk_fmt() then turns into the .fmt files.
scan_all()
mk_fmt()


