Source code for docp_core.utilities

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides utility-based functionality for the
            project.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

"""

import os
import re
from glob import glob
from utils4 import futils


class Utilities:
    """General (cross-project) utility functions.

    Every method is a ``staticmethod``; the class only serves as a
    namespace for the helpers.
    """

    # Bullet marker (hyphen, asterisk, ``N.`` or ``N)``) anchored to the
    # start of a line, followed by the bullet's text. Anchoring prevents
    # hyphens or numbers inside ordinary sentences from being treated as
    # bullet markers.
    _RE_BULLET = re.compile(r'^[ \t]*(?:-|\*|[0-9]+[.)])[ \t]*(.*)$', re.MULTILINE)

    @staticmethod
    def build_project_outpath(subpath: str) -> str:
        """Build (and create) the path for project output files.

        Args:
            subpath (str): The sub-path to be appended to the default
                ``~/Desktop/docp`` path.

        If the path does not exist, it will be created automatically.

        Returns:
            str: The full path as a string.

        """
        base = os.path.join(os.path.expanduser('~/Desktop'), 'docp')
        path = os.path.join(base, subpath)
        os.makedirs(path, exist_ok=True)
        return path

    @staticmethod
    def collect_files(path: str, ext: str='**', recursive: bool=False) -> list:
        """Collect all files for a given extension from a path.

        Args:
            path (str): Full path serving as the root for the search.
            ext (str, optional): A specific file extension to search for.
                For example: ``ext = 'pdf'``.

                If anything other than ``'**'`` is provided, all
                alpha-characters are parsed from the string, and prefixed
                with ``*.``. Meaning, if ``'.pdf'`` is passed, the
                characters ``'pdf'`` are parsed and prefixed with ``*.``
                to create ``'*.pdf'``. However, if ``'things.foo'`` is
                passed, the derived extension will be ``'*.thingsfoo'``.
                Defaults to '**', for an 'everything' search.
            recursive (bool, optional): Instruct the search to recurse
                into sub-directories. This applies both to the default
                'everything' search and to a specific extension.
                Defaults to False.

        Raises:
            ValueError: If ``ext`` is not ``'**'`` and contains no
                alpha-characters from which an extension can be derived.

        Returns:
            list: The list of full file paths returned by the ``glob``
            call. Any directory-only paths are removed.

        """
        if ext == '**':
            pattern = os.path.join(path, ext)
        else:
            letters = ''.join(re.findall('[a-zA-Z]+', ext))
            if not letters:
                raise ValueError(f'No alpha-characters found in the extension: {ext!r}')
            name = f'*.{letters}'
            # glob only recurses through a '**' path component, so one
            # must be inserted for a recursive extension-specific search.
            if recursive:
                pattern = os.path.join(path, '**', name)
            else:
                pattern = os.path.join(path, name)
        return [p for p in glob(pattern, recursive=recursive) if os.path.isfile(p)]

    @staticmethod
    def ispdf(path: str) -> bool:
        """Test the file signature. Verify this is a valid PDF file.

        The test is delegated to ``futils.ispdf``.

        Args:
            path (str): Path to the file being tested.

        Returns:
            bool: True if this is a valid PDF file, otherwise False.

        """
        return futils.ispdf(path)

    @staticmethod
    def iszip(path: str) -> bool:
        """Test the file signature. Verify this is a valid ZIP archive.

        The test is delegated to ``futils.iszip``.

        Args:
            path (str): Path to the file being tested.

        Returns:
            bool: True if this is a valid ZIP archive, otherwise False.

        """
        return futils.iszip(path)

    @staticmethod
    def parse_to_keywords(resp: str) -> str:
        r"""Parse the bot's response into a string of keywords.

        Args:
            resp (str): Text response directly from the bot.

        Each bullet point must start its own line (leading spaces or tabs
        are allowed) and be in any of the following forms.

        Asterisk as bullet points:

            - * Spam
            - * Eggs

        Hyphen as bullet points:

            - - Spam
            - - Eggs

        Numbered (1):

            - 1. Spam
            - 2. Eggs

        Numbered (2):

            - 1\) Spam
            - 2\) Eggs

        Any forward slash in the response is replaced by a space before
        the keywords are extracted. Surrounding whitespace is stripped
        from each keyword and empty bullet points are ignored.

        Returns:
            str: A comma separated string of keywords extracted from the
            response, *converted to lower case*. An empty string is
            returned if no bullet points are found.

        """
        # Code point 47 is the forward slash.
        trans = {47: ' '}
        resp_ = resp.translate(trans).lower()
        kwds = (kwd.strip() for kwd in Utilities._RE_BULLET.findall(resp_))
        return ', '.join(kwd for kwd in kwds if kwd)

    @staticmethod
    def remove_duplicate_lines(text: str) -> str:
        """Remove any duplicated lines from the document.

        Generally, this function will be used to remove repeated headers
        and footers from a document.

        Only the first occurrence of each line is kept. The newline
        characters which followed a removed line are retained, so a
        removed line leaves its line break(s) behind.

        Args:
            text (str): A string containing text from which duplicated
                lines are to be removed.

        Returns:
            str: A string containing only the unique lines (or empty
            lines) from the provided text.

        """
        seen = set()    # Fast membership test for lines already emitted.
        kept = []       # Ordered output; order *must* be retained.
        # The capture group makes re.split keep the newline separators.
        for piece in re.split(r'(\n+)', text):
            if not piece:
                continue
            if '\n' in piece:
                # Newline separators are always preserved.
                kept.append(piece)
            elif piece not in seen:
                seen.add(piece)
                kept.append(piece)
        return ''.join(kept)
# Module-level instance of the Utilities class, created when the module
# is imported.
utilities = Utilities()