diff --git a/maintain.py b/maintain.py index 330a5e1..70556fd 100755 --- a/maintain.py +++ b/maintain.py @@ -17,6 +17,10 @@ from typing import Callable, Iterable, List, Mapping warn = partial(print, file=sys.stderr) + +# === Configuration + + INDEX_DIR = ".index" DEFAULT_CATEGORY = "_toSort" OCR_LANGS = [ @@ -27,6 +31,71 @@ DEFAULT_SCAN_SOURCE = "ADF Duplex" ALTERNATE_SCAN_SOURCE = "Flatbed" MIN_NUM_WIDTH = 6 # only used for INDEX_DIR files +ID_AROUND_RANGE = 10 + + +# === Patterns + + +SCAN_SUFFIXES = [ # Regexes + "jpe?g", + "pdf", + "png", +] + +ID_REGEX = re.compile(r"""^ + ( + (?Pd(igital)?) # no physical original + | + ( + (?P\d+) # simple id + | + (?P\d+)\+ # id and following id + | + (?P\d+)\+\+ # id and following 3 ids (this and following document with each 2 sides) + | + (?P\d+)-(?P\d+) # id range + )(?P\#)? + ) +$""", re.VERBOSE) + +SCAN_REGEX = re.compile(r"""^ + ( # Date + (?P\d{4}-\d{2}-\d{2})_ + )? + # automatic prefix of scanimage + (out)? + # scan id + (?P + """ + ID_REGEX.pattern[1:-1] + r""" + ) + ( # Description (optional) + _(?P.*) + )? + # Suffix + \.(""" + "|".join(SCAN_SUFFIXES) + r""") +$""", re.VERBOSE) +SCAN_WARN_REGEX = re.compile(r"\.(" + "|".join(SCAN_SUFFIXES) + r")$") + +NUMBER_REGEX = re.compile(r"^\d+$") +CONTENT_SPLIT_REGEX = re.compile(r"[\W]") + +DATE_REGEX = re.compile(r"(\d{2,4}-\d{1,2}-\d{1,2}|\d{1,2}\.\d{1,2}\.\d{2,4}|\d{1,2}\.\s+[a-zA-Z]+\s+\d{2,4})") +DATE_FORMATS = [ # date.strptime compatible + "%Y-%m-%d", + "%y-%m-%d", + "%d.%m.%Y", + "%d.%m.%y", + "%d. %B %Y", + "%d. %B %y", + "%d. %b %Y", + "%d. %b %y", +] + + +# === Code + + def build_args(args: Iterable) -> str: return " ".join((shlex.quote(str(e)) for e in args)) @@ -54,22 +123,6 @@ def rlinput(prompt, prefill=None, suggestions=[]): finally: readline.set_startup_hook() -ID_REGEX = re.compile(r"""^ - ( - (?Pd(igital)?) # no physical original - | - ( - (?P\d+) # simple id - | - (?P\d+)\+ # id and following id - | - (?P\d+)\+\+ # id and following 3 ids (this and following document with each 2 sides) - | - (?P\d+)-(?P\d+) # id range - )(?P\#)? - ) -$""", re.VERBOSE) -ID_AROUND_RANGE = 10 @dataclass(eq=True, order=True, frozen=True) class IdRange: @@ -155,44 +208,6 @@ class IdRange: return self.fancy -SCAN_SUFFIXES = [ # Regexes - "jpe?g", - "pdf", - "png", -] - -SCAN_REGEX = re.compile(r"""^ - ( # Date - (?P\d{4}-\d{2}-\d{2})_ - )? - # automatic prefix of scanimage - (out)? - # scan id - (?P - """ + ID_REGEX.pattern[1:-1] + r""" - ) - ( # Description (optional) - _(?P.*) - )? - # Suffix - \.(""" + "|".join(SCAN_SUFFIXES) + r""") -$""", re.VERBOSE) -SCAN_WARN_REGEX = re.compile(r"\.(" + "|".join(SCAN_SUFFIXES) + r")$") - -NUMBER_REGEX = re.compile(r"^\d+$") -CONTENT_SPLIT_REGEX = re.compile(r"[\W]") - -DATE_REGEX = re.compile(r"(\d{2,4}-\d{1,2}-\d{1,2}|\d{1,2}\.\d{1,2}\.\d{2,4}|\d{1,2}\.\s+[a-zA-Z]+\s+\d{2,4})") -DATE_FORMATS = [ - "%Y-%m-%d", - "%y-%m-%d", - "%d.%m.%Y", - "%d.%m.%y", - "%d. %B %Y", - "%d. %B %y", - "%d. %b %Y", - "%d. %b %y", -] def interpret_date(text: str) -> datetime: for date_format in DATE_FORMATS: try: @@ -200,13 +215,16 @@ def interpret_date(text: str) -> datetime: except ValueError: continue return None + def format_date(date: datetime) -> str: return date.strftime(DATE_FORMATS[0]) + def avg(dates: list[datetime]) -> datetime: m = min(dates) s = sum((date - m for date in dates), start=timedelta()) return m + (s / len(dates)) + @dataclass class ScanFile: path: Path @@ -333,6 +351,7 @@ class ScanFile: def __eq__(self, other): return self.path == other.path + SCAN_FORMATS: dict[Callable[[ScanFile], str]] = { "content": lambda scan: scan.text_content, "date": lambda scan: scan.date_from_content, @@ -405,6 +424,7 @@ def extract_dates(scans: List[ScanFile]) -> List[str]: dates[format_date(date)] = None return list(dates) + # args dependent def read_single_id(args):