Separate configuration & patterns from code

main
Felix Stupp 2 years ago
parent e4fe661e77
commit 208c3ed5ad
Signed by: zocker
GPG Key ID: 93E1BD26F6B02FB7

@ -17,6 +17,10 @@ from typing import Callable, Iterable, List, Mapping
# Print helper: behaves like print() but writes to stderr instead of stdout.
warn = partial(print, file=sys.stderr)
# === Configuration
# Name of the index directory (MIN_NUM_WIDTH only applies to files inside it).
INDEX_DIR = ".index"
# NOTE(review): presumably the category used for scans that have not been
# sorted yet — confirm against usage.
DEFAULT_CATEGORY = "_toSort"
OCR_LANGS = [
@ -27,6 +31,71 @@ DEFAULT_SCAN_SOURCE = "ADF Duplex"
# NOTE(review): looks like a scanner source name offered as the alternative to
# DEFAULT_SCAN_SOURCE — confirm against usage.
ALTERNATE_SCAN_SOURCE = "Flatbed"
MIN_NUM_WIDTH = 6 # only used for INDEX_DIR files
# NOTE(review): presumably the number of ids taken around an id when the "#"
# ("around") marker of ID_REGEX is given — confirm against IdRange.
ID_AROUND_RANGE = 10
# === Patterns
# File extensions, written as regex fragments; they are OR-joined into
# SCAN_REGEX and SCAN_WARN_REGEX below.
SCAN_SUFFIXES = [ # Regexes
"jpe?g",
"pdf",
"png",
]
# Matches a complete scan id (anchored with ^ and $): either "d"/"digital",
# or one of "<n>", "<n>+", "<n>++", "<begin>-<end>", the numeric forms being
# optionally followed by "#" (captured as group "around").
ID_REGEX = re.compile(r"""^
(
(?P<digital>d(igital)?) # no physical original
|
(
(?P<id_simple>\d+) # simple id
|
(?P<id_following>\d+)\+ # id and following id
|
(?P<id_following_twice>\d+)\+\+ # id and following 3 ids (this and following document with each 2 sides)
|
(?P<id_range_begin>\d+)-(?P<id_range_end>\d+) # id range
)(?P<around>\#)?
)
$""", re.VERBOSE)
# Matches a complete scan file name: optional "YYYY-MM-DD_" date, optional
# "out" prefix, a scan id, optional "_<description>", and an extension from
# SCAN_SUFFIXES. The scan id part reuses ID_REGEX's pattern text; the [1:-1]
# slice drops its first and last character, i.e. the "^" and "$" anchors.
SCAN_REGEX = re.compile(r"""^
( # Date
(?P<date>\d{4}-\d{2}-\d{2})_
)?
# automatic prefix of scanimage
(out)?
# scan id
(?P<scan_id>
""" + ID_REGEX.pattern[1:-1] + r"""
)
( # Description (optional)
_(?P<description>.*)
)?
# Suffix
\.(""" + "|".join(SCAN_SUFFIXES) + r""")
$""", re.VERBOSE)
# Matches any name that merely ends in one of SCAN_SUFFIXES.
# NOTE(review): presumably used to warn about scan-like files that do not
# match SCAN_REGEX — confirm against usage.
SCAN_WARN_REGEX = re.compile(r"\.(" + "|".join(SCAN_SUFFIXES) + r")$")
# Matches a string that consists only of digits.
NUMBER_REGEX = re.compile(r"^\d+$")
# Matches a single non-word character.
CONTENT_SPLIT_REGEX = re.compile(r"[\W]")
# Matches three date shapes: "Y-M-D", "D.M.Y" and "D. <letters> Y"
# (year with 2 to 4 digits, day and month with 1 to 2 digits).
DATE_REGEX = re.compile(r"(\d{2,4}-\d{1,2}-\d{1,2}|\d{1,2}\.\d{1,2}\.\d{2,4}|\d{1,2}\.\s+[a-zA-Z]+\s+\d{2,4})")
# The first entry is also the output format used by format_date().
DATE_FORMATS = [ # datetime.strptime compatible
"%Y-%m-%d",
"%y-%m-%d",
"%d.%m.%Y",
"%d.%m.%y",
"%d. %B %Y",
"%d. %B %y",
"%d. %b %Y",
"%d. %b %y",
]
# === Code
def build_args(args: Iterable) -> str:
    """Return *args* as a single shell-escaped, space-separated string.

    Every element is converted with ``str()`` first, then quoted the same
    way ``shlex.quote`` does, so the result can be used as a POSIX shell
    command line.

    Args:
        args: The command and its arguments; elements may be of any type.

    Returns:
        The quoted arguments joined by single spaces ("" for no arguments).
    """
    # shlex.join quotes each token with shlex.quote and joins with " ".
    return shlex.join(str(arg) for arg in args)
@ -54,22 +123,6 @@ def rlinput(prompt, prefill=None, suggestions=[]):
finally:
readline.set_startup_hook()
# NOTE(review): this ID_REGEX / ID_AROUND_RANGE pair is identical to the
# definitions that appear earlier in this listing (Configuration / Patterns
# sections); it looks like the pre-move location shown by the diff — confirm
# that only one copy remains in the real file.
# Matches a complete scan id (anchored with ^ and $): either "d"/"digital",
# or one of "<n>", "<n>+", "<n>++", "<begin>-<end>", the numeric forms being
# optionally followed by "#" (captured as group "around").
ID_REGEX = re.compile(r"""^
(
(?P<digital>d(igital)?) # no physical original
|
(
(?P<id_simple>\d+) # simple id
|
(?P<id_following>\d+)\+ # id and following id
|
(?P<id_following_twice>\d+)\+\+ # id and following 3 ids (this and following document with each 2 sides)
|
(?P<id_range_begin>\d+)-(?P<id_range_end>\d+) # id range
)(?P<around>\#)?
)
$""", re.VERBOSE)
# NOTE(review): presumably the number of ids taken around an id when the "#"
# ("around") marker is given — confirm against IdRange.
ID_AROUND_RANGE = 10
@dataclass(eq=True, order=True, frozen=True)
class IdRange:
@ -155,44 +208,6 @@ class IdRange:
return self.fancy
# NOTE(review): everything from SCAN_SUFFIXES to DATE_FORMATS here is
# identical to the "=== Patterns" section earlier in this listing; it looks
# like the pre-move location shown by the diff — confirm that only one copy
# remains in the real file.
# File extensions, written as regex fragments; they are OR-joined into
# SCAN_REGEX and SCAN_WARN_REGEX below.
SCAN_SUFFIXES = [ # Regexes
"jpe?g",
"pdf",
"png",
]
# Matches a complete scan file name: optional "YYYY-MM-DD_" date, optional
# "out" prefix, a scan id, optional "_<description>", and an extension from
# SCAN_SUFFIXES. The scan id part reuses ID_REGEX's pattern text; the [1:-1]
# slice drops its first and last character, i.e. the "^" and "$" anchors.
SCAN_REGEX = re.compile(r"""^
( # Date
(?P<date>\d{4}-\d{2}-\d{2})_
)?
# automatic prefix of scanimage
(out)?
# scan id
(?P<scan_id>
""" + ID_REGEX.pattern[1:-1] + r"""
)
( # Description (optional)
_(?P<description>.*)
)?
# Suffix
\.(""" + "|".join(SCAN_SUFFIXES) + r""")
$""", re.VERBOSE)
# Matches any name that merely ends in one of SCAN_SUFFIXES.
# NOTE(review): presumably used to warn about scan-like files that do not
# match SCAN_REGEX — confirm against usage.
SCAN_WARN_REGEX = re.compile(r"\.(" + "|".join(SCAN_SUFFIXES) + r")$")
# Matches a string that consists only of digits.
NUMBER_REGEX = re.compile(r"^\d+$")
# Matches a single non-word character.
CONTENT_SPLIT_REGEX = re.compile(r"[\W]")
# Matches three date shapes: "Y-M-D", "D.M.Y" and "D. <letters> Y"
# (year with 2 to 4 digits, day and month with 1 to 2 digits).
DATE_REGEX = re.compile(r"(\d{2,4}-\d{1,2}-\d{1,2}|\d{1,2}\.\d{1,2}\.\d{2,4}|\d{1,2}\.\s+[a-zA-Z]+\s+\d{2,4})")
# strptime/strftime format strings; the first entry is also the output
# format used by format_date().
DATE_FORMATS = [
"%Y-%m-%d",
"%y-%m-%d",
"%d.%m.%Y",
"%d.%m.%y",
"%d. %B %Y",
"%d. %B %y",
"%d. %b %Y",
"%d. %b %y",
]
def interpret_date(text: str) -> datetime:
for date_format in DATE_FORMATS:
try:
@ -200,13 +215,16 @@ def interpret_date(text: str) -> datetime:
except ValueError:
continue
return None
def format_date(date: datetime) -> str:
    """Format *date* with the first entry of ``DATE_FORMATS``.

    Args:
        date: The point in time to render.

    Returns:
        ``date`` rendered via ``strftime`` using ``DATE_FORMATS[0]``.
    """
    return date.strftime(DATE_FORMATS[0])
def avg(dates: list[datetime]) -> datetime:
    """Return the arithmetic mean of *dates*.

    Datetimes cannot be summed directly, so the mean is computed as the
    earliest date plus the average offset of all dates from it.

    Args:
        dates: A non-empty list of datetimes.

    Returns:
        The mean point in time.

    Raises:
        ValueError: If *dates* is empty.
    """
    if not dates:
        # min() would raise ValueError here anyway; make the reason explicit.
        raise ValueError("avg() requires at least one date")
    earliest = min(dates)
    # start=timedelta() makes sum() add timedeltas instead of starting at int 0.
    total_offset = sum((date - earliest for date in dates), start=timedelta())
    return earliest + total_offset / len(dates)
@dataclass
class ScanFile:
path: Path
@ -333,6 +351,7 @@ class ScanFile:
def __eq__(self, other):
return self.path == other.path
SCAN_FORMATS: dict[Callable[[ScanFile], str]] = {
"content": lambda scan: scan.text_content,
"date": lambda scan: scan.date_from_content,
@ -405,6 +424,7 @@ def extract_dates(scans: List[ScanFile]) -> List[str]:
dates[format_date(date)] = None
return list(dates)
# args dependent
def read_single_id(args):

Loading…
Cancel
Save