Reworked ExtractedData classes, split into Offline & Online

master
Felix Stupp 2 years ago
parent bba57e82d8
commit c97559ed62
Signed by: zocker
GPG Key ID: 93E1BD26F6B02FB7

@ -1,16 +1,19 @@
from __future__ import annotations
import re
from typing import List, Set
from typing import List, Set, TypeAlias
from pony import orm
from ...models import MediaCollection, MediaCollectionLink, MediaElement
from ..generic import ExtractedData, ExtractedDataLight, SuitableLevel
from ..generic import ExtractedDataOnline, ExtractedDataOffline, SuitableLevel
from .base import CollectionExtractor
class AggregatedCollectionExtractor(CollectionExtractor[List[List[MediaElement]]]):
DataType: TypeAlias = List[List[MediaElement]]
class AggregatedCollectionExtractor(CollectionExtractor[DataType]):
__uri_regex = re.compile(r"^aggregated:///(?P<id>\d+(,\d+)*)")
@ -43,18 +46,18 @@ class AggregatedCollectionExtractor(CollectionExtractor[List[List[MediaElement]]
return True
return False
def _extract_offline(self, uri: str) -> ExtractedDataLight:
def _extract_offline(self, uri: str) -> ExtractedDataOffline[DataType]:
coll_id = ",".join(str(i) for i in self.__get_id(uri))
return ExtractedDataLight(
return ExtractedDataOffline[DataType](
extractor_name=self.name,
object_key=coll_id,
object_uri=uri,
)
def _extract_online(self, uri: str) -> ExtractedData[List[List[MediaElement]]]:
def _extract_online(self, uri: str) -> ExtractedDataOnline[DataType]:
colls = self.__get_collections(uri)
coll_id = ",".join(str(c.id) for c in colls)
return ExtractedData(
return ExtractedDataOnline[DataType](
extractor_name=self.name,
object_key=coll_id,
object_uri=uri,
@ -69,9 +72,7 @@ class AggregatedCollectionExtractor(CollectionExtractor[List[List[MediaElement]]
],
)
def _update_object_raw(
self, object: MediaCollection, data: List[List[MediaElement]]
) -> None:
def _update_object_raw(self, object: MediaCollection, data: DataType) -> None:
if object.title is None or "[aggregated]" not in object.title:
object.title = f"[aggregated] {object.uri}"
object.creator = None

@ -13,7 +13,12 @@ from ...models import (
MediaCollectionLink,
MediaElement,
)
from ..generic import ExtractedData, ExtractionError, GeneralExtractor
from ..generic import (
ExtractedDataOnline,
ExtractedDataOffline,
ExtractionError,
GeneralExtractor,
)
T = TypeVar("T")
@ -48,12 +53,12 @@ class CollectionExtractor(GeneralExtractor[MediaCollection, T]):
def __configure_collection(self, collection: MediaCollection) -> None:
collection.keep_updated = True
def _create_object(self, data: ExtractedData[T]) -> MediaCollection:
def _create_object(self, data: ExtractedDataOffline[T]) -> MediaCollection:
collection = data.create_collection()
self.__configure_collection(collection)
return collection
def _load_object(self, data: ExtractedData[T]) -> Optional[MediaCollection]:
def _load_object(self, data: ExtractedDataOffline[T]) -> Optional[MediaCollection]:
collection = data.load_collection()
if collection is not None:
self.__configure_collection(collection)
@ -89,7 +94,7 @@ class CollectionExtractor(GeneralExtractor[MediaCollection, T]):
def _inject_episode(
self,
collection: MediaCollection,
data: ExtractedData[Any],
data: ExtractedDataOnline[Any],
season: int = 0,
episode: int = 0,
) -> Optional[MediaElement]:
@ -133,5 +138,7 @@ class CollectionExtractor(GeneralExtractor[MediaCollection, T]):
link.season = 0
link.episode = index + 1
def _update_hook(self, object: MediaCollection, data: ExtractedData[T]) -> None:
def _update_hook(
self, object: MediaCollection, data: ExtractedDataOnline[T]
) -> None:
self._sort_episodes(object)

@ -11,9 +11,8 @@ from ..all.tmdb import (
TmdbCollectionData,
TMDB_REGEX_URI,
TmdbKeywordData,
TmdbMovieEntryData,
)
from ..generic import ExtractedData, ExtractedDataLight, SuitableLevel
from ..generic import ExtractedDataOnline, ExtractedDataOffline, SuitableLevel
from .base import CollectionExtractor
@ -58,9 +57,9 @@ class TmdbBaseExtractor(CollectionExtractor[T]):
self._calculate_wait_hours(last_release_date) * 7 * 24
)
def _extract_offline(self, uri: str) -> ExtractedDataLight:
def _extract_offline(self, uri: str) -> ExtractedDataOffline[T]:
id = self._get_id(uri)
return ExtractedDataLight(
return ExtractedDataOffline[T](
extractor_name=self.name,
object_key=f"{self.TMDB_CLASS}:{id}",
object_uri=uri,
@ -71,10 +70,10 @@ class TmdbCollectionExtractor(TmdbBaseExtractor[TmdbCollectionData]):
TMDB_CLASS = "collection"
def _extract_online(self, uri: str) -> ExtractedData[TmdbCollectionData]:
def _extract_online(self, uri: str) -> ExtractedDataOnline[TmdbCollectionData]:
id = self._get_id(uri)
data = TmdbCollectionData.from_id(id)
return ExtractedData(
return ExtractedDataOnline(
extractor_name=self.name,
object_key=f"{self.TMDB_CLASS}:{id}",
object_uri=uri,
@ -110,10 +109,10 @@ class TmdbKeywordExtractor(TmdbBaseExtractor[TmdbKeywordData]):
TMDB_CLASS = "keyword"
def _extract_online(self, uri: str) -> ExtractedData[TmdbKeywordData]:
def _extract_online(self, uri: str) -> ExtractedDataOnline[TmdbKeywordData]:
id = self._get_id(uri)
data = TmdbKeywordData.from_id(id)
return ExtractedData(
return ExtractedDataOnline(
extractor_name=self.name,
object_key=f"{self.TMDB_CLASS}:{id}",
object_uri=uri,

@ -8,7 +8,7 @@ from pony import orm # TODO remove
from ...models import MediaCollection
from ..all.tt_rss import HeadlineList, TtRssConnectionParameter, TtRssUri
from ..generic import ExtractedData, ExtractedDataLight, SuitableLevel
from ..generic import ExtractedDataOnline, ExtractedDataOffline, SuitableLevel
from .base import CollectionExtractor
@ -41,14 +41,14 @@ class TtRssCollectionExtractor(CollectionExtractor[HeadlineList]):
def _cache_expired(self, object: MediaCollection) -> bool:
return (datetime.now() - object.last_updated) > timedelta(minutes=15)
def _extract_offline(self, uri: str) -> ExtractedDataLight:
return ExtractedDataLight(
def _extract_offline(self, uri: str) -> ExtractedDataOffline[HeadlineList]:
return ExtractedDataOffline[HeadlineList](
extractor_name=self.name,
object_key=uri,
object_uri=uri,
)
def _extract_online(self, uri: str) -> ExtractedData[HeadlineList]:
def _extract_online(self, uri: str) -> ExtractedDataOnline[HeadlineList]:
rss_uri = self.__decode_uri(uri)
logging.info(f"Extract collection from tt-rss: {uri!r}")
data = rss_uri.request(self.__params, order_by="feed_dates", view_mode="unread")
@ -59,7 +59,7 @@ class TtRssCollectionExtractor(CollectionExtractor[HeadlineList]):
if self.__label_filter
in (label_marker[0] for label_marker in headline.labels)
]
return ExtractedData(
return ExtractedDataOnline(
extractor_name=self.name,
object_key=uri,
object_uri=uri,

@ -10,7 +10,7 @@ import requests
from ...models import MediaCollection, Tag
from ..all.tvmaze import TvmazeEpisodeEmbedded, TvmazeShowEmbedded, add_embedding
from ..generic import ExtractedData, ExtractedDataLight, SuitableLevel
from ..generic import ExtractedDataOnline, ExtractedDataOffline, SuitableLevel
from .base import CollectionExtractor
@ -71,15 +71,15 @@ class TvmazeCollectionExtractor(CollectionExtractor[TvmazeShowEmbedded]):
last_release_date
)
def _extract_offline(self, uri: str) -> ExtractedDataLight:
def _extract_offline(self, uri: str) -> ExtractedDataOffline[TvmazeShowEmbedded]:
show_id = self.__require_show_id(uri)
return ExtractedDataLight(
return ExtractedDataOffline[TvmazeShowEmbedded](
extractor_name=self.name,
object_key=str(show_id),
object_uri=self.__get_show_uri(show_id),
)
def _extract_online(self, uri: str) -> ExtractedData[TvmazeShowEmbedded]:
def _extract_online(self, uri: str) -> ExtractedDataOnline[TvmazeShowEmbedded]:
show_id = self.__require_show_id(uri)
api_uri = self.__get_show_api_uri(show_id)
res = requests.get(
@ -91,7 +91,7 @@ class TvmazeCollectionExtractor(CollectionExtractor[TvmazeShowEmbedded]):
},
)
data = res.json()
return ExtractedData(
return ExtractedDataOnline[TvmazeShowEmbedded](
extractor_name=self.name,
object_key=str(show_id),
object_uri=self.__get_show_uri(show_id),
@ -125,7 +125,7 @@ class TvmazeCollectionExtractor(CollectionExtractor[TvmazeShowEmbedded]):
add_embedding(episode, "show", data)
self._inject_episode(
collection=object,
data=ExtractedData[TvmazeEpisodeEmbedded](
data=ExtractedDataOnline[TvmazeEpisodeEmbedded](
extractor_name="tvmaze",
object_key=str(episode["id"]),
object_uri=f"tvmaze:///episodes/{episode['id']}",

@ -3,17 +3,20 @@ from __future__ import annotations
from datetime import datetime
import logging
import re
from typing import Dict
from typing import Dict, TypeAlias
from pony import orm # TODO remove
import youtubesearchpython
from ...models import MediaCollection
from ..generic import ExtractedData, ExtractedDataLight, SuitableLevel
from ..generic import ExtractedDataOnline, ExtractedDataOffline, SuitableLevel
from .base import CollectionExtractor
class YouTubeCollectionExtractor(CollectionExtractor[Dict]):
DataType: TypeAlias = Dict
class YouTubeCollectionExtractor(CollectionExtractor[DataType]):
__uri_regex = re.compile(
r"^https?://(www\.)?youtube\.com/(channel/|playlist\?list=)(?P<id>[^/&?]+)"
@ -59,15 +62,15 @@ class YouTubeCollectionExtractor(CollectionExtractor[Dict]):
last_release_date
)
def _extract_offline(self, uri: str) -> ExtractedDataLight:
def _extract_offline(self, uri: str) -> ExtractedDataOffline[DataType]:
playlist_id = self.__convert_if_required(self.__get_id(uri))
return ExtractedDataLight(
return ExtractedDataOffline[DataType](
extractor_name=self.name,
object_key=playlist_id,
object_uri=uri,
)
def _extract_online(self, uri: str) -> ExtractedData[Dict]:
def _extract_online(self, uri: str) -> ExtractedDataOnline[DataType]:
orig_id = self.__get_id(uri)
playlist_id = self.__convert_if_required(orig_id)
playlist_link = f"https://www.youtube.com/playlist?list={playlist_id}"
@ -89,7 +92,7 @@ class YouTubeCollectionExtractor(CollectionExtractor[Dict]):
logging.debug(
f"Retrieved {len(playlist.videos)} videos from playlist {playlist_link!r}"
)
return ExtractedData(
return ExtractedDataOnline[DataType](
extractor_name=self.name,
object_key=playlist_id,
object_uri=uri,
@ -99,7 +102,7 @@ class YouTubeCollectionExtractor(CollectionExtractor[Dict]):
},
)
def _update_object_raw(self, object: MediaCollection, data: Dict) -> None:
def _update_object_raw(self, object: MediaCollection, data: DataType) -> None:
info = data["info"]
is_channel = self.__is_channel_id(info["id"])
object.title = (

@ -64,15 +64,6 @@ class ExtractedDataLight:
extractor_key=self.object_key,
)
@dataclass
class ExtractedData(ExtractedDataLight, Generic[T]):
data: T = dataclasses.field(repr=False, compare=False)
@property
def has_data(self) -> bool:
return self.data is not None
def load_media(self) -> Optional[MediaElement]:
return MediaElement.get(
extractor_name=self.extractor_name,
@ -86,6 +77,39 @@ class ExtractedData(ExtractedDataLight, Generic[T]):
)
@dataclass
class ExtractedDataOffline(ExtractedDataLight, Generic[T]):
    """Extraction result whose payload may still be missing.

    Adds an optional ``data`` field on top of ``ExtractedDataLight``. The
    field defaults to ``None`` and is excluded from ``repr`` output and from
    comparisons.
    """

    # Extracted payload; None as long as no payload has been set.
    data: Optional[T] = dataclasses.field(default=None, repr=False, compare=False)

    @property
    def has_data(self) -> bool:
        """Tell whether ``data`` is something other than ``None``."""
        payload_missing = self.data is None
        return not payload_missing

    @property
    def online_type(self) -> ExtractedDataOnline[T]:
        """Build an ``ExtractedDataOnline`` from this object's fields.

        Returns:
            A new ``ExtractedDataOnline[T]`` with the same ``object_uri``,
            ``extractor_name``, ``object_key`` and ``data``.

        Raises:
            Exception: if ``data`` is ``None``.
        """
        payload = self.data
        if payload is not None:
            return ExtractedDataOnline[T](
                object_uri=self.object_uri,
                extractor_name=self.extractor_name,
                object_key=self.object_key,
                data=payload,
            )
        raise Exception("Explicit type requires data to be set")
@dataclass
class ExtractedDataOnline(ExtractedDataOffline[T]):
    """Variant of ``ExtractedDataOffline`` whose ``data`` has no default.

    Redeclares ``data`` as ``T`` without a default value, so it has to be
    passed on construction. As in the parent, the field is excluded from
    ``repr`` output and from comparisons.
    """

    # Extracted payload; declared without a default in this subclass.
    data: T = dataclasses.field(repr=False, compare=False)

    @property
    def has_data(self) -> bool:
        """Always ``True``; ``data`` itself is not inspected."""
        return True

    @property
    def online_type(self) -> ExtractedDataOnline[T]:
        """Return this very object; no copy is created."""
        return self
@dataclass
class AuthorExtractedData(ExtractedDataLight):
author_name: str
@ -111,10 +135,10 @@ class GeneralExtractor(Generic[E, T]):
def check_uri(uri: str) -> Optional[E]:
raise NotImplementedError()
def _create_object(self, data: ExtractedData[T]) -> E:
def _create_object(self, data: ExtractedDataOffline[T]) -> E:
raise NotImplementedError()
def _load_object(self, data: ExtractedData[T]) -> Optional[E]:
def _load_object(self, data: ExtractedDataOffline[T]) -> Optional[E]:
raise NotImplementedError()
# abstract (for specific extractor classes)
@ -128,33 +152,35 @@ class GeneralExtractor(Generic[E, T]):
def _cache_expired(self, object: E) -> bool:
return False
def _extract_offline_only(self, uri: str) -> ExtractedDataLight:
def _extract_offline_only(self, uri: str) -> ExtractedDataOffline[T]:
raise NotImplementedError()
def _extract_online(self, uri: str) -> ExtractedData[T]:
def _extract_online(self, uri: str) -> ExtractedDataOnline[T]:
raise NotImplementedError()
def _update_object_raw(self, object: E, data: T) -> None:
raise NotImplementedError()
def _update_hook(self, object: E, data: ExtractedData[T]) -> None:
def _update_hook(self, object: E, data: ExtractedDataOnline[T]) -> None:
return None
# defined
def _extract_offline(self, uri: str) -> ExtractedDataLight:
def _extract_offline(self, uri: str) -> ExtractedDataOffline[T]:
return (
self._extract_offline_only(uri)
if self.can_extract_offline(uri)
else self._extract_online(uri)
)
def _extract_required(self, data: ExtractedData[T]) -> ExtractedData[T]:
def _extract_required(
self, data: ExtractedDataOffline[T]
) -> ExtractedDataOnline[T]:
if data.has_data:
return data
return data.online_type
return self._extract_online(data.object_uri)
def _update_object(self, object: E, data: ExtractedData[T]) -> E:
def _update_object(self, object: E, data: ExtractedDataOnline[T]) -> E:
object.uri = data.object_uri
self._update_object_raw(object, data.data)
self._update_hook(object, data)
@ -175,7 +201,7 @@ class GeneralExtractor(Generic[E, T]):
logging.debug(f"Updating info for media: {data!r}")
return self._update_object(object, data)
def inject_object(self, data: ExtractedData[T]) -> E:
def inject_object(self, data: ExtractedDataOnline[T]) -> E:
object = self._load_object(data)
data = self._extract_required(data)
if object is None:
@ -183,15 +209,15 @@ class GeneralExtractor(Generic[E, T]):
object = self._create_object(data)
return self._update_object(object, data)
def store_object(self, data: ExtractedData[T]) -> E:
def store_object(self, data: ExtractedDataOffline[T]) -> E:
object = self._load_object(data)
if object is not None:
logging.debug(f"Found object already in database: {data!r}")
return object
data = self._extract_required(data)
logging.debug(f"Store info for object: {data!r}")
object = self._create_object(data)
return self._update_object(object, data)
full_data = self._extract_required(data)
logging.debug(f"Store info for object: {full_data!r}")
object = self._create_object(full_data)
return self._update_object(object, full_data)
def extract_and_store(self, uri: str) -> E:
object = self.check_uri(uri)

@ -4,7 +4,12 @@ import logging
from typing import Optional, TypeVar
from ...models import MediaCollection, MediaElement, MediaUriMapping
from ..generic import AuthorExtractedData, ExtractedData, GeneralExtractor
from ..generic import (
AuthorExtractedData,
ExtractedDataOnline,
ExtractedDataOffline,
GeneralExtractor,
)
from ..collection.base import CollectionExtractor
@ -35,10 +40,10 @@ class MediaExtractor(GeneralExtractor[MediaElement, T]):
return elem
return None
def _create_object(self, data: ExtractedData[T]) -> MediaElement:
def _create_object(self, data: ExtractedDataOffline[T]) -> MediaElement:
return data.create_media()
def _load_object(self, data: ExtractedData[T]) -> Optional[MediaElement]:
def _load_object(self, data: ExtractedDataOffline[T]) -> Optional[MediaElement]:
return data.load_media()
def __create_author_collection(
@ -78,5 +83,5 @@ class MediaExtractor(GeneralExtractor[MediaElement, T]):
collection = self.__get_author_collection(author_data)
collection.add_episode(element)
def _update_hook(self, object: MediaElement, data: ExtractedData[T]) -> None:
def _update_hook(self, object: MediaElement, data: ExtractedDataOnline[T]) -> None:
self.__add_to_author_collection(object, data.data)

@ -8,7 +8,12 @@ from pony import orm
from ...models import MediaElement, MediaThumbnail, Query, Tag
from ..all.tmdb import TmdbMovieData, TMDB_REGEX_URI
from ..generic import ExtractedData, ExtractedDataLight, ExtractionError, SuitableLevel
from ..generic import (
ExtractedDataOnline,
ExtractedDataOffline,
ExtractionError,
SuitableLevel,
)
from .base import MediaExtractor
@ -41,18 +46,18 @@ class TmdbMovieMediaExtractor(MediaExtractor[TmdbMovieData]):
def can_extract_offline(self, uri: str) -> bool:
return True
def _extract_offline(self, uri: str) -> ExtractedDataLight:
def _extract_offline(self, uri: str) -> ExtractedDataOffline[TmdbMovieData]:
movie_id = self.__get_movie_id(uri)
return ExtractedDataLight(
return ExtractedDataOffline[TmdbMovieData](
extractor_name=self.name,
object_key=str(movie_id),
object_uri=uri,
)
def _extract_online(self, uri: str) -> ExtractedData[TmdbMovieData]:
def _extract_online(self, uri: str) -> ExtractedDataOnline[TmdbMovieData]:
movie_id = self.__get_movie_id(uri)
data = TmdbMovieData.from_id(movie_id)
return ExtractedData(
return ExtractedDataOnline[TmdbMovieData](
extractor_name=self.name,
object_key=f"movie:{movie_id}",
object_uri=uri,

@ -7,8 +7,13 @@ from typing import Optional
import requests
from ...models import MediaElement, MediaThumbnail
from ..all.tvmaze import TvmazeEpisodeEmbedded, select_best_image
from ..generic import ExtractedData, ExtractedDataLight, ExtractionError, SuitableLevel
from ..all.tvmaze import TvmazeEpisodeEmbedded, TvmazeShowEmbedded, select_best_image
from ..generic import (
ExtractedDataOnline,
ExtractedDataOffline,
ExtractionError,
SuitableLevel,
)
from .base import MediaExtractor
@ -54,15 +59,15 @@ class TvmazeMediaExtractor(MediaExtractor[TvmazeEpisodeEmbedded]):
def can_extract_offline(self, uri: str) -> bool:
return True
def _extract_offline(self, uri: str) -> ExtractedDataLight:
def _extract_offline(self, uri: str) -> ExtractedDataOffline[TvmazeEpisodeEmbedded]:
episode_id = self.__get_episode_id(uri)
return ExtractedDataLight(
return ExtractedDataOffline[TvmazeEpisodeEmbedded](
extractor_name=self.name,
object_key=str(episode_id),
object_uri=uri,
)
def _extract_online(self, uri: str) -> ExtractedData[TvmazeEpisodeEmbedded]:
def _extract_online(self, uri: str) -> ExtractedDataOnline[TvmazeEpisodeEmbedded]:
episode_id = self.__get_episode_id(uri)
if episode_id is None:
raise Exception(f"Expected {uri!r} to be extractable")
@ -76,7 +81,7 @@ class TvmazeMediaExtractor(MediaExtractor[TvmazeEpisodeEmbedded]):
},
)
data = res.json()
return ExtractedData(
return ExtractedDataOnline[TvmazeEpisodeEmbedded](
extractor_name=self.name,
object_key=str(episode_id),
object_uri=uri,
@ -84,7 +89,9 @@ class TvmazeMediaExtractor(MediaExtractor[TvmazeEpisodeEmbedded]):
)
def _update_object_raw(
self, object: MediaElement, data: TvmazeEpisodeEmbedded
self,
object: MediaElement,
data: TvmazeEpisodeEmbedded,
) -> None:
# sanity check
airstamp = data.get("airstamp")

@ -14,7 +14,7 @@ from ...models import (
)
from ..generic import (
AuthorExtractedData,
ExtractedData,
ExtractedDataOnline,
ExtractionError,
SuitableLevel,
)
@ -91,7 +91,7 @@ class YoutubeMediaExtractor(MediaExtractor[YoutubeVideoData]):
author_name=data["channel"]["name"],
)
def _extract_online(self, uri: str) -> ExtractedData[YoutubeVideoData]:
def _extract_online(self, uri: str) -> ExtractedDataOnline[YoutubeVideoData]:
logging.info(f"Request info using youtube_search_python for {uri!r}")
uri_match = self.__uri_regex.match(uri)
if not uri_match:
@ -106,7 +106,7 @@ class YoutubeMediaExtractor(MediaExtractor[YoutubeVideoData]):
raise ExtractionError() from e
if vid_data["isLiveNow"]:
raise ExtractionError("Video is live, so pass extraction")
return ExtractedData[YoutubeVideoData](
return ExtractedDataOnline[YoutubeVideoData](
object_uri=uri,
extractor_name=self.name,
object_key=vid_data["id"],

@ -12,7 +12,12 @@ from ...models import (
thumbnail_sort_key,
)
from ..all.ytdl import get_video_info, YtdlErrorException
from ..generic import AuthorExtractedData, ExtractedData, ExtractionError, SuitableLevel
from ..generic import (
AuthorExtractedData,
ExtractedDataOnline,
ExtractionError,
SuitableLevel,
)
from .base import MediaExtractor
@ -48,7 +53,7 @@ class YtdlMediaExtractor(MediaExtractor[Dict]):
else None,
)
def _extract_online(self, uri: str) -> ExtractedData[Dict]:
def _extract_online(self, uri: str) -> ExtractedDataOnline[Dict]:
logging.info(f"Request info using youtube-dl for {uri!r}")
try:
vid_data = get_video_info(uri)
@ -58,7 +63,7 @@ class YtdlMediaExtractor(MediaExtractor[Dict]):
raise ExtractionError("Video is live, so pass extraction")
ytdl_extractor_key = vid_data.get("extractor_key") or vid_data["ie_key"]
ytdl_video_id = vid_data["id"]
return ExtractedData[Dict](
return ExtractedDataOnline[Dict](
object_uri=uri,
extractor_name=self.name,
object_key=f"{ytdl_extractor_key}:{ytdl_video_id}",

Loading…
Cancel
Save