|
|
@ -61,7 +61,7 @@ class ExtractedDataLight:
|
|
|
|
return MediaCollection(
|
|
|
|
return MediaCollection(
|
|
|
|
uri=self.object_uri,
|
|
|
|
uri=self.object_uri,
|
|
|
|
extractor_name=self.extractor_name,
|
|
|
|
extractor_name=self.extractor_name,
|
|
|
|
extractor_key = self.object_key
|
|
|
|
extractor_key=self.object_key,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -74,10 +74,14 @@ class ExtractedData(ExtractedDataLight, Generic[T]):
|
|
|
|
return self.data is not None
|
|
|
|
return self.data is not None
|
|
|
|
|
|
|
|
|
|
|
|
def load_media(self) -> Optional[MediaElement]:
|
|
|
|
def load_media(self) -> Optional[MediaElement]:
|
|
|
|
return MediaElement.get(extractor_name=self.extractor_name, extractor_key=self.object_key)
|
|
|
|
return MediaElement.get(
|
|
|
|
|
|
|
|
extractor_name=self.extractor_name, extractor_key=self.object_key
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def load_collection(self) -> Optional[MediaCollection]:
|
|
|
|
def load_collection(self) -> Optional[MediaCollection]:
|
|
|
|
return MediaCollection.get(extractor_name=self.extractor_name, extractor_key=self.object_key)
|
|
|
|
return MediaCollection.get(
|
|
|
|
|
|
|
|
extractor_name=self.extractor_name, extractor_key=self.object_key
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
|
@dataclass
|
|
|
@ -91,6 +95,7 @@ class AuthorExtractedData(ExtractedDataLight):
|
|
|
|
|
|
|
|
|
|
|
|
E = TypeVar("E", MediaElement, MediaCollection)
|
|
|
|
E = TypeVar("E", MediaElement, MediaCollection)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class GeneralExtractor(Generic[E, T]):
|
|
|
|
class GeneralExtractor(Generic[E, T]):
|
|
|
|
|
|
|
|
|
|
|
|
name: str
|
|
|
|
name: str
|
|
|
@ -136,7 +141,11 @@ class GeneralExtractor(Generic[E, T]):
|
|
|
|
# defined
|
|
|
|
# defined
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_offline(self, uri: str) -> ExtractedData[T]:
|
|
|
|
def _extract_offline(self, uri: str) -> ExtractedData[T]:
|
|
|
|
return self._extract_offline_only(uri) if self.can_extract_offline(uri) else self._extract_online(uri)
|
|
|
|
return (
|
|
|
|
|
|
|
|
self._extract_offline_only(uri)
|
|
|
|
|
|
|
|
if self.can_extract_offline(uri)
|
|
|
|
|
|
|
|
else self._extract_online(uri)
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_required(self, data: ExtractedData[T]) -> ExtractedData[T]:
|
|
|
|
def _extract_required(self, data: ExtractedData[T]) -> ExtractedData[T]:
|
|
|
|
if data.has_data:
|
|
|
|
if data.has_data:
|
|
|
@ -151,8 +160,14 @@ class GeneralExtractor(Generic[E, T]):
|
|
|
|
return object
|
|
|
|
return object
|
|
|
|
|
|
|
|
|
|
|
|
def update_object(self, object: E, check_cache_expired: bool = True) -> E:
|
|
|
|
def update_object(self, object: E, check_cache_expired: bool = True) -> E:
|
|
|
|
if object.was_extracted and check_cache_expired and not self._cache_expired(object.last_updated):
|
|
|
|
if (
|
|
|
|
logging.debug(f"Skip info for element as already extracted and cache valid: {object.title!r}")
|
|
|
|
object.was_extracted
|
|
|
|
|
|
|
|
and check_cache_expired
|
|
|
|
|
|
|
|
and not self._cache_expired(object.last_updated)
|
|
|
|
|
|
|
|
):
|
|
|
|
|
|
|
|
logging.debug(
|
|
|
|
|
|
|
|
f"Skip info for element as already extracted and cache valid: {object.title!r}"
|
|
|
|
|
|
|
|
)
|
|
|
|
return object
|
|
|
|
return object
|
|
|
|
data = self._extract_online(object.uri)
|
|
|
|
data = self._extract_online(object.uri)
|
|
|
|
logging.debug(f"Updating info for media: {data!r}")
|
|
|
|
logging.debug(f"Updating info for media: {data!r}")
|
|
|
|