# youtube-dl/youtube_dl/extractor/ninegag.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    url_or_none,
    int_or_none,
    float_or_none,
    ExtractorError
)


class NineGagIE(InfoExtractor):
    """Extractor for 9gag post pages (``9gag.com/gag/<id>``).

    The post metadata is read from the ``window._config`` JSON blob embedded
    in the page.  Three post types are handled:

    * ``Video``      - delegated to a known external provider (see
      ``_EXTERNAL_VIDEO_PROVIDERS``) via a ``url_transparent`` result.
    * ``EmbedVideo`` - delegated to the post's embed URL via a
      ``url_transparent`` result.
    * ``Animated``   - formats and thumbnails are built from the post's
      ``images`` mapping.

    Any other post type raises an expected ``ExtractorError``.
    """
    IE_NAME = '9gag'
    _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P<id>[a-zA-Z0-9]+)'

    _TESTS = [{
        'url': 'https://9gag.com/gag/an5Qz5b',
        'info_dict': {
            'id': 'an5Qz5b',
            'ext': 'webm',
            'title': 'Dogs playing tetherball',
            'upload_date': '20191108',
            'timestamp': 1573243994,
            'age_limit': 0,
            'categories': [
                'Wholesome'
            ],
            'tags': [
                'Dog'
            ]
        }
    }, {
        'url': 'https://9gag.com/gag/ae5Ag7B',
        'info_dict': {
            'id': 'ae5Ag7B',
            'ext': 'webm',
            'title': 'Capybara Agility Training',
            'upload_date': '20191108',
            'timestamp': 1573237208,
            'age_limit': 0,
            'categories': [
                'Awesome'
            ],
            'tags': [
                'Weimaraner',
                'American Pit Bull Terrier'
            ]
        }
    }]

    # Capitalised provider name (from the post's video "source") -> watch URL
    # template taking the provider-side video id.
    _EXTERNAL_VIDEO_PROVIDERS = {
        'Youtube': 'https://youtube.com/watch?v=%s'
    }

    def _real_extract(self, url):
        """Extract the info dict for the 9gag post at *url*.

        Returns either a ``url_transparent`` result pointing at an external
        or embedded video, or a full info dict with ``formats`` and
        ``thumbnails`` for animated posts.

        Raises ExtractorError if the post is not a video or animation, or if
        a ``Video`` post names a provider that is not listed in
        ``_EXTERNAL_VIDEO_PROVIDERS``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        raw_json = self._search_regex(
            r'window\._config\s*=\s*JSON\.parse\(["\']({.+?})["\']\);',
            webpage,
            'data')
        # The blob is captured from inside a quoted JS string, so strip the
        # escaping of quotes and slashes before handing it to the JSON parser.
        raw_json = raw_json.replace('\\"', '"').replace('\\\\/', '/')
        data = self._parse_json(raw_json, video_id)['data']['post']

        post_type = data.get('type')

        if post_type == 'Video':
            video = data['video']
            provider = video['source'].capitalize()
            url_template = self._EXTERNAL_VIDEO_PROVIDERS.get(provider)
            if not url_template:
                # Explicit error instead of a bare KeyError on the lookup.
                raise ExtractorError(
                    'Unsupported external video provider: %s' % provider)
            external_id = video['id']
            return {
                '_type': 'url_transparent',
                'url': url_template % external_id,
                'ie_key': provider,
                'id': external_id,
                'duration': video.get('duration'),
                'start_time': video.get('startTs')
            }

        if post_type == 'EmbedVideo':
            video = data['video']
            return {
                '_type': 'url_transparent',
                'url': video['embedUrl'],
                'start_time': video.get('startTs')
            }

        if post_type != 'Animated':
            raise ExtractorError(
                'The given url does not contain a video',
                expected=True)

        duration = None
        formats = []
        thumbnails = []
        for key, image in (data.get('images') or {}).items():
            if not isinstance(image, dict):
                continue
            # The first entry that carries a usable duration wins.
            if duration is None:
                duration = int_or_none(image.get('duration'))
            image_url = url_or_none(image.get('url'))
            if image_url is None:
                continue
            ext = determine_ext(image_url)
            width = int_or_none(image.get('width'))
            height = int_or_none(image.get('height'))
            if ext in ('jpg', 'png'):
                thumbnails.append({
                    'url': image_url,
                    'width': width,
                    'height': height
                })
            elif ext in ('webm', 'mp4'):
                # Build "<variant>_<ext>" from a file name of the form
                # "<id>_<variant>.<ext>"; if the URL does not look like that,
                # fall back to the key of the images mapping rather than
                # using the whole URL as the format id.
                variant = re.search(
                    r'_([^_./]+)\.[^./?#]+(?:[?#]|$)', image_url)
                if variant:
                    format_id = '%s_%s' % (variant.group(1), ext)
                else:
                    format_id = '%s' % key
                formats.append({
                    'format_id': format_id,
                    'ext': ext,
                    'url': image_url,
                    'width': width,
                    'height': height
                })

        section = None
        post_section = data.get('postSection')
        if isinstance(post_section, dict) and post_section.get('name'):
            # Drop every backslash followed by five non-backslash characters
            # (presumably leftover "\uXXXX" escapes - TODO confirm).
            section = re.sub(r'\\[^\\]{5}', '', post_section['name'])

        # A truthy nsfw flag maps to 18, a falsy one to 0, a missing one
        # leaves the age limit unknown.
        nsfw = int_or_none(data.get('nsfw'))
        age_limit = None
        if nsfw is not None:
            age_limit = 18 if nsfw else 0

        tags = None
        if 'tags' in data:
            tags = [
                tag['key'] for tag in data.get('tags') or []
                if isinstance(tag, dict) and tag.get('key') is not None]

        return {
            'id': video_id,
            'title': data['title'],
            'timestamp': int_or_none(data.get('creationTs')),
            'duration': duration,
            'formats': formats,
            'thumbnails': thumbnails,
            'like_count': int_or_none(data.get('upVoteCount')),
            'dislike_count': int_or_none(data.get('downVoteCount')),
            'comment_count': int_or_none(data.get('commentsCount')),
            'age_limit': age_limit,
            # Avoid emitting [None] when the post has no usable section name.
            'categories': [section] if section else None,
            'tags': tags,
            'is_live': False
        }
[ninegag] Use unicode_literals 11 years ago			`from __future__ import unicode_literals`

[9gag] Add extractor 11 years ago			`import re`

			`from .common import InfoExtractor`
[9gag] Fix Extraction (#23022) 4 years ago			`from ..utils import (`
			`determine_ext,`
			`url_or_none,`
			`int_or_none,`
			`float_or_none,`
			`ExtractorError`
			`)`
[9gag] Add extractor 11 years ago

			`class NineGagIE(InfoExtractor):`
			`IE_NAME = '9gag'`
[9gag] Fix Extraction (#23022) 4 years ago			`_VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P<id>[a-zA-Z0-9]+)'`
[9gag] Add extractor 11 years ago
[ninegag] Add support for p/ URLs 11 years ago			`_TESTS = [{`
[9gag] Fix Extraction (#23022) 4 years ago			`'url': 'https://9gag.com/gag/an5Qz5b',`
[9gag] Quotes consistency 9 years ago			`'info_dict': {`
[9gag] Fix Extraction (#23022) 4 years ago			`'id': 'an5Qz5b',`
			`'ext': 'webm',`
			`'title': 'Dogs playing tetherball',`
			`'upload_date': '20191108',`
			`'timestamp': 1573243994,`
			`'age_limit': 0,`
			`'categories': [`
			`'Wholesome'`
			`],`
			`'tags': [`
			`'Dog'`
			`]`
			`}`
[9gag] Add vimeo test 9 years ago			`}, {`
[9gag] Fix Extraction (#23022) 4 years ago			`'url': 'https://9gag.com/gag/ae5Ag7B',`
[9gag] Add vimeo test 9 years ago			`'info_dict': {`
[9gag] Fix Extraction (#23022) 4 years ago			`'id': 'ae5Ag7B',`
			`'ext': 'webm',`
			`'title': 'Capybara Agility Training',`
			`'upload_date': '20191108',`
			`'timestamp': 1573237208,`
			`'age_limit': 0,`
			`'categories': [`
			`'Awesome'`
			`],`
			`'tags': [`
			`'Weimaraner',`
			`'American Pit Bull Terrier'`
			`]`
			`}`
[ninegag] Add support for p/ URLs 11 years ago			`}]`
[9gag] Support embed URLs 9 years ago
[9gag] Fix Extraction (#23022) 4 years ago			`_EXTERNAL_VIDEO_PROVIDERS = {`
			`'Youtube': 'https://youtube.com/watch?v=%s'`
[ninegag] fix _VALID_URL regex and handle the use of other external providers 9 years ago			`}`
[9gag] Add extractor 11 years ago
			`def _real_extract(self, url):`
[9gag] Fix Extraction (#23022) 4 years ago			`video_id = self._match_id(url)`
			`webpage = self._download_webpage(url, video_id)`
			`rawJsonData = self._search_regex(`
			`r'window._config\s=\sJSON.parse\(["\']({.+?})["\']\);',`
			`webpage,`
			`'data')`
			`rawJsonData = rawJsonData.replace('\\"', '"').replace('\\\\/', '/')`
			`data = self._parse_json(rawJsonData, video_id)['data']['post']`

			`if data['type'] == 'Video':`
			`vid = data['video']['id']`
			`ie_key = data['video']['source'].capitalize()`
			`return {`
			`'_type': 'url_transparent',`
			`'url': self._EXTERNAL_VIDEO_PROVIDERS[ie_key] % vid,`
			`'ie_key': ie_key,`
			`'id': vid,`
			`'duration': data['video'].get('duration'),`
			`'start_time': data['video'].get('startTs')`
			`}`
[9gag] Add extractor 11 years ago
[9gag] Fix Extraction (#23022) 4 years ago			`if data['type'] == 'EmbedVideo':`
			`vid = data['video']['id']`
			`ie_key = data['video']['source'].capitalize()`
			`return {`
			`'_type': 'url_transparent',`
			`'url': data['video']['embedUrl'],`
			`#'ie_key': vid,`
			`'start_time': data['video'].get('startTs')`
			`}`
[9gag] Add extractor 11 years ago
[9gag] Fix Extraction (#23022) 4 years ago			`if data['type'] != 'Animated':`
			`raise ExtractorError(`
			`'The given url does not contain a video',`
			`expected=True)`
[9gag] Fix and improve extraction 11 years ago
[9gag] Fix Extraction (#23022) 4 years ago			`duration = None`
			`formats = []`
			`thumbnails = []`
			`for key in data['images']:`
			`image = data['images'][key]`
			`if 'duration' in image and duration is None:`
			`duration = int_or_none(image['duration'])`
			`url = url_or_none(image.get('url'))`
			`if url == None:`
			`continue`
			`ext = determine_ext(url)`
			`if ext == 'jpg' or ext == 'png':`
			`thumbnail = {`
			`'url': url,`
			`'width': float_or_none(image.get('width')),`
			`'height': float_or_none(image.get('height'))`
			`}`
			`thumbnails.append(thumbnail)`
			`elif ext == 'webm' or ext == 'mp4':`
			`formats.append({`
			`'format_id': re.sub(r'._([^\.]+).(.)', r'\1_\2', url),`
			`'ext': ext,`
			`'url': url,`
			`'width': float_or_none(image.get('width')),`
			`'height': float_or_none(image.get('height'))`
			`})`
			`section = None`
			`postSection = data.get('postSection')`
			`if postSection != None and 'name' in postSection:`
			`section = re.sub(r'\\[^\\]{5}', '', postSection['name'])`
			`age_limit = int_or_none(data.get('nsfw'))`
			`if age_limit != None:`
			`age_limit = age_limit * 18`
			`tags = None`
			`if 'tags' in data:`
			`tags = []`
			`for tag in data.get('tags') or []:`
			`tags.append(tag.get('key'))`
[9gag] Add extractor 11 years ago
			`return {`
			`'id': video_id,`
[9gag] Fix Extraction (#23022) 4 years ago			`'title': data['title'],`
			`'timestamp': int_or_none(data.get('creationTs')),`
			`'duration': duration,`
			`'formats': formats,`
			`'thumbnails': thumbnails,`
			`'like_count': int_or_none(data.get('upVoteCount')),`
			`'dislike_count': int_or_none(data.get('downVoteCount')),`
			`'comment_count': int_or_none(data.get('commentsCount')),`
			`'age_limit': age_limit,`
			`'categories': [section],`
			`'tags': tags,`
			`'is_live': False`
[9gag] Add extractor 11 years ago			`}`