Merge branch 'extract_info_rewrite'

12 years ago · dce9027045
parent feba604e92 d281274bf2
commit dce9027045
4 changed files with 176 additions and 114 deletions
--- a/README.md
+++ b/README.md
@ -150,6 +150,8 @@ The `-o` option allows users to indicate a template for the output file names. T
 - `ext`: The sequence will be replaced by the appropriate extension (like flv or mp4).
 - `epoch`: The sequence will be replaced by the Unix epoch when creating the file.
 - `autonumber`: The sequence will be replaced by a five-digit number that will be increased with each download, starting at zero.
 - `playlist`: The name or the id of the playlist that contains the video.
 - `playlist_index`: The index of the video in the playlist, a five-digit number.
 The current default template is `%(id)s.%(ext)s`, but that will be switched to `%(title)s-%(id)s.%(ext)s` (which can be requested with `-t` at the moment).
--- a/test/test_youtube_lists.py
+++ b/test/test_youtube_lists.py
@ -10,6 +10,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from youtube_dl.InfoExtractors import YoutubeUserIE, YoutubePlaylistIE, YoutubeIE
 from youtube_dl.utils import *
 from youtube_dl.FileDownloader import FileDownloader
 PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json")
 with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
@ -22,7 +23,7 @@ proxy_handler = compat_urllib_request.ProxyHandler()
 opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
 compat_urllib_request.install_opener(opener)
-class FakeDownloader(object):
+class FakeDownloader(FileDownloader):
    def __init__(self):
        self.result = []
        self.params = parameters
@ -30,35 +31,42 @@ class FakeDownloader(object):
        print(s)
    def trouble(self, s):
        raise Exception(s)
-    def download(self, x):
+    def extract_info(self, url):
-        self.result.append(x)
+        self.result.append(url)
        return url
 class TestYoutubeLists(unittest.TestCase):
    def assertIsPlaylist(self,info):
        """Make sure the info has '_type' set to 'playlist'"""
        self.assertEqual(info['_type'], 'playlist')
    def test_youtube_playlist(self):
        dl = FakeDownloader()
        ie = YoutubePlaylistIE(dl)
-        ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')
+        result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')[0]
-        ytie_results = [YoutubeIE()._extract_id(r[0]) for r in dl.result]
+        self.assertIsPlaylist(result)
        ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']]
        self.assertEqual(ytie_results, [ 'bV9L5Ht9LgY', 'FXxLjLQi3Fg', 'tU3Bgo5qJZE'])
    def test_issue_673(self):
        dl = FakeDownloader()
        ie = YoutubePlaylistIE(dl)
-        ie.extract('PLBB231211A4F62143')
+        result = ie.extract('PLBB231211A4F62143')[0]
-        self.assertTrue(len(dl.result) > 40)
+        self.assertTrue(len(result['entries']) > 40)
    def test_youtube_playlist_long(self):
        dl = FakeDownloader()
        ie = YoutubePlaylistIE(dl)
-        ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
+        result = ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')[0]
-        self.assertTrue(len(dl.result) >= 799)
+        self.assertIsPlaylist(result)
        self.assertTrue(len(result['entries']) >= 799)
    def test_youtube_playlist_with_deleted(self):
        #651
        dl = FakeDownloader()
        ie = YoutubePlaylistIE(dl)
-        ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
+        result = ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')[0]
-        ytie_results = [YoutubeIE()._extract_id(r[0]) for r in dl.result]
+        ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']]
        self.assertFalse('pElCt5oNDuI' in ytie_results)
        self.assertFalse('KdPEApIVdWM' in ytie_results)
@ -66,10 +74,11 @@ class TestYoutubeLists(unittest.TestCase):
        dl = FakeDownloader()
        ie = YoutubePlaylistIE(dl)
        # TODO find a > 100 (paginating?) videos course
-        ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
+        result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')[0]
-        self.assertEqual(YoutubeIE()._extract_id(dl.result[0][0]), 'j9WZyLZCBzs')
+        entries = result['entries']
-        self.assertEqual(len(dl.result), 25)
+        self.assertEqual(YoutubeIE()._extract_id(entries[0]['url']), 'j9WZyLZCBzs')
-        self.assertEqual(YoutubeIE()._extract_id(dl.result[-1][0]), 'rYefUsYuEp0')
+        self.assertEqual(len(entries), 25)
        self.assertEqual(YoutubeIE()._extract_id(entries[-1]['url']), 'rYefUsYuEp0')
    def test_youtube_channel(self):
        # I give up, please find a channel that does paginate and test this like test_youtube_playlist_long
@ -78,8 +87,8 @@ class TestYoutubeLists(unittest.TestCase):
    def test_youtube_user(self):
        dl = FakeDownloader()
        ie = YoutubeUserIE(dl)
-        ie.extract('https://www.youtube.com/user/TheLinuxFoundation')
+        result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')[0]
-        self.assertTrue(len(dl.result) >= 320)
+        self.assertTrue(len(result['entries']) >= 320)
 if __name__ == '__main__':
    unittest.main()
--- a/youtube_dl/FileDownloader.py
+++ b/youtube_dl/FileDownloader.py
@ -393,6 +393,8 @@ class FileDownloader(object):
                autonumber_size = 5
            autonumber_templ = u'%0' + str(autonumber_size) + u'd'
            template_dict['autonumber'] = autonumber_templ % self._num_downloads
            if template_dict['playlist_index'] is not None:
                template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']
            sanitize = lambda k,v: sanitize_filename(
                u'NA' if v is None else compat_str(v),
@ -422,10 +424,110 @@ class FileDownloader(object):
            if re.search(rejecttitle, title, re.IGNORECASE):
                return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
        return None
    def extract_info(self, url, download = True):
        '''
        Run the first suitable InfoExtractor on url and process what it returns.

        Returns a list with one processed result (see process_ie_result) for
        each item the extractor returned; the list is empty when the
        extractor returned nothing.
        If 'download', also downloads the videos.

        Returns None when no InfoExtractor is suitable for the url, or when
        the extraction failed and self.trouble() returned instead of raising.
        Unexpected exceptions are re-raised unless the 'ignoreerrors' param
        is set.
        '''
        suitable_found = False
        for ie in self._ies:
            # Go to next InfoExtractor if not suitable
            if not ie.suitable(url):
                continue

            # Warn if the extractor reports that it is not working
            if not ie.working():
                self.to_stderr(u'WARNING: the program functionality for this site has been marked as broken, '
                               u'and will probably not work. If you want to go on, use the -i option.')

            # Suitable InfoExtractor found
            suitable_found = True

            # Extract information from URL and process it
            try:
                # An extractor may return None instead of a list; treat that
                # as "no results" rather than failing on the iteration below.
                ie_results = ie.extract(url) or []
                results = []
                for ie_result in ie_results:
                    # Only tag the result when the extractor has not already
                    # been set somewhere else
                    if 'extractor' not in ie_result:
                        ie_result['extractor'] = ie.IE_NAME
                    results.append(self.process_ie_result(ie_result, download))
                return results
            except ExtractorError as de: # An error we somewhat expected
                self.trouble(u'ERROR: ' + compat_str(de), de.format_traceback())
                break
            except Exception as e:
                if self.params.get('ignoreerrors', False):
                    self.trouble(u'ERROR: ' + compat_str(e), tb=compat_str(traceback.format_exc()))
                    break
                else:
                    raise
        if not suitable_found:
            self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
    def process_ie_result(self, ie_result, download = True):
        """
        Take one result of an ie and return it fully processed.

        The '_type' key of ie_result selects what is done:
        - 'video' (the default when the key is missing): the 'playlist' and
          'playlist_index' keys are set to None if 'playlist' is absent, and
          the same dictionary is returned.
        - 'url': the url is passed to extract_info and the first result is
          returned, or None if extract_info produced no result.
        - 'playlist': each element of the 'entries' key (restricted by the
          'playliststart' and 'playlistend' params) is processed; a copy of
          ie_result whose 'entries' holds the processed entries is returned.
          Entries that could not be resolved are skipped.

        It will also download the videos if 'download'.
        """
        result_type = ie_result.get('_type', 'video') #If not given we suppose it's a video, to support the default old system
        if result_type == 'video':
            if 'playlist' not in ie_result:
                #It isn't part of a playlist
                ie_result['playlist'] = None
                ie_result['playlist_index'] = None
            if download:
                #Do the download:
                self.process_info(ie_result)
            return ie_result
        elif result_type == 'url':
            #We get the video pointed by the url
            results = self.extract_info(ie_result['url'], download)
            if not results:
                #extract_info gives no results when the extraction failed and
                #the error was only reported; there is nothing to return
                return None
            return results[0]
        elif result_type == 'playlist':
            #We process each entry in the playlist
            playlist = ie_result.get('title', None) or ie_result.get('id', None)
            self.to_screen(u'[download] Downloading playlist: %s'  % playlist)

            playlist_results = []

            n_all_entries = len(ie_result['entries'])
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend', -1)

            if playlistend == -1:
                entries = ie_result['entries'][playliststart:]
            else:
                entries = ie_result['entries'][playliststart:playlistend]

            n_entries = len(entries)

            self.to_screen(u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
                (ie_result['extractor'], playlist, n_all_entries, n_entries))

            for i,entry in enumerate(entries,1):
                self.to_screen(u'[download] Downloading video #%s of %s' %(i, n_entries))
                entry_result = self.process_ie_result(entry, False)
                if entry_result is None:
                    #The entry could not be resolved; keep going with the
                    #rest of the playlist instead of failing on it
                    continue
                entry_result['playlist'] = playlist
                entry_result['playlist_index'] = i + playliststart
                #We must do the download here to correctly set the 'playlist' key
                if download:
                    self.process_info(entry_result)
                playlist_results.append(entry_result)
            result = ie_result.copy()
            result['entries'] = playlist_results
            return result
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        #We increment the download count here to match the previous behaviour.
        self.increment_downloads()
        info_dict['fulltitle'] = info_dict['title']
        if len(info_dict['title']) > 200:
            info_dict['title'] = info_dict['title'][:197] + u'...'
@ -564,53 +666,14 @@ class FileDownloader(object):
            raise SameFileError(self.params['outtmpl'])
        for url in url_list:
-            suitable_found = False
+            try:
-            for ie in self._ies:
+                #It also downloads the videos
-                # Go to next InfoExtractor if not suitable
+                videos = self.extract_info(url)
-                if not ie.suitable(url):
+            except UnavailableVideoError:
-                    continue
+                self.trouble(u'\nERROR: unable to download video')
-
+            except MaxDownloadsReached:
-                # Warn if the _WORKING attribute is False
+                self.to_screen(u'[info] Maximum number of downloaded files reached.')
-                if not ie.working():
+                raise
                    self.report_warning(u'the program functionality for this site has been marked as broken, '
                                        u'and will probably not work. If you want to go on, use the -i option.')
                # Suitable InfoExtractor found
                suitable_found = True
                # Extract information from URL and process it
                try:
                    videos = ie.extract(url)
                except ExtractorError as de: # An error we somewhat expected
                    self.trouble(u'ERROR: ' + compat_str(de), de.format_traceback())
                    break
                except MaxDownloadsReached:
                    self.to_screen(u'[info] Maximum number of downloaded files reached.')
                    raise
                except Exception as e:
                    if self.params.get('ignoreerrors', False):
                        self.report_error(u'' + compat_str(e), tb=compat_str(traceback.format_exc()))
                        break
                    else:
                        raise
                if len(videos or []) > 1 and self.fixed_template():
                    raise SameFileError(self.params['outtmpl'])
                for video in videos or []:
                    video['extractor'] = ie.IE_NAME
                    try:
                        self.increment_downloads()
                        self.process_info(video)
                    except UnavailableVideoError:
                        self.to_stderr(u"\n")
                        self.report_error(u'unable to download video')
                # Suitable InfoExtractor had been found; go to next URL
                break
            if not suitable_found:
                self.report_error(u'no suitable InfoExtractor: %s' % url)
        return self._download_retcode
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@ -143,6 +143,28 @@ class InfoExtractor(object):
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')
    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Tag video_info in place with '_type' set to 'video' and return that same dictionary"""
        video_info.update(_type='video')
        return video_info
    def url_result(self, url, ie=None):
        """Build a result of type 'url' for a page that still has to be processed.

        The 'ie' argument is accepted but not used yet.
        """
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url', 'url': url}
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Build a result of type 'playlist' wrapping the given entries.

        The 'id' and 'title' keys are only added when a truthy
        playlist_id / playlist_title is given.
        """
        result = dict(_type='playlist', entries=entries)
        optional_fields = (('id', playlist_id), ('title', playlist_title))
        for key, value in optional_fields:
            if value:
                result[key] = value
        return result
 class YoutubeIE(InfoExtractor):
@ -706,8 +728,7 @@ class MetacafeIE(InfoExtractor):
        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
-            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
+            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]
            return
        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
@ -1348,7 +1369,7 @@ class GenericIE(InfoExtractor):
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
    def _test_redirect(self, url):
-        """Check if it is a redirect, like url shorteners, in case restart chain."""
+        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"
@ -1399,11 +1420,11 @@ class GenericIE(InfoExtractor):
            return False
        self.report_following_redirect(new_url)
-        self._downloader.download([new_url])
+        return new_url
        return True
    def _real_extract(self, url):
-        if self._test_redirect(url): return
+        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]
        video_id = url.split('/')[-1]
        try:
@ -1794,23 +1815,9 @@ class YoutubePlaylistIE(InfoExtractor):
            page_num += 1
        videos = [v[1] for v in sorted(videos)]
        total = len(videos)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]
        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
-        for video in videos:
+        url_results = [self.url_result(url) for url in videos]
-            self._downloader.download([video])
+        return [self.playlist_result(url_results, playlist_id)]
        return
 class YoutubeChannelIE(InfoExtractor):
@ -1860,9 +1867,9 @@ class YoutubeChannelIE(InfoExtractor):
        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
-        for id in video_ids:
+        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
-            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
+        url_entries = [self.url_result(url) for url in urls]
-        return
+        return [self.playlist_result(url_entries, channel_id)]
 class YoutubeUserIE(InfoExtractor):
@ -1932,20 +1939,9 @@ class YoutubeUserIE(InfoExtractor):
            pagenum += 1
-        all_ids_count = len(video_ids)
+        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
-        playliststart = self._downloader.params.get('playliststart', 1) - 1
+        url_results = [self.url_result(url) for url in urls]
-        playlistend = self._downloader.params.get('playlistend', -1)
+        return [self.playlist_result(url_results, playlist_title = username)]
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]
        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
 class BlipTVUserIE(InfoExtractor):
@ -2023,20 +2019,12 @@ class BlipTVUserIE(InfoExtractor):
            pagenum += 1
        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]
        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))
-        for video_id in video_ids:
+        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
-            self._downloader.download([u'http://blip.tv/'+video_id])
+        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
 class DepositFilesIE(InfoExtractor):