[Youtube] Rewrite comment extraction (#167)

Closes #121 TODO: * Add an option for the user to specify newest/popular and max number of comments * Refactor the download code and generalize with TabIE * Parse time_text to timestamp
2021-03-14 22:41:11 +00:00 · 2021-03-14 22:41:11 +00:00 · a1c5d2ca64
commit a1c5d2ca64
parent ca87974543
2 changed files with 300 additions and 171 deletions
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -264,6 +264,7 @@ class InfoExtractor(object):
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
@ -271,6 +272,12 @@ class InfoExtractor(object):
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to yt-dlp it
                    should allow to get the same result again. (It will be set
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@ -272,15 +272,21 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
        if not self._login():
            return
    _YT_WEB_CLIENT_VERSION = '2.20210301.08.00'
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
-                'clientVersion': '2.20210301.08.00',
+                'clientVersion': _YT_WEB_CLIENT_VERSION,
            }
        },
    }
    _DEFAULT_BASIC_API_HEADERS = {
        'X-YouTube-Client-Name': '1',
        'X-YouTube-Client-Version': _YT_WEB_CLIENT_VERSION
    }
    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
@ -315,6 +321,27 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)
    def _extract_identity_token(self, webpage, item_id):
        """Return the identity token for the page, if one can be found.

        The result of self._extract_ytcfg(item_id, webpage) is consulted
        first (its 'ID_TOKEN' entry, which must be a string). If that gives
        nothing, the raw webpage is searched for an ID_TOKEN assignment; the
        regex search is performed with default=None.
        """
        config = self._extract_ytcfg(item_id, webpage)
        identity_token = (
            try_get(config, lambda cfg: cfg['ID_TOKEN'], compat_str)
            if config else None)
        if identity_token:
            return identity_token
        # Fall back to scraping the token straight out of the page source
        return self._search_regex(
            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
            'identity token', default=None)
    @staticmethod
    def _extract_account_syncid(data):
        """Extract syncId required to download private playlists of secondary channels

        Reads data['responseContext']['mainAppWebResponseContext']['datasyncId']
        and returns its first "||"-separated field, but only when a non-empty
        second field is present. Returns None in every other case.
        """
        datasync_id = try_get(
            data, lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], compat_str)
        fields = (datasync_id or '').split("||")
        # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
        # and just "user_syncid||" for primary channel. We only want the channel_syncid
        if len(fields) < 2 or not fields[1]:
            return None
        return fields[0]
    def _extract_ytcfg(self, video_id, webpage):
        return self._parse_json(
            self._search_regex(
@ -1462,6 +1489,270 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
             regex), webpage, name, default='{}'), video_id, fatal=False)
    @staticmethod
    def _join_text_entries(runs):
        """Concatenate the 'text' strings of a sequence of run dicts.

        Entries that are not dicts, or whose 'text' is missing, empty or not
        a string, are ignored. Returns None when nothing was concatenated.
        """
        fragments = [
            try_get(run, lambda x: x['text'], compat_str)
            for run in runs if isinstance(run, dict)]
        # An all-empty result collapses to None, like the unset accumulator would
        return ''.join(fragment for fragment in fragments if fragment) or None
    def _extract_comment(self, comment_renderer, parent=None):
        """Build a comment info dict from a single commentRenderer dict.

        parent is the ID of the comment this one replies to; a falsy value is
        stored as 'root'. Returns None when the renderer has no 'commentId'.
        Fields that cannot be found in the renderer are left as None, except
        'text' (empty string) and 'like_count' (0).
        """
        comment_id = comment_renderer.get('commentId')
        if not comment_id:
            return None

        body_runs = try_get(comment_renderer, lambda x: x['contentText']['runs']) or []
        body = self._join_text_entries(body_runs) or ''

        published_runs = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or []
        published = self._join_text_entries(published_runs)

        author_name = try_get(
            comment_renderer, lambda x: x['authorText']['simpleText'], compat_str)
        author_channel_id = try_get(
            comment_renderer,
            lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)

        # Two possible locations for the vote count; first string found wins
        vote_getters = (
            lambda x: x['voteCount']['simpleText'],
            lambda x: x['likeCount'])
        like_count = str_to_int(try_get(comment_renderer, vote_getters, compat_str)) or 0

        avatar_url = try_get(
            comment_renderer,
            lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str)
        by_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
        liked = try_get(comment_renderer, lambda x: x['isLiked'], bool)

        return {
            'id': comment_id,
            'text': body,
            # TODO: This should be parsed to timestamp
            'time_text': published,
            'like_count': like_count,
            'is_favorited': liked,
            'author': author_name,
            'author_id': author_channel_id,
            'author_thumbnail': avatar_url,
            'author_is_uploader': by_uploader,
            'parent': parent or 'root'
        }
    def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
                         session_token_list, parent=None, comment_counts=None):
        """Generate the comments reachable from a continuation, page by page.

        Parameters:
        root_continuation_data -- renderer dict from which the first
                                  continuation is extracted
        identity_token         -- if truthy, sent as the
                                  'x-youtube-identity-token' header
        account_syncid         -- if truthy, sent as 'X-Goog-PageId' (together
                                  with 'X-Goog-AuthUser': 0)
        session_token_list     -- one-element list holding the session token;
                                  element 0 is posted with every request and is
                                  overwritten in place whenever a response
                                  carries a new 'xsrf_token', so recursive
                                  calls share the latest token
        parent                 -- ID of the comment whose replies are being
                                  fetched; None for top-level comments
        comment_counts         -- shared mutable list
                                  [comments so far, est. total, current thread #]

        Yields comment dicts as produced by _extract_comment, with replies
        yielded right after the comment they belong to. On the first
        top-level page, if the header provides a comment count, that
        estimated total is yielded as well (before any comment dict).

        Raises ExtractorError when a request keeps failing after the
        configured 'extractor_retries', or when the server answers with a
        'reload' or 'externalErrorMessage' payload. An HTTP 413 is treated as
        the end of the comments and ends the generator.
        """

        def extract_thread(parent_renderer):
            """Yield the comments of one continuation renderer, recursing into replies."""
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            if not parent:
                # Top-level page: reset the reply thread counter
                comment_counts[2] = 0
            for content in contents:
                comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
                # `dict` must be the expected type (3rd argument). Placed inside the
                # getter tuple it acts as a fallback getter that returns a copy of
                # the source dict, and the result is never type-checked.
                comment_renderer = try_get(
                    comment_thread_renderer, lambda x: x['comment']['commentRenderer'], dict) or try_get(
                    content, lambda x: x['commentRenderer'], dict)
                if not comment_renderer:
                    continue
                comment = self._extract_comment(comment_renderer, parent)
                if not comment:
                    continue
                comment_counts[0] += 1
                yield comment
                # Attempt to get the replies
                comment_replies_renderer = try_get(
                    comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)
                if comment_replies_renderer:
                    comment_counts[2] += 1
                    comment_entries_iter = self._comment_entries(
                        comment_replies_renderer, identity_token, account_syncid,
                        parent=comment.get('id'), session_token_list=session_token_list,
                        comment_counts=comment_counts)
                    for reply_comment in comment_entries_iter:
                        yield reply_comment

        if not comment_counts:
            # comment so far, est. total comments, current comment thread #
            comment_counts = [0, 0, 0]
        headers = self._DEFAULT_BASIC_API_HEADERS.copy()
        # TODO: Generalize the download code with TabIE
        if identity_token:
            headers['x-youtube-identity-token'] = identity_token
        if account_syncid:
            headers['X-Goog-PageId'] = account_syncid
            headers['X-Goog-AuthUser'] = 0
        continuation = YoutubeTabIE._extract_continuation(root_continuation_data)  # TODO
        first_continuation = False
        if parent is None:
            first_continuation = True
        for page_num in itertools.count(0):
            if not continuation:
                break
            retries = self._downloader.params.get('extractor_retries', 3)
            count = -1
            last_error = None
            while count < retries:
                count += 1
                if last_error:
                    self.report_warning('%s. Retrying ...' % last_error)
                try:
                    query = {
                        'ctoken': continuation['ctoken'],
                        'pbj': 1,
                        'type': 'next',
                    }
                    if parent:
                        query['action_get_comment_replies'] = 1
                    else:
                        query['action_get_comments'] = 1
                    comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
                    if page_num == 0:
                        if first_continuation:
                            note_prefix = "Downloading initial comment continuation page"
                        else:
                            note_prefix = "    Downloading comment reply thread %d %s" % (comment_counts[2], comment_prog_str)
                    else:
                        note_prefix = "%sDownloading comment%s page %d %s" % (
                            "       " if parent else "",
                            ' replies' if parent else '',
                            page_num,
                            comment_prog_str)
                    browse = self._download_json(
                        'https://www.youtube.com/comment_service_ajax', None,
                        '%s %s' % (note_prefix, '(retry #%d)' % count if count else ''),
                        headers=headers, query=query,
                        data=urlencode_postdata({
                            'session_token': session_token_list[0]
                        }))
                except ExtractorError as e:
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404, 413):
                        if e.cause.code == 413:
                            self.report_warning("Assumed end of comments (received HTTP Error 413)")
                            return
                        # Downloading page may result in intermittent 5xx HTTP error
                        # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                        last_error = 'HTTP Error %s' % e.cause.code
                        if e.cause.code == 404:
                            last_error = last_error + " (this API is probably deprecated)"
                        if count < retries:
                            continue
                    raise
                else:
                    # Keep the shared session token up to date for later requests
                    session_token = try_get(browse, lambda x: x['xsrf_token'], compat_str)
                    if session_token:
                        session_token_list[0] = session_token
                    # The payload is either a dict with 'response', or a list
                    # whose second element holds it
                    response = try_get(browse,
                                       (lambda x: x['response'],
                                        lambda x: x[1]['response'])) or {}
                    if response.get('continuationContents'):
                        break
                    # YouTube sometimes gives reload: now json if something went wrong (e.g. bad auth)
                    if browse.get('reload'):
                        raise ExtractorError("Invalid or missing params in continuation request", expected=False)
                    # TODO: not tested, merged from old extractor
                    err_msg = browse.get('externalErrorMessage')
                    if err_msg:
                        raise ExtractorError('YouTube said: %s' % err_msg, expected=False)
                    # Youtube sometimes sends incomplete data
                    # See: https://github.com/ytdl-org/youtube-dl/issues/28194
                    last_error = 'Incomplete data received'
                    if count >= retries:
                        self._downloader.report_error(last_error)
            if not response:
                break
            known_continuation_renderers = {
                'itemSectionContinuation': extract_thread,
                'commentRepliesContinuation': extract_thread
            }
            # extract next root continuation from the results
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict) or {}
            for key, value in continuation_contents.items():
                if key not in known_continuation_renderers:
                    continue
                continuation_renderer = value
                if first_continuation:
                    first_continuation = False
                    expected_comment_count = try_get(
                        continuation_renderer,
                        (lambda x: x['header']['commentsHeaderRenderer']['countText']['runs'][0]['text'],
                         lambda x: x['header']['commentsHeaderRenderer']['commentsCount']['runs'][0]['text']),
                        compat_str)
                    if expected_comment_count:
                        comment_counts[1] = str_to_int(expected_comment_count)
                        self.to_screen("Downloading ~%d comments" % str_to_int(expected_comment_count))
                        # Hand the estimated total to the consumer
                        yield comment_counts[1]
                    # TODO: cli arg.
                    # 1/True for newest, 0/False for popular (default)
                    comment_sort_index = int(True)
                    sort_continuation_renderer = try_get(
                        continuation_renderer,
                        lambda x: x['header']['commentsHeaderRenderer']['sortMenu']['sortFilterSubMenuRenderer']['subMenuItems']
                        [comment_sort_index]['continuation']['reloadContinuationData'], dict)
                    # If this fails, the initial continuation page
                    # starts off with popular anyways.
                    if sort_continuation_renderer:
                        # Restart from the sorted continuation; the comments of
                        # this first response are not extracted
                        continuation = YoutubeTabIE._build_continuation_query(
                            continuation=sort_continuation_renderer.get('continuation'),
                            ctp=sort_continuation_renderer.get('clickTrackingParams'))
                        self.to_screen("Sorting comments by %s" % ('popular' if comment_sort_index == 0 else 'newest'))
                        break
                for entry in known_continuation_renderers[key](continuation_renderer):
                    yield entry
                continuation = YoutubeTabIE._extract_continuation(continuation_renderer)  # TODO
                break
            else:
                # No recognised renderer in this response, so `continuation` was
                # not advanced. Stop here instead of requesting the same page
                # again on every iteration of the unbounded outer loop.
                break
    def _extract_comments(self, ytcfg, video_id, contents, webpage, xsrf_token):
        """Collect all comments of a video into a list.

        Each entry of contents is scanned for an 'itemSectionRenderer'; the
        first one found in an entry is fed to _comment_entries. Integers
        coming out of that generator are taken as the estimated total, any
        other item is stored as a comment.

        Returns a dict with 'comments' (the collected list) and
        'comment_count' (the number of comments actually collected).
        """
        known_entry_comment_renderers = (
            'itemSectionRenderer',
        )
        collected = []
        estimated_total = 0
        for entry in contents:
            matching_renderers = [
                renderer for key, renderer in entry.items()
                if key in known_entry_comment_renderers]
            if not matching_renderers:
                continue
            # Only the first known renderer of an entry is processed
            token = self._extract_identity_token(webpage, item_id=video_id)
            syncid = self._extract_account_syncid(ytcfg)
            entries = self._comment_entries(
                matching_renderers[0],
                identity_token=token,
                account_syncid=syncid,
                session_token_list=[xsrf_token])
            for item in entries:
                if isinstance(item, int):
                    # Not a comment: the generator reporting its estimated total
                    estimated_total = item
                else:
                    collected.append(item)
        self.to_screen("Downloaded %d/%d comments" % (len(collected), estimated_total))
        return {
            'comments': collected,
            'comment_count': len(collected),
        }
    def _real_extract(self, url):
        url, smuggled_data = unsmuggle_url(url, {})
        video_id = self._match_id(url)
@ -2024,156 +2315,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    errnote='Unable to download video annotations', fatal=False,
                    data=urlencode_postdata({xsrf_field_name: xsrf_token}))
        # Get comments
        # TODO: Refactor and move to separate function
        def extract_comments():
            """Download the comments of the video through comment_service_ajax.

            Uses xsrf_token, initial_data and video_id from the enclosing scope.
            Returns a dict with 'comments' (list of comment dicts; replies carry
            their parent's comment ID in 'parent', top-level comments 'root') and
            'comment_count' (the count parsed from the comments header of the
            first page, or 0 if no page was processed).
            """
            expected_video_comment_count = 0
            video_comments = []
            comment_xsrf = xsrf_token
            # NOTE(review): find_value is not called anywhere inside extract_comments
            def find_value(html, key, num_chars=2, separator='"'):
                """Return the text that starts num_chars after the end of key and runs up to separator."""
                pos_begin = html.find(key) + len(key) + num_chars
                pos_end = html.find(separator, pos_begin)
                return html[pos_begin: pos_end]
            def search_dict(partial, key):
                """Recursively yield every value stored under key in nested dicts/lists.

                The value of a matching key is yielded as is and not searched further.
                """
                if isinstance(partial, dict):
                    for k, v in partial.items():
                        if k == key:
                            yield v
                        else:
                            for o in search_dict(v, key):
                                yield o
                elif isinstance(partial, list):
                    for i in partial:
                        for o in search_dict(i, key):
                            yield o
            # Seed the work list with the first nextContinuationData found in the page data
            continuations = []
            if initial_data:
                try:
                    ncd = next(search_dict(initial_data, 'nextContinuationData'))
                    continuations = [ncd['continuation']]
                # Handle videos where comments have been disabled entirely
                except StopIteration:
                    pass
            def get_continuation(continuation, session_token, replies=False):
                """Request one continuation page.

                Returns the parsed JSON on HTTP 200 and None on HTTP 413; raises
                ExtractorError for any other status code.
                """
                query = {
                    'pbj': 1,
                    'ctoken': continuation,
                }
                if replies:
                    query['action_get_comment_replies'] = 1
                else:
                    query['action_get_comments'] = 1
                # Every path in the loop body returns or raises, so it runs at most once
                while True:
                    content, handle = self._download_webpage_handle(
                        'https://www.youtube.com/comment_service_ajax',
                        video_id,
                        note=False,
                        expected_status=[413],
                        data=urlencode_postdata({
                            'session_token': session_token
                        }),
                        query=query,
                        headers={
                            'Accept': '*/*',
                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
                            'X-YouTube-Client-Name': '1',
                            'X-YouTube-Client-Version': '2.20201202.06.01'
                        }
                    )
                    response_code = handle.getcode()
                    if (response_code == 200):
                        return self._parse_json(content, video_id)
                    if (response_code == 413):
                        return None
                    raise ExtractorError('Unexpected HTTP error code: %s' % response_code)
            first_continuation = True
            chain_msg = ''
            self.to_screen('Downloading comments')
            # Process top-level continuations until none are left
            while continuations:
                continuation = continuations.pop()
                comment_response = get_continuation(continuation, comment_xsrf)
                if not comment_response:
                    continue
                if list(search_dict(comment_response, 'externalErrorMessage')):
                    raise ExtractorError('Error returned from server: ' + next(search_dict(comment_response, 'externalErrorMessage')))
                if 'continuationContents' not in comment_response['response']:
                    # Something is wrong here. Youtube won't accept this continuation token for some reason and responds with a user satisfaction dialog (error?)
                    continue
                # not sure if this actually helps
                if 'xsrf_token' in comment_response:
                    comment_xsrf = comment_response['xsrf_token']
                item_section = comment_response['response']['continuationContents']['itemSectionContinuation']
                if first_continuation:
                    # The header text of the first page is reduced to a plain number
                    expected_video_comment_count = int(item_section['header']['commentsHeaderRenderer']['countText']['runs'][0]['text'].replace(' Comments', '').replace('1 Comment', '1').replace(',', ''))
                    first_continuation = False
                if 'contents' not in item_section:
                    # continuation returned no comments?
                    # set an empty array as to not break the for loop
                    item_section['contents'] = []
                for meta_comment in item_section['contents']:
                    comment = meta_comment['commentThreadRenderer']['comment']['commentRenderer']
                    video_comments.append({
                        'id': comment['commentId'],
                        'text': ''.join([c['text'] for c in try_get(comment, lambda x: x['contentText']['runs'], list) or []]),
                        'time_text': ''.join([c['text'] for c in comment['publishedTimeText']['runs']]),
                        'author': comment.get('authorText', {}).get('simpleText', ''),
                        'votes': comment.get('voteCount', {}).get('simpleText', '0'),
                        'author_thumbnail': comment['authorThumbnail']['thumbnails'][-1]['url'],
                        'parent': 'root'
                    })
                    if 'replies' not in meta_comment['commentThreadRenderer']:
                        continue
                    # Walk the reply continuations of this comment until exhausted
                    reply_continuations = [rcn['nextContinuationData']['continuation'] for rcn in meta_comment['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations']]
                    while reply_continuations:
                        # Pause for one second before each reply request
                        time.sleep(1)
                        continuation = reply_continuations.pop()
                        replies_data = get_continuation(continuation, comment_xsrf, True)
                        # Reply payloads are indexed as a sequence; element 1 holds 'response'
                        if not replies_data or 'continuationContents' not in replies_data[1]['response']:
                            continue
                        if self._downloader.params.get('verbose', False):
                            chain_msg = ' (chain %s)' % comment['commentId']
                        self.to_screen('Comments downloaded: %d of ~%d%s' % (len(video_comments), expected_video_comment_count, chain_msg))
                        reply_comment_meta = replies_data[1]['response']['continuationContents']['commentRepliesContinuation']
                        for reply_meta in reply_comment_meta.get('contents', {}):
                            reply_comment = reply_meta['commentRenderer']
                            video_comments.append({
                                'id': reply_comment['commentId'],
                                'text': ''.join([c['text'] for c in reply_comment['contentText']['runs']]),
                                'time_text': ''.join([c['text'] for c in reply_comment['publishedTimeText']['runs']]),
                                'author': reply_comment.get('authorText', {}).get('simpleText', ''),
                                'votes': reply_comment.get('voteCount', {}).get('simpleText', '0'),
                                'author_thumbnail': reply_comment['authorThumbnail']['thumbnails'][-1]['url'],
                                'parent': comment['commentId']
                            })
                        if 'continuations' not in reply_comment_meta or len(reply_comment_meta['continuations']) == 0:
                            continue
                        reply_continuations += [rcn['nextContinuationData']['continuation'] for rcn in reply_comment_meta['continuations']]
                self.to_screen('Comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count))
                # Queue the follow-up pages announced by this page
                if 'continuations' in item_section:
                    continuations += [ncd['nextContinuationData']['continuation'] for ncd in item_section['continuations']]
                # Pause for one second before the next top-level request
                time.sleep(1)
            self.to_screen('Total comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count))
            return {
                'comments': video_comments,
                'comment_count': expected_video_comment_count
            }
        if get_comments:
-            info['__post_extractor'] = extract_comments
+            info['__post_extractor'] = lambda: self._extract_comments(ytcfg, video_id, contents, webpage, xsrf_token)
        self.mark_watched(video_id, player_response)
@ -3031,27 +3174,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
        if err_msg:
            raise ExtractorError('YouTube said: %s' % err_msg, expected=expected)
    def _extract_identity_token(self, webpage, item_id):
        """Return the identity token for the page, if one can be found.

        The result of self._extract_ytcfg(item_id, webpage) is consulted
        first (its 'ID_TOKEN' entry, which must be a string). If that gives
        nothing, the raw webpage is searched for an ID_TOKEN assignment; the
        regex search is performed with default=None.
        """
        config = self._extract_ytcfg(item_id, webpage)
        identity_token = (
            try_get(config, lambda cfg: cfg['ID_TOKEN'], compat_str)
            if config else None)
        if identity_token:
            return identity_token
        # Fall back to scraping the token straight out of the page source
        return self._search_regex(
            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
            'identity token', default=None)
    @staticmethod
    def _extract_account_syncid(data):
        """Extract syncId required to download private playlists of secondary channels

        Reads data['responseContext']['mainAppWebResponseContext']['datasyncId']
        and returns its first "||"-separated field, but only when a non-empty
        second field is present. Returns None in every other case.
        """
        datasync_id = try_get(
            data, lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], compat_str)
        fields = (datasync_id or '').split("||")
        # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
        # and just "user_syncid||" for primary channel. We only want the channel_syncid
        if len(fields) < 2 or not fields[1]:
            return None
        return fields[0]
    def _extract_webpage(self, url, item_id):
        retries = self._downloader.params.get('extractor_retries', 3)
        count = -1