[Youtube] Rewrite comment extraction (#167)
Closes #121 TODO: * Add an option for the user to specify newest/popular and max number of comments * Refactor the download code and generalize with TabIE * Parse time_text to timestamp
This commit is contained in:
parent
ca87974543
commit
a1c5d2ca64
2 changed files with 300 additions and 171 deletions
|
@ -264,6 +264,7 @@ class InfoExtractor(object):
|
||||||
properties (all but one of text or html optional):
|
properties (all but one of text or html optional):
|
||||||
* "author" - human-readable name of the comment author
|
* "author" - human-readable name of the comment author
|
||||||
* "author_id" - user ID of the comment author
|
* "author_id" - user ID of the comment author
|
||||||
|
* "author_thumbnail" - The thumbnail of the comment author
|
||||||
* "id" - Comment ID
|
* "id" - Comment ID
|
||||||
* "html" - Comment as HTML
|
* "html" - Comment as HTML
|
||||||
* "text" - Plain text of the comment
|
* "text" - Plain text of the comment
|
||||||
|
@ -271,6 +272,12 @@ class InfoExtractor(object):
|
||||||
* "parent" - ID of the comment this one is replying to.
|
* "parent" - ID of the comment this one is replying to.
|
||||||
Set to "root" to indicate that this is a
|
Set to "root" to indicate that this is a
|
||||||
comment to the original video.
|
comment to the original video.
|
||||||
|
* "like_count" - Number of positive ratings of the comment
|
||||||
|
* "dislike_count" - Number of negative ratings of the comment
|
||||||
|
* "is_favorited" - Whether the comment is marked as
|
||||||
|
favorite by the video uploader
|
||||||
|
* "author_is_uploader" - Whether the comment is made by
|
||||||
|
the video uploader
|
||||||
age_limit: Age restriction for the video, as an integer (years)
|
age_limit: Age restriction for the video, as an integer (years)
|
||||||
webpage_url: The URL to the video webpage, if given to yt-dlp it
|
webpage_url: The URL to the video webpage, if given to yt-dlp it
|
||||||
should allow to get the same result again. (It will be set
|
should allow to get the same result again. (It will be set
|
||||||
|
|
|
@ -272,15 +272,21 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
|
||||||
if not self._login():
|
if not self._login():
|
||||||
return
|
return
|
||||||
|
|
||||||
|
_YT_WEB_CLIENT_VERSION = '2.20210301.08.00'
|
||||||
_DEFAULT_API_DATA = {
|
_DEFAULT_API_DATA = {
|
||||||
'context': {
|
'context': {
|
||||||
'client': {
|
'client': {
|
||||||
'clientName': 'WEB',
|
'clientName': 'WEB',
|
||||||
'clientVersion': '2.20210301.08.00',
|
'clientVersion': _YT_WEB_CLIENT_VERSION,
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_DEFAULT_BASIC_API_HEADERS = {
|
||||||
|
'X-YouTube-Client-Name': '1',
|
||||||
|
'X-YouTube-Client-Version': _YT_WEB_CLIENT_VERSION
|
||||||
|
}
|
||||||
|
|
||||||
_YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
|
_YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
|
||||||
_YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
|
_YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
|
||||||
_YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
|
_YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
|
||||||
|
@ -315,6 +321,27 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
|
||||||
self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
|
self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
|
||||||
video_id)
|
video_id)
|
||||||
|
|
||||||
|
def _extract_identity_token(self, webpage, item_id):
|
||||||
|
ytcfg = self._extract_ytcfg(item_id, webpage)
|
||||||
|
if ytcfg:
|
||||||
|
token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
|
||||||
|
if token:
|
||||||
|
return token
|
||||||
|
return self._search_regex(
|
||||||
|
r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
|
||||||
|
'identity token', default=None)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _extract_account_syncid(data):
|
||||||
|
"""Extract syncId required to download private playlists of secondary channels"""
|
||||||
|
sync_ids = (
|
||||||
|
try_get(data, lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], compat_str)
|
||||||
|
or '').split("||")
|
||||||
|
if len(sync_ids) >= 2 and sync_ids[1]:
|
||||||
|
# datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
|
||||||
|
# and just "user_syncid||" for primary channel. We only want the channel_syncid
|
||||||
|
return sync_ids[0]
|
||||||
|
|
||||||
def _extract_ytcfg(self, video_id, webpage):
|
def _extract_ytcfg(self, video_id, webpage):
|
||||||
return self._parse_json(
|
return self._parse_json(
|
||||||
self._search_regex(
|
self._search_regex(
|
||||||
|
@ -1462,6 +1489,270 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
(r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
|
(r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
|
||||||
regex), webpage, name, default='{}'), video_id, fatal=False)
|
regex), webpage, name, default='{}'), video_id, fatal=False)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _join_text_entries(runs):
|
||||||
|
text = None
|
||||||
|
for run in runs:
|
||||||
|
if not isinstance(run, dict):
|
||||||
|
continue
|
||||||
|
sub_text = try_get(run, lambda x: x['text'], compat_str)
|
||||||
|
if sub_text:
|
||||||
|
if not text:
|
||||||
|
text = sub_text
|
||||||
|
continue
|
||||||
|
text += sub_text
|
||||||
|
return text
|
||||||
|
|
||||||
|
def _extract_comment(self, comment_renderer, parent=None):
|
||||||
|
comment_id = comment_renderer.get('commentId')
|
||||||
|
if not comment_id:
|
||||||
|
return
|
||||||
|
comment_text_runs = try_get(comment_renderer, lambda x: x['contentText']['runs']) or []
|
||||||
|
text = self._join_text_entries(comment_text_runs) or ''
|
||||||
|
comment_time_text = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or []
|
||||||
|
time_text = self._join_text_entries(comment_time_text)
|
||||||
|
|
||||||
|
author = try_get(comment_renderer, lambda x: x['authorText']['simpleText'], compat_str)
|
||||||
|
author_id = try_get(comment_renderer,
|
||||||
|
lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
|
||||||
|
votes = str_to_int(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
|
||||||
|
lambda x: x['likeCount']), compat_str)) or 0
|
||||||
|
author_thumbnail = try_get(comment_renderer,
|
||||||
|
lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str)
|
||||||
|
|
||||||
|
author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
|
||||||
|
is_liked = try_get(comment_renderer, lambda x: x['isLiked'], bool)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'id': comment_id,
|
||||||
|
'text': text,
|
||||||
|
# TODO: This should be parsed to timestamp
|
||||||
|
'time_text': time_text,
|
||||||
|
'like_count': votes,
|
||||||
|
'is_favorited': is_liked,
|
||||||
|
'author': author,
|
||||||
|
'author_id': author_id,
|
||||||
|
'author_thumbnail': author_thumbnail,
|
||||||
|
'author_is_uploader': author_is_uploader,
|
||||||
|
'parent': parent or 'root'
|
||||||
|
}
|
||||||
|
|
||||||
|
def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
|
||||||
|
session_token_list, parent=None, comment_counts=None):
|
||||||
|
|
||||||
|
def extract_thread(parent_renderer):
|
||||||
|
contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
|
||||||
|
if not parent:
|
||||||
|
comment_counts[2] = 0
|
||||||
|
for content in contents:
|
||||||
|
comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
|
||||||
|
comment_renderer = try_get(
|
||||||
|
comment_thread_renderer, (lambda x: x['comment']['commentRenderer'], dict)) or try_get(
|
||||||
|
content, (lambda x: x['commentRenderer'], dict))
|
||||||
|
|
||||||
|
if not comment_renderer:
|
||||||
|
continue
|
||||||
|
comment = self._extract_comment(comment_renderer, parent)
|
||||||
|
if not comment:
|
||||||
|
continue
|
||||||
|
comment_counts[0] += 1
|
||||||
|
yield comment
|
||||||
|
# Attempt to get the replies
|
||||||
|
comment_replies_renderer = try_get(
|
||||||
|
comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)
|
||||||
|
|
||||||
|
if comment_replies_renderer:
|
||||||
|
comment_counts[2] += 1
|
||||||
|
comment_entries_iter = self._comment_entries(
|
||||||
|
comment_replies_renderer, identity_token, account_syncid,
|
||||||
|
parent=comment.get('id'), session_token_list=session_token_list,
|
||||||
|
comment_counts=comment_counts)
|
||||||
|
|
||||||
|
for reply_comment in comment_entries_iter:
|
||||||
|
yield reply_comment
|
||||||
|
|
||||||
|
if not comment_counts:
|
||||||
|
# comment so far, est. total comments, current comment thread #
|
||||||
|
comment_counts = [0, 0, 0]
|
||||||
|
headers = self._DEFAULT_BASIC_API_HEADERS.copy()
|
||||||
|
|
||||||
|
# TODO: Generalize the download code with TabIE
|
||||||
|
if identity_token:
|
||||||
|
headers['x-youtube-identity-token'] = identity_token
|
||||||
|
|
||||||
|
if account_syncid:
|
||||||
|
headers['X-Goog-PageId'] = account_syncid
|
||||||
|
headers['X-Goog-AuthUser'] = 0
|
||||||
|
|
||||||
|
continuation = YoutubeTabIE._extract_continuation(root_continuation_data) # TODO
|
||||||
|
first_continuation = False
|
||||||
|
if parent is None:
|
||||||
|
first_continuation = True
|
||||||
|
|
||||||
|
for page_num in itertools.count(0):
|
||||||
|
if not continuation:
|
||||||
|
break
|
||||||
|
retries = self._downloader.params.get('extractor_retries', 3)
|
||||||
|
count = -1
|
||||||
|
last_error = None
|
||||||
|
|
||||||
|
while count < retries:
|
||||||
|
count += 1
|
||||||
|
if last_error:
|
||||||
|
self.report_warning('%s. Retrying ...' % last_error)
|
||||||
|
try:
|
||||||
|
query = {
|
||||||
|
'ctoken': continuation['ctoken'],
|
||||||
|
'pbj': 1,
|
||||||
|
'type': 'next',
|
||||||
|
}
|
||||||
|
if parent:
|
||||||
|
query['action_get_comment_replies'] = 1
|
||||||
|
else:
|
||||||
|
query['action_get_comments'] = 1
|
||||||
|
|
||||||
|
comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
|
||||||
|
if page_num == 0:
|
||||||
|
if first_continuation:
|
||||||
|
note_prefix = "Downloading initial comment continuation page"
|
||||||
|
else:
|
||||||
|
note_prefix = " Downloading comment reply thread %d %s" % (comment_counts[2], comment_prog_str)
|
||||||
|
else:
|
||||||
|
note_prefix = "%sDownloading comment%s page %d %s" % (
|
||||||
|
" " if parent else "",
|
||||||
|
' replies' if parent else '',
|
||||||
|
page_num,
|
||||||
|
comment_prog_str)
|
||||||
|
|
||||||
|
browse = self._download_json(
|
||||||
|
'https://www.youtube.com/comment_service_ajax', None,
|
||||||
|
'%s %s' % (note_prefix, '(retry #%d)' % count if count else ''),
|
||||||
|
headers=headers, query=query,
|
||||||
|
data=urlencode_postdata({
|
||||||
|
'session_token': session_token_list[0]
|
||||||
|
}))
|
||||||
|
except ExtractorError as e:
|
||||||
|
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404, 413):
|
||||||
|
if e.cause.code == 413:
|
||||||
|
self.report_warning("Assumed end of comments (received HTTP Error 413)")
|
||||||
|
return
|
||||||
|
# Downloading page may result in intermittent 5xx HTTP error
|
||||||
|
# Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289
|
||||||
|
last_error = 'HTTP Error %s' % e.cause.code
|
||||||
|
if e.cause.code == 404:
|
||||||
|
last_error = last_error + " (this API is probably deprecated)"
|
||||||
|
if count < retries:
|
||||||
|
continue
|
||||||
|
raise
|
||||||
|
else:
|
||||||
|
session_token = try_get(browse, lambda x: x['xsrf_token'], compat_str)
|
||||||
|
if session_token:
|
||||||
|
session_token_list[0] = session_token
|
||||||
|
|
||||||
|
response = try_get(browse,
|
||||||
|
(lambda x: x['response'],
|
||||||
|
lambda x: x[1]['response'])) or {}
|
||||||
|
|
||||||
|
if response.get('continuationContents'):
|
||||||
|
break
|
||||||
|
|
||||||
|
# YouTube sometimes gives reload: now json if something went wrong (e.g. bad auth)
|
||||||
|
if browse.get('reload'):
|
||||||
|
raise ExtractorError("Invalid or missing params in continuation request", expected=False)
|
||||||
|
|
||||||
|
# TODO: not tested, merged from old extractor
|
||||||
|
err_msg = browse.get('externalErrorMessage')
|
||||||
|
if err_msg:
|
||||||
|
raise ExtractorError('YouTube said: %s' % err_msg, expected=False)
|
||||||
|
|
||||||
|
# Youtube sometimes sends incomplete data
|
||||||
|
# See: https://github.com/ytdl-org/youtube-dl/issues/28194
|
||||||
|
last_error = 'Incomplete data received'
|
||||||
|
if count >= retries:
|
||||||
|
self._downloader.report_error(last_error)
|
||||||
|
|
||||||
|
if not response:
|
||||||
|
break
|
||||||
|
|
||||||
|
known_continuation_renderers = {
|
||||||
|
'itemSectionContinuation': extract_thread,
|
||||||
|
'commentRepliesContinuation': extract_thread
|
||||||
|
}
|
||||||
|
|
||||||
|
# extract next root continuation from the results
|
||||||
|
continuation_contents = try_get(
|
||||||
|
response, lambda x: x['continuationContents'], dict) or {}
|
||||||
|
|
||||||
|
for key, value in continuation_contents.items():
|
||||||
|
if key not in known_continuation_renderers:
|
||||||
|
continue
|
||||||
|
continuation_renderer = value
|
||||||
|
|
||||||
|
if first_continuation:
|
||||||
|
first_continuation = False
|
||||||
|
expected_comment_count = try_get(
|
||||||
|
continuation_renderer,
|
||||||
|
(lambda x: x['header']['commentsHeaderRenderer']['countText']['runs'][0]['text'],
|
||||||
|
lambda x: x['header']['commentsHeaderRenderer']['commentsCount']['runs'][0]['text']),
|
||||||
|
compat_str)
|
||||||
|
|
||||||
|
if expected_comment_count:
|
||||||
|
comment_counts[1] = str_to_int(expected_comment_count)
|
||||||
|
self.to_screen("Downloading ~%d comments" % str_to_int(expected_comment_count))
|
||||||
|
yield comment_counts[1]
|
||||||
|
|
||||||
|
# TODO: cli arg.
|
||||||
|
# 1/True for newest, 0/False for popular (default)
|
||||||
|
comment_sort_index = int(True)
|
||||||
|
sort_continuation_renderer = try_get(
|
||||||
|
continuation_renderer,
|
||||||
|
lambda x: x['header']['commentsHeaderRenderer']['sortMenu']['sortFilterSubMenuRenderer']['subMenuItems']
|
||||||
|
[comment_sort_index]['continuation']['reloadContinuationData'], dict)
|
||||||
|
# If this fails, the initial continuation page
|
||||||
|
# starts off with popular anyways.
|
||||||
|
if sort_continuation_renderer:
|
||||||
|
continuation = YoutubeTabIE._build_continuation_query(
|
||||||
|
continuation=sort_continuation_renderer.get('continuation'),
|
||||||
|
ctp=sort_continuation_renderer.get('clickTrackingParams'))
|
||||||
|
self.to_screen("Sorting comments by %s" % ('popular' if comment_sort_index == 0 else 'newest'))
|
||||||
|
break
|
||||||
|
|
||||||
|
for entry in known_continuation_renderers[key](continuation_renderer):
|
||||||
|
yield entry
|
||||||
|
|
||||||
|
continuation = YoutubeTabIE._extract_continuation(continuation_renderer) # TODO
|
||||||
|
break
|
||||||
|
|
||||||
|
def _extract_comments(self, ytcfg, video_id, contents, webpage, xsrf_token):
|
||||||
|
"""Entry for comment extraction"""
|
||||||
|
comments = []
|
||||||
|
known_entry_comment_renderers = (
|
||||||
|
'itemSectionRenderer',
|
||||||
|
)
|
||||||
|
estimated_total = 0
|
||||||
|
for entry in contents:
|
||||||
|
for key, renderer in entry.items():
|
||||||
|
if key not in known_entry_comment_renderers:
|
||||||
|
continue
|
||||||
|
|
||||||
|
comment_iter = self._comment_entries(
|
||||||
|
renderer,
|
||||||
|
identity_token=self._extract_identity_token(webpage, item_id=video_id),
|
||||||
|
account_syncid=self._extract_account_syncid(ytcfg),
|
||||||
|
session_token_list=[xsrf_token])
|
||||||
|
|
||||||
|
for comment in comment_iter:
|
||||||
|
if isinstance(comment, int):
|
||||||
|
estimated_total = comment
|
||||||
|
continue
|
||||||
|
comments.append(comment)
|
||||||
|
break
|
||||||
|
self.to_screen("Downloaded %d/%d comments" % (len(comments), estimated_total))
|
||||||
|
return {
|
||||||
|
'comments': comments,
|
||||||
|
'comment_count': len(comments),
|
||||||
|
}
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
url, smuggled_data = unsmuggle_url(url, {})
|
url, smuggled_data = unsmuggle_url(url, {})
|
||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
|
@ -2024,156 +2315,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
errnote='Unable to download video annotations', fatal=False,
|
errnote='Unable to download video annotations', fatal=False,
|
||||||
data=urlencode_postdata({xsrf_field_name: xsrf_token}))
|
data=urlencode_postdata({xsrf_field_name: xsrf_token}))
|
||||||
|
|
||||||
# Get comments
|
|
||||||
# TODO: Refactor and move to seperate function
|
|
||||||
def extract_comments():
|
|
||||||
expected_video_comment_count = 0
|
|
||||||
video_comments = []
|
|
||||||
comment_xsrf = xsrf_token
|
|
||||||
|
|
||||||
def find_value(html, key, num_chars=2, separator='"'):
|
|
||||||
pos_begin = html.find(key) + len(key) + num_chars
|
|
||||||
pos_end = html.find(separator, pos_begin)
|
|
||||||
return html[pos_begin: pos_end]
|
|
||||||
|
|
||||||
def search_dict(partial, key):
|
|
||||||
if isinstance(partial, dict):
|
|
||||||
for k, v in partial.items():
|
|
||||||
if k == key:
|
|
||||||
yield v
|
|
||||||
else:
|
|
||||||
for o in search_dict(v, key):
|
|
||||||
yield o
|
|
||||||
elif isinstance(partial, list):
|
|
||||||
for i in partial:
|
|
||||||
for o in search_dict(i, key):
|
|
||||||
yield o
|
|
||||||
|
|
||||||
continuations = []
|
|
||||||
if initial_data:
|
|
||||||
try:
|
|
||||||
ncd = next(search_dict(initial_data, 'nextContinuationData'))
|
|
||||||
continuations = [ncd['continuation']]
|
|
||||||
# Handle videos where comments have been disabled entirely
|
|
||||||
except StopIteration:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def get_continuation(continuation, session_token, replies=False):
|
|
||||||
query = {
|
|
||||||
'pbj': 1,
|
|
||||||
'ctoken': continuation,
|
|
||||||
}
|
|
||||||
if replies:
|
|
||||||
query['action_get_comment_replies'] = 1
|
|
||||||
else:
|
|
||||||
query['action_get_comments'] = 1
|
|
||||||
|
|
||||||
while True:
|
|
||||||
content, handle = self._download_webpage_handle(
|
|
||||||
'https://www.youtube.com/comment_service_ajax',
|
|
||||||
video_id,
|
|
||||||
note=False,
|
|
||||||
expected_status=[413],
|
|
||||||
data=urlencode_postdata({
|
|
||||||
'session_token': session_token
|
|
||||||
}),
|
|
||||||
query=query,
|
|
||||||
headers={
|
|
||||||
'Accept': '*/*',
|
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
|
|
||||||
'X-YouTube-Client-Name': '1',
|
|
||||||
'X-YouTube-Client-Version': '2.20201202.06.01'
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
response_code = handle.getcode()
|
|
||||||
if (response_code == 200):
|
|
||||||
return self._parse_json(content, video_id)
|
|
||||||
if (response_code == 413):
|
|
||||||
return None
|
|
||||||
raise ExtractorError('Unexpected HTTP error code: %s' % response_code)
|
|
||||||
|
|
||||||
first_continuation = True
|
|
||||||
chain_msg = ''
|
|
||||||
self.to_screen('Downloading comments')
|
|
||||||
while continuations:
|
|
||||||
continuation = continuations.pop()
|
|
||||||
comment_response = get_continuation(continuation, comment_xsrf)
|
|
||||||
if not comment_response:
|
|
||||||
continue
|
|
||||||
if list(search_dict(comment_response, 'externalErrorMessage')):
|
|
||||||
raise ExtractorError('Error returned from server: ' + next(search_dict(comment_response, 'externalErrorMessage')))
|
|
||||||
|
|
||||||
if 'continuationContents' not in comment_response['response']:
|
|
||||||
# Something is wrong here. Youtube won't accept this continuation token for some reason and responds with a user satisfaction dialog (error?)
|
|
||||||
continue
|
|
||||||
# not sure if this actually helps
|
|
||||||
if 'xsrf_token' in comment_response:
|
|
||||||
comment_xsrf = comment_response['xsrf_token']
|
|
||||||
|
|
||||||
item_section = comment_response['response']['continuationContents']['itemSectionContinuation']
|
|
||||||
if first_continuation:
|
|
||||||
expected_video_comment_count = int(item_section['header']['commentsHeaderRenderer']['countText']['runs'][0]['text'].replace(' Comments', '').replace('1 Comment', '1').replace(',', ''))
|
|
||||||
first_continuation = False
|
|
||||||
if 'contents' not in item_section:
|
|
||||||
# continuation returned no comments?
|
|
||||||
# set an empty array as to not break the for loop
|
|
||||||
item_section['contents'] = []
|
|
||||||
|
|
||||||
for meta_comment in item_section['contents']:
|
|
||||||
comment = meta_comment['commentThreadRenderer']['comment']['commentRenderer']
|
|
||||||
video_comments.append({
|
|
||||||
'id': comment['commentId'],
|
|
||||||
'text': ''.join([c['text'] for c in try_get(comment, lambda x: x['contentText']['runs'], list) or []]),
|
|
||||||
'time_text': ''.join([c['text'] for c in comment['publishedTimeText']['runs']]),
|
|
||||||
'author': comment.get('authorText', {}).get('simpleText', ''),
|
|
||||||
'votes': comment.get('voteCount', {}).get('simpleText', '0'),
|
|
||||||
'author_thumbnail': comment['authorThumbnail']['thumbnails'][-1]['url'],
|
|
||||||
'parent': 'root'
|
|
||||||
})
|
|
||||||
if 'replies' not in meta_comment['commentThreadRenderer']:
|
|
||||||
continue
|
|
||||||
|
|
||||||
reply_continuations = [rcn['nextContinuationData']['continuation'] for rcn in meta_comment['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations']]
|
|
||||||
while reply_continuations:
|
|
||||||
time.sleep(1)
|
|
||||||
continuation = reply_continuations.pop()
|
|
||||||
replies_data = get_continuation(continuation, comment_xsrf, True)
|
|
||||||
if not replies_data or 'continuationContents' not in replies_data[1]['response']:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if self._downloader.params.get('verbose', False):
|
|
||||||
chain_msg = ' (chain %s)' % comment['commentId']
|
|
||||||
self.to_screen('Comments downloaded: %d of ~%d%s' % (len(video_comments), expected_video_comment_count, chain_msg))
|
|
||||||
reply_comment_meta = replies_data[1]['response']['continuationContents']['commentRepliesContinuation']
|
|
||||||
for reply_meta in reply_comment_meta.get('contents', {}):
|
|
||||||
reply_comment = reply_meta['commentRenderer']
|
|
||||||
video_comments.append({
|
|
||||||
'id': reply_comment['commentId'],
|
|
||||||
'text': ''.join([c['text'] for c in reply_comment['contentText']['runs']]),
|
|
||||||
'time_text': ''.join([c['text'] for c in reply_comment['publishedTimeText']['runs']]),
|
|
||||||
'author': reply_comment.get('authorText', {}).get('simpleText', ''),
|
|
||||||
'votes': reply_comment.get('voteCount', {}).get('simpleText', '0'),
|
|
||||||
'author_thumbnail': reply_comment['authorThumbnail']['thumbnails'][-1]['url'],
|
|
||||||
'parent': comment['commentId']
|
|
||||||
})
|
|
||||||
if 'continuations' not in reply_comment_meta or len(reply_comment_meta['continuations']) == 0:
|
|
||||||
continue
|
|
||||||
reply_continuations += [rcn['nextContinuationData']['continuation'] for rcn in reply_comment_meta['continuations']]
|
|
||||||
|
|
||||||
self.to_screen('Comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count))
|
|
||||||
if 'continuations' in item_section:
|
|
||||||
continuations += [ncd['nextContinuationData']['continuation'] for ncd in item_section['continuations']]
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
self.to_screen('Total comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count))
|
|
||||||
return {
|
|
||||||
'comments': video_comments,
|
|
||||||
'comment_count': expected_video_comment_count
|
|
||||||
}
|
|
||||||
|
|
||||||
if get_comments:
|
if get_comments:
|
||||||
info['__post_extractor'] = extract_comments
|
info['__post_extractor'] = lambda: self._extract_comments(ytcfg, video_id, contents, webpage, xsrf_token)
|
||||||
|
|
||||||
self.mark_watched(video_id, player_response)
|
self.mark_watched(video_id, player_response)
|
||||||
|
|
||||||
|
@ -3031,27 +3174,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
|
||||||
if err_msg:
|
if err_msg:
|
||||||
raise ExtractorError('YouTube said: %s' % err_msg, expected=expected)
|
raise ExtractorError('YouTube said: %s' % err_msg, expected=expected)
|
||||||
|
|
||||||
def _extract_identity_token(self, webpage, item_id):
|
|
||||||
ytcfg = self._extract_ytcfg(item_id, webpage)
|
|
||||||
if ytcfg:
|
|
||||||
token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
|
|
||||||
if token:
|
|
||||||
return token
|
|
||||||
return self._search_regex(
|
|
||||||
r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
|
|
||||||
'identity token', default=None)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _extract_account_syncid(data):
|
|
||||||
"""Extract syncId required to download private playlists of secondary channels"""
|
|
||||||
sync_ids = (
|
|
||||||
try_get(data, lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], compat_str)
|
|
||||||
or '').split("||")
|
|
||||||
if len(sync_ids) >= 2 and sync_ids[1]:
|
|
||||||
# datasyncid is of the form "channel_syncid||user_syncid" for secondary channel
|
|
||||||
# and just "user_syncid||" for primary channel. We only want the channel_syncid
|
|
||||||
return sync_ids[0]
|
|
||||||
|
|
||||||
def _extract_webpage(self, url, item_id):
|
def _extract_webpage(self, url, item_id):
|
||||||
retries = self._downloader.params.get('extractor_retries', 3)
|
retries = self._downloader.params.get('extractor_retries', 3)
|
||||||
count = -1
|
count = -1
|
||||||
|
|
Loading…
Reference in a new issue