[youtube] Improve chapters extraction (closes #13247)

This commit is contained in:
Sergey M․ 2017-06-01 23:29:45 +07:00
parent f7a747ce59
commit 39d4c1be4d
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D
2 changed files with 13 additions and 0 deletions

View file

@@ -254,6 +254,13 @@ class TestYoutubeChapters(unittest.TestCase):
'title': '3 - Из серпов луны...[Iz serpov luny]', 'title': '3 - Из серпов луны...[Iz serpov luny]',
}] }]
), ),
(
# https://www.youtube.com/watch?v=xZW70zEasOk
# time point more than duration
'''● LCS Spring finals: Saturday and Sunday from <a href="#" onclick="yt.www.watch.player.seekTo(13*60+30);return false;">13:30</a> outside the venue! <br />● PAX East: Fri, Sat & Sun - more info in tomorrows video on the main channel!''',
283,
[]
),
] ]
def test_youtube_chapters(self): def test_youtube_chapters(self):

View file

@@ -1353,10 +1353,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
start_time = parse_duration(time_point) start_time = parse_duration(time_point)
if start_time is None: if start_time is None:
continue continue
if start_time > duration:
break
end_time = (duration if next_num == len(chapter_lines) end_time = (duration if next_num == len(chapter_lines)
else parse_duration(chapter_lines[next_num][1])) else parse_duration(chapter_lines[next_num][1]))
if end_time is None: if end_time is None:
continue continue
if end_time > duration:
end_time = duration
if start_time > end_time:
break
chapter_title = re.sub( chapter_title = re.sub(
r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-') r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
chapter_title = re.sub(r'\s+', ' ', chapter_title) chapter_title = re.sub(r'\s+', ' ', chapter_title)