Pull 'extract_videos' out into separate function

This commit is contained in:
Omar Roth 2018-08-10 09:44:19 -05:00
parent 2f8716d97f
commit 15c26d022b
4 changed files with 157 additions and 249 deletions

View file

@ -1283,23 +1283,31 @@ get "/feed/channel/:ucid" do |env|
if !ucid.match(/UC[a-zA-Z0-9_-]{22}/) if !ucid.match(/UC[a-zA-Z0-9_-]{22}/)
rss = client.get("/feeds/videos.xml?user=#{ucid}") rss = client.get("/feeds/videos.xml?user=#{ucid}")
rss = XML.parse_html(rss.body) rss = XML.parse_html(rss.body)
ucid = rss.xpath_node("//feed/channelid") ucid = rss.xpath_node("//feed/channelid")
if !ucid if !ucid
error_message = "User does not exist." error_message = "User does not exist."
halt env, status_code: 404, response: error_message halt env, status_code: 404, response: error_message
end end
next env.redirect "/channel/#{ucid}" ucid = ucid.content
next env.redirect "/feed/channel/#{ucid}"
end end
url = produce_videos_url(ucid) url = produce_videos_url(ucid)
response = client.get(url) response = client.get(url)
response = JSON.parse(response.body) json = JSON.parse(response.body)
if !response["content_html"]?
error_message = "This channel does not exist." if json["content_html"].as_s.empty?
halt env, status_code: 404, response: error_message if response.status_code == 500
error_message = "This channel does not exist."
halt env, status_code: 404, response: error_message
else
next ""
end
end end
content_html = response["content_html"].as_s
content_html = json["content_html"].as_s
document = XML.parse_html(content_html) document = XML.parse_html(content_html)
channel = get_channel(ucid, client, PG_DB, pull_all_videos: false) channel = get_channel(ucid, client, PG_DB, pull_all_videos: false)
@ -1321,7 +1329,8 @@ get "/feed/channel/:ucid" do |env|
xml.element("uri") { xml.text "#{host_url}/channel/#{ucid}" } xml.element("uri") { xml.text "#{host_url}/channel/#{ucid}" }
end end
extract_channel_videos(document, channel.author, ucid).each do |video| nodeset = document.xpath_nodes(%q(//li[contains(@class, "feed-item-container")]))
extract_videos(nodeset).each do |video|
xml.element("entry") do xml.element("entry") do
xml.element("id") { xml.text "yt:video:#{video.id}" } xml.element("id") { xml.text "yt:video:#{video.id}" }
xml.element("yt:videoId") { xml.text video.id } xml.element("yt:videoId") { xml.text video.id }
@ -1480,12 +1489,14 @@ get "/channel/:ucid" do |env|
if !ucid.match(/UC[a-zA-Z0-9_-]{22}/) if !ucid.match(/UC[a-zA-Z0-9_-]{22}/)
rss = client.get("/feeds/videos.xml?user=#{ucid}") rss = client.get("/feeds/videos.xml?user=#{ucid}")
rss = XML.parse_html(rss.body) rss = XML.parse_html(rss.body)
ucid = rss.xpath_node("//feed/channelid") ucid = rss.xpath_node("//feed/channelid")
if !ucid if !ucid
error_message = "User does not exist." error_message = "User does not exist."
next templated "error" next templated "error"
end end
ucid = ucid.content
next env.redirect "/channel/#{ucid}" next env.redirect "/channel/#{ucid}"
end end
@ -1520,7 +1531,7 @@ get "/channel/:ucid" do |env|
id = HTTP::Params.parse(href.query.not_nil!)["v"] id = HTTP::Params.parse(href.query.not_nil!)["v"]
title = node.content title = node.content
videos << ChannelVideo.new(id, title, Time.now, Time.now, ucid, author) videos << ChannelVideo.new(id, title, Time.now, Time.now, "", "")
end end
templated "channel" templated "channel"
@ -2002,54 +2013,24 @@ get "/api/v1/trending" do |env|
trending = XML.parse_html(trending) trending = XML.parse_html(trending)
videos = JSON.build do |json| videos = JSON.build do |json|
json.array do json.array do
trending.xpath_nodes(%q(//ul/li[@class="expanded-shelf-content-item-wrapper"])).each do |node| nodeset = trending.xpath_nodes(%q(//ul/li[@class="expanded-shelf-content-item-wrapper"]))
anchor = node.xpath_node(%q(.//h3/a)).not_nil! extract_videos(nodeset).each do |video|
title = anchor.content
id = anchor["href"].lchop("/watch?v=")
anchor = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-byline")]/a)).not_nil!
author = anchor.content
author_url = anchor["href"]
metadata = node.xpath_nodes(%q(.//div[contains(@class,"yt-lockup-meta")]/ul/li))
if metadata.size == 0
next
elsif metadata.size == 1
view_count = metadata[0].content.rchop(" watching").delete(",").to_i64
published = Time.now
else
published = decode_date(metadata[0].content)
view_count = metadata[1].content.rchop(" views")
if view_count == "No"
view_count = 0_i64
else
view_count = view_count.delete(",").to_i64
end
end
description_html = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-description")]))
description, description_html = html_to_description(description_html)
length_seconds = decode_length_seconds(node.xpath_node(%q(.//span[@class="video-time"])).not_nil!.content)
json.object do json.object do
json.field "title", title json.field "title", video.title
json.field "videoId", id json.field "videoId", video.id
json.field "videoThumbnails" do json.field "videoThumbnails" do
generate_thumbnails(json, id) generate_thumbnails(json, video.id)
end end
json.field "lengthSeconds", length_seconds json.field "lengthSeconds", video.length_seconds
json.field "viewCount", view_count json.field "viewCount", video.views
json.field "author", author json.field "author", video.author
json.field "authorUrl", author_url json.field "authorUrl", "/channel/#{video.ucid}"
json.field "published", published.epoch json.field "published", video.published.epoch
json.field "description", description json.field "description", video.description
json.field "descriptionHtml", description_html json.field "descriptionHtml", video.description_html
end end
end end
end end
@ -2096,16 +2077,17 @@ get "/api/v1/channels/:ucid" do |env|
client = make_client(YT_URL) client = make_client(YT_URL)
if !ucid.match(/UC[a-zA-Z0-9_-]{22}/) if !ucid.match(/UC[a-zA-Z0-9_-]{22}/)
rss = client.get("/feeds/videos.xml?user=#{ucid}").body rss = client.get("/feeds/videos.xml?user=#{ucid}")
rss = XML.parse_html(rss) rss = XML.parse_html(rss.body)
ucid = rss.xpath_node("//feed/channelid") ucid = rss.xpath_node("//feed/channelid")
if ucid if !ucid
ucid = ucid.content
else
env.response.content_type = "application/json" env.response.content_type = "application/json"
next {"error" => "User does not exist"}.to_json next {"error" => "User does not exist"}.to_json
end end
ucid = ucid.content
next env.redirect "/api/v1/channels/#{ucid}"
end end
channel = get_channel(ucid, client, PG_DB, pull_all_videos: false) channel = get_channel(ucid, client, PG_DB, pull_all_videos: false)
@ -2212,25 +2194,36 @@ get "/api/v1/channels/:ucid/videos" do |env|
client = make_client(YT_URL) client = make_client(YT_URL)
if !ucid.match(/UC[a-zA-Z0-9_-]{22}/) if !ucid.match(/UC[a-zA-Z0-9_-]{22}/)
rss = client.get("/feeds/videos.xml?user=#{ucid}").body rss = client.get("/feeds/videos.xml?user=#{ucid}")
rss = XML.parse_html(rss) rss = XML.parse_html(rss.body)
ucid = rss.xpath_node("//feed/channelid") ucid = rss.xpath_node("//feed/channelid")
if ucid if !ucid
ucid = ucid.content
else
env.response.content_type = "application/json" env.response.content_type = "application/json"
next {"error" => "User does not exist"}.to_json next {"error" => "User does not exist"}.to_json
end end
ucid = ucid.content
url = "/api/v1/channels/#{ucid}/videos"
if env.params.query
url += "?#{env.params.query}"
end
next env.redirect url
end end
url = produce_videos_url(ucid, page) url = produce_videos_url(ucid, page)
response = client.get(url) response = client.get(url)
json = JSON.parse(response.body) json = JSON.parse(response.body)
if !json["content_html"]? || json["content_html"].as_s.empty? if !json["content_html"]?
env.response.content_type = "application/json" env.response.content_type = "application/json"
next {"error" => "No videos or nonexistent channel"}.to_json
if response.status_code == 500
response = {"Error" => "Channel does not exist"}.to_json
halt env, status_code: 404, response: response
else
next Array(String).new.to_json
end
end end
content_html = json["content_html"].as_s content_html = json["content_html"].as_s
@ -2242,47 +2235,22 @@ get "/api/v1/channels/:ucid/videos" do |env|
videos = JSON.build do |json| videos = JSON.build do |json|
json.array do json.array do
document.xpath_nodes(%q(//li[contains(@class, "feed-item-container")])).each do |node| nodeset = document.xpath_nodes(%q(//li[contains(@class, "feed-item-container")]))
anchor = node.xpath_node(%q(.//h3[contains(@class,"yt-lockup-title")]/a)).not_nil! extract_videos(nodeset, ucid).each do |video|
title = anchor.content.strip
video_id = anchor["href"].lchop("/watch?v=")
metadata = node.xpath_nodes(%q(.//div[contains(@class,"yt-lockup-meta")]/ul/li))
if metadata.size == 0
next
elsif metadata.size == 1
view_count = metadata[0].content.split(" ")[0].delete(",").to_i64
published = Time.now
else
published = decode_date(metadata[0].content)
view_count = metadata[1].content.split(" ")[0]
if view_count == "No"
view_count = 0_i64
else
view_count = view_count.delete(",").to_i64
end
end
description_html = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-description")]))
description, description_html = html_to_description(description_html)
length_seconds = decode_length_seconds(node.xpath_node(%q(.//span[@class="video-time"])).not_nil!.content)
json.object do json.object do
json.field "title", title json.field "title", video.title
json.field "videoId", video_id json.field "videoId", video.id
json.field "videoThumbnails" do json.field "videoThumbnails" do
generate_thumbnails(json, video_id) generate_thumbnails(json, video.id)
end end
json.field "description", description json.field "description", video.description
json.field "descriptionHtml", description_html json.field "descriptionHtml", video.description_html
json.field "viewCount", view_count json.field "viewCount", video.views
json.field "published", published.epoch json.field "published", video.published.epoch
json.field "lengthSeconds", length_seconds json.field "lengthSeconds", video.length_seconds
end end
end end
end end
@ -2344,7 +2312,7 @@ get "/api/v1/search" do |env|
json.field "description", video.description json.field "description", video.description
json.field "descriptionHtml", video.description_html json.field "descriptionHtml", video.description_html
json.field "viewCount", video.view_count json.field "viewCount", video.views
json.field "published", video.published.epoch json.field "published", video.published.epoch
json.field "lengthSeconds", video.length_seconds json.field "lengthSeconds", video.length_seconds
end end

View file

@ -130,69 +130,3 @@ def fetch_channel(ucid, client, db, pull_all_videos = true)
return channel return channel
end end
# Collects the videos listed in a channel page fragment.
#
# Walks every `li` whose class contains "feed-item-container" in `document`
# and builds one `Video` per usable entry. `author` and `ucid` are not read
# from the markup; they are copied verbatim into every `Video` created.
#
# Entries are skipped when they have no title anchor, when the anchor points
# at googleadservices.com, or when they carry no metadata items at all.
#
# Returns the array of `Video` objects in document order.
def extract_channel_videos(document, author, ucid)
channel_videos = [] of Video
document.xpath_nodes(%q(//li[contains(@class, "feed-item-container")])).each do |node|
anchor = node.xpath_node(%q(.//h3[contains(@class,"yt-lockup-title")]/a))
if !anchor
next
end
# Skip entries whose title link goes to the ad service instead of a video
if anchor["href"].starts_with? "https://www.googleadservices.com"
next
end
title = anchor.content.strip
id = anchor["href"].lchop("/watch?v=")
metadata = node.xpath_nodes(%q(.//div[contains(@class,"yt-lockup-meta")]/ul/li))
if metadata.size == 0
next
elsif metadata.size == 1
# Only one metadata item: its first word is parsed as the view count and
# the publish time defaults to now. NOTE: `to_i64` raises if that word is
# not numeric (there is no "No" guard in this branch).
view_count = metadata[0].content.split(" ")[0].delete(",").to_i64
published = Time.now
else
# Two or more items: the first is the date text, the second the view count
published = decode_date(metadata[0].content)
view_count = metadata[1].content.split(" ")[0]
# A leading "No" (as in "No views") is mapped to zero
if view_count == "No"
view_count = 0_i64
else
view_count = view_count.delete(",").to_i64
end
end
description_html = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-description")]))
description, description_html = html_to_description(description_html)
length_seconds = node.xpath_node(%q(.//span[@class="video-time"]))
if length_seconds
length_seconds = decode_length_seconds(length_seconds.content)
else
# No duration node present; -1 is used as the placeholder length
length_seconds = -1
end
# The length is handed to Video wrapped in an HTTP::Params object
info = HTTP::Params.parse("length_seconds=#{length_seconds}")
# The third argument (Time.now) is presumably the record's "updated"
# timestamp -- TODO confirm against the Video definition.
channel_videos << Video.new(
id,
info,
Time.now,
title,
view_count,
0, # Like count
0, # Dislike count
0.0, # Wilson score
published,
description,
"", # Language,
author,
ucid,
[] of String, # Allowed regions
true, # Is family friendly
"" # Genre
)
end
return channel_videos
end

View file

@ -286,3 +286,91 @@ def html_to_description(description_html)
return description, description_html return description, description_html
end end
# Builds `SearchVideo` records from a set of video "lockup" nodes.
#
# `nodeset` is iterated with `each`; every node is queried with XPath for its
# title anchor, byline, metadata list, description and duration.
#
# `ucid` acts only as a flag here: when it is truthy the byline is not read
# and `author` / `author_id` are left as empty strings. When it is nil the
# byline anchor is required, and the last path segment of its href is used as
# the author id.
#
# A node is skipped when:
# * it has no title anchor, or the anchor links to googleadservices.com
# * `ucid` is nil and there is no byline anchor
# * it contains a "yt-playlist-renderer" or "yt-lockup-movie-top-content" div
# * it has no metadata items
#
# View counts that cannot be parsed as an integer (for example "No views")
# are reported as 0 instead of raising, so a single odd entry does not abort
# extraction of the whole page.
#
# Raises if a "Starts ..." metadata item has no `span` child (`not_nil!`) or
# if that span's `data-timestamp` attribute is not an integer.
#
# Returns the collected `Array(SearchVideo)` in node order.
def extract_videos(nodeset, ucid = nil)
  # TODO: Make this a 'common', so it makes more sense to be used here
  videos = [] of SearchVideo

  nodeset.each do |node|
    anchor = node.xpath_node(%q(.//h3[contains(@class,"yt-lockup-title")]/a))
    if !anchor
      next
    end

    # Skip entries whose title link goes to the ad service instead of a video
    if anchor["href"].starts_with? "https://www.googleadservices.com"
      next
    end

    title = anchor.content.strip
    id = anchor["href"].lchop("/watch?v=")

    if ucid
      # Caller already knows the channel; the byline is not consulted
      author = ""
      author_id = ""
    else
      anchor = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-byline")]/a))
      if !anchor
        next
      end

      author = anchor.content
      author_id = anchor["href"].split("/")[-1]
    end

    # Skip playlists
    if node.xpath_node(%q(.//div[contains(@class, "yt-playlist-renderer")]))
      next
    end

    # Skip movies
    if node.xpath_node(%q(.//div[contains(@class, "yt-lockup-movie-top-content")]))
      next
    end

    metadata = node.xpath_nodes(%q(.//div[contains(@class,"yt-lockup-meta")]/ul/li))
    if metadata.size == 0
      next
    elsif metadata.size == 1
      if metadata[0].content.starts_with? "Starts"
        # Scheduled item: no views yet, start time comes from the span's
        # data-timestamp attribute (seconds since the epoch)
        view_count = 0_i64
        published = Time.epoch(metadata[0].xpath_node(%q(.//span)).not_nil!["data-timestamp"].to_i64)
      else
        # Single item: first word after an optional "Streamed " prefix is the
        # count; anything non-numeric (e.g. "No") falls back to 0
        view_count = metadata[0].content.lchop("Streamed ").split(" ")[0].delete(",").to_i64? || 0_i64
        published = Time.now
      end
    else
      # Two or more items: first is the date text, second the view count.
      # "No views" and any other non-numeric text yield 0.
      published = decode_date(metadata[0].content)
      view_count = metadata[1].content.split(" ")[0].delete(",").to_i64? || 0_i64
    end

    description_html = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-description")]))
    description, description_html = html_to_description(description_html)

    length_seconds = node.xpath_node(%q(.//span[@class="video-time"]))
    if length_seconds
      length_seconds = decode_length_seconds(length_seconds.content)
    else
      # No duration node present; -1 is used as the placeholder length
      length_seconds = -1
    end

    videos << SearchVideo.new(
      title,
      id,
      author,
      author_id,
      published,
      view_count,
      description,
      description_html,
      length_seconds,
    )
  end

  return videos
end

View file

@ -5,7 +5,7 @@ class SearchVideo
author: String, author: String,
ucid: String, ucid: String,
published: Time, published: Time,
view_count: Int64, views: Int64,
description: String, description: String,
description_html: String, description_html: String,
length_seconds: Int32, length_seconds: Int32,
@ -20,90 +20,8 @@ def search(query, page = 1, search_params = build_search_params(content_type: "v
end end
html = XML.parse_html(html) html = XML.parse_html(html)
videos = [] of SearchVideo nodeset = html.xpath_nodes(%q(//ol[@class="item-section"]/li))
videos = extract_videos(nodeset)
html.xpath_nodes(%q(//ol[@class="item-section"]/li)).each do |node|
anchor = node.xpath_node(%q(.//h3[contains(@class,"yt-lockup-title")]/a))
if !anchor
next
end
if anchor["href"].starts_with? "https://www.googleadservices.com"
next
end
title = anchor.content.strip
video_id = anchor["href"].lchop("/watch?v=")
anchor = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-byline")]/a))
if !anchor
next
end
author = anchor.content
author_url = anchor["href"]
ucid = author_url.split("/")[-1]
# Skip playlists
if node.xpath_node(%q(.//ol[contains(@class, "yt-lockup-playlist-items")]))
next
end
metadata = node.xpath_nodes(%q(.//div[contains(@class,"yt-lockup-meta")]/ul/li))
if metadata.size == 0
next
elsif metadata.size == 1
# Skip movies
if metadata[0].content.includes? "·"
next
end
if metadata[0].content.starts_with? "Starts"
view_count = 0_i64
published = Time.epoch(metadata[0].xpath_node(%q(.//span)).not_nil!["data-timestamp"].to_i64)
else
view_count = metadata[0].content.lchop("Streamed ").split(" ")[0].delete(",").to_i64
published = Time.now
end
else
# Skip movies
if metadata[0].content.includes? "·"
next
end
published = decode_date(metadata[0].content)
view_count = metadata[1].content.split(" ")[0]
if view_count == "No"
view_count = 0_i64
else
view_count = view_count.delete(",").to_i64
end
end
description_html = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-description")]))
description, description_html = html_to_description(description_html)
length_seconds = node.xpath_node(%q(.//span[@class="video-time"]))
if length_seconds
length_seconds = decode_length_seconds(length_seconds.content)
else
length_seconds = -1
end
video = SearchVideo.new(
title,
video_id,
author,
ucid,
published,
view_count,
description,
description_html,
length_seconds,
)
videos << video
end
return videos return videos
end end