Add support for playlists and channels in search

This commit is contained in:
Omar Roth 2018-09-20 09:36:09 -05:00
parent 1627cfc2fa
commit 62380933b2
11 changed files with 414 additions and 210 deletions

View file

@ -434,6 +434,7 @@ get "/search" do |env|
ucids ||= [] of String
channel = nil
content_type = "all"
date = ""
duration = ""
features = [] of String
@ -447,6 +448,8 @@ get "/search" do |env|
case key
when "channel", "user"
channel = value
when "content_type", "type"
content_type = value
when "date"
date = value
when "duration"
@ -475,7 +478,7 @@ get "/search" do |env|
count = videos.size
else
begin
search_params = produce_search_params(sort: sort, date: date, content_type: "video",
search_params = produce_search_params(sort: sort, date: date, content_type: content_type,
duration: duration, features: features)
rescue ex
error_message = ex.message
@ -1333,12 +1336,12 @@ get "/feed/subscriptions" do |env|
end
videos = PG_DB.query_all("SELECT DISTINCT ON (ucid) * FROM channel_videos WHERE \
ucid IN (#{ucids}) AND id NOT IN (#{watched}) ORDER BY ucid, published DESC",
ucid IN (#{ucids}) AND id NOT IN (#{watched}) ORDER BY ucid, published DESC",
user.subscriptions + user.watched, as: ChannelVideo)
else
args = arg_array(user.subscriptions)
videos = PG_DB.query_all("SELECT DISTINCT ON (ucid) * FROM channel_videos WHERE \
ucid IN (#{args}) ORDER BY ucid, published DESC", user.subscriptions, as: ChannelVideo)
ucid IN (#{args}) ORDER BY ucid, published DESC", user.subscriptions, as: ChannelVideo)
end
videos.sort_by! { |video| video.published }.reverse!
@ -2540,7 +2543,7 @@ get "/api/v1/channels/:ucid" do |env|
json.field "authorThumbnails" do
json.array do
qualities = [32, 48, 76, 100, 512]
qualities = [32, 48, 76, 100, 176, 512]
qualities.each do |quality|
json.object do
@ -2604,102 +2607,102 @@ end
["/api/v1/channels/:ucid/videos", "/api/v1/channels/videos/:ucid"].each do |route|
get route do |env|
ucid = env.params.url["ucid"]
page = env.params.query["page"]?.try &.to_i?
page ||= 1
ucid = env.params.url["ucid"]
page = env.params.query["page"]?.try &.to_i?
page ||= 1
client = make_client(YT_URL)
client = make_client(YT_URL)
if !ucid.match(/UC[a-zA-Z0-9_-]{22}/)
rss = client.get("/feeds/videos.xml?user=#{ucid}")
rss = XML.parse_html(rss.body)
if !ucid.match(/UC[a-zA-Z0-9_-]{22}/)
rss = client.get("/feeds/videos.xml?user=#{ucid}")
rss = XML.parse_html(rss.body)
ucid = rss.xpath_node("//feed/channelid")
if !ucid
env.response.content_type = "application/json"
next {"error" => "User does not exist"}.to_json
end
ucid = ucid.content
author = rss.xpath_node("//author/name").not_nil!.content
next env.redirect "/feed/channel/#{ucid}"
else
rss = client.get("/feeds/videos.xml?channel_id=#{ucid}")
rss = XML.parse_html(rss.body)
ucid = rss.xpath_node("//feed/channelid")
if !ucid
error_message = "User does not exist."
next templated "error"
end
ucid = ucid.content
author = rss.xpath_node("//author/name").not_nil!.content
end
# Auto-generated channels
# https://support.google.com/youtube/answer/2579942
if author.ends_with?(" - Topic") ||
{"Popular on YouTube", "Music", "Sports", "Gaming"}.includes? author
auto_generated = true
end
videos = [] of SearchVideo
2.times do |i|
url = produce_channel_videos_url(ucid, page * 2 + (i - 1), auto_generated: auto_generated)
response = client.get(url)
json = JSON.parse(response.body)
if json["content_html"]? && !json["content_html"].as_s.empty?
document = XML.parse_html(json["content_html"].as_s)
nodeset = document.xpath_nodes(%q(//li[contains(@class, "feed-item-container")]))
if auto_generated
videos += extract_videos(nodeset)
else
videos += extract_videos(nodeset, ucid)
ucid = rss.xpath_node("//feed/channelid")
if !ucid
env.response.content_type = "application/json"
next {"error" => "User does not exist"}.to_json
end
ucid = ucid.content
author = rss.xpath_node("//author/name").not_nil!.content
next env.redirect "/feed/channel/#{ucid}"
else
break
rss = client.get("/feeds/videos.xml?channel_id=#{ucid}")
rss = XML.parse_html(rss.body)
ucid = rss.xpath_node("//feed/channelid")
if !ucid
error_message = "User does not exist."
next templated "error"
end
ucid = ucid.content
author = rss.xpath_node("//author/name").not_nil!.content
end
end
result = JSON.build do |json|
json.array do
videos.each do |video|
json.object do
json.field "title", video.title
json.field "videoId", video.id
# Auto-generated channels
# https://support.google.com/youtube/answer/2579942
if author.ends_with?(" - Topic") ||
{"Popular on YouTube", "Music", "Sports", "Gaming"}.includes? author
auto_generated = true
end
if auto_generated
json.field "author", video.author
json.field "authorId", video.ucid
json.field "authorUrl", "/channel/#{video.ucid}"
else
json.field "author", author
json.field "authorId", ucid
json.field "authorUrl", "/channel/#{ucid}"
videos = [] of SearchVideo
2.times do |i|
url = produce_channel_videos_url(ucid, page * 2 + (i - 1), auto_generated: auto_generated)
response = client.get(url)
json = JSON.parse(response.body)
if json["content_html"]? && !json["content_html"].as_s.empty?
document = XML.parse_html(json["content_html"].as_s)
nodeset = document.xpath_nodes(%q(//li[contains(@class, "feed-item-container")]))
if auto_generated
videos += extract_videos(nodeset)
else
videos += extract_videos(nodeset, ucid)
end
else
break
end
end
result = JSON.build do |json|
json.array do
videos.each do |video|
json.object do
json.field "title", video.title
json.field "videoId", video.id
if auto_generated
json.field "author", video.author
json.field "authorId", video.ucid
json.field "authorUrl", "/channel/#{video.ucid}"
else
json.field "author", author
json.field "authorId", ucid
json.field "authorUrl", "/channel/#{ucid}"
end
json.field "videoThumbnails" do
generate_thumbnails(json, video.id)
end
json.field "description", video.description
json.field "descriptionHtml", video.description_html
json.field "viewCount", video.views
json.field "published", video.published.epoch
json.field "publishedText", "#{recode_date(video.published)} ago"
json.field "lengthSeconds", video.length_seconds
end
json.field "videoThumbnails" do
generate_thumbnails(json, video.id)
end
json.field "description", video.description
json.field "descriptionHtml", video.description_html
json.field "viewCount", video.views
json.field "published", video.published.epoch
json.field "publishedText", "#{recode_date(video.published)} ago"
json.field "lengthSeconds", video.length_seconds
end
end
end
end
env.response.content_type = "application/json"
result
end
env.response.content_type = "application/json"
result
end
end
get "/api/v1/search" do |env|
@ -2722,13 +2725,15 @@ get "/api/v1/search" do |env|
features ||= [] of String
# TODO: Support other content types
content_type = "video"
content_type = env.params.query["type"]?.try &.downcase
content_type ||= "video"
env.response.content_type = "application/json"
begin
search_params = produce_search_params(sort_by, date, content_type, duration, features)
rescue ex
env.response.status_code = 400
next JSON.build do |json|
json.object do
json.field "error", ex.message
@ -2739,26 +2744,79 @@ get "/api/v1/search" do |env|
response = JSON.build do |json|
json.array do
count, search_results = search(query, page, search_params).as(Tuple)
search_results.each do |video|
search_results.each do |item|
json.object do
json.field "title", video.title
json.field "videoId", video.id
case item
when SearchVideo
json.field "type", "video"
json.field "title", item.title
json.field "videoId", item.id
json.field "author", video.author
json.field "authorId", video.ucid
json.field "authorUrl", "/channel/#{video.ucid}"
json.field "author", item.author
json.field "authorId", item.ucid
json.field "authorUrl", "/channel/#{item.ucid}"
json.field "videoThumbnails" do
generate_thumbnails(json, video.id)
json.field "videoThumbnails" do
generate_thumbnails(json, item.id)
end
json.field "description", item.description
json.field "descriptionHtml", item.description_html
json.field "viewCount", item.views
json.field "published", item.published.epoch
json.field "publishedText", "#{recode_date(item.published)} ago"
json.field "lengthSeconds", item.length_seconds
json.field "liveNow", item.live_now
when SearchPlaylist
json.field "type", "playlist"
json.field "title", item.title
json.field "playlistId", item.id
json.field "author", item.author
json.field "authorId", item.ucid
json.field "authorUrl", "/channel/#{item.ucid}"
json.field "videos" do
json.array do
item.videos.each do |video|
json.object do
json.field "title", video.title
json.field "videoId", video.id
json.field "lengthSeconds", video.length_seconds
json.field "videoThumbnails" do
generate_thumbnails(json, video.id)
end
end
end
end
end
when SearchChannel
json.field "type", "channel"
json.field "author", item.author
json.field "authorId", item.ucid
json.field "authorUrl", "/channel/#{item.ucid}"
json.field "authorThumbnails" do
json.array do
qualities = [32, 48, 76, 100, 176, 512]
qualities.each do |quality|
json.object do
json.field "url", item.author_thumbnail.gsub("=s176-", "=s#{quality}-")
json.field "width", quality
json.field "height", quality
end
end
end
end
json.field "subCount", item.subscriber_count
json.field "videoCount", item.video_count
json.field "description", item.description
json.field "descriptionHtml", item.description_html
end
json.field "description", video.description
json.field "descriptionHtml", video.description_html
json.field "viewCount", video.views
json.field "published", video.published.epoch
json.field "publishedText", "#{recode_date(video.published)} ago"
json.field "lengthSeconds", video.length_seconds
end
end
end

View file

@ -196,8 +196,14 @@ def html_to_content(description_html)
end
def extract_videos(nodeset, ucid = nil)
videos = extract_items(nodeset, ucid)
videos.select! { |item| !item.is_a?(SearchChannel | SearchPlaylist) }
videos.map { |video| video.as(SearchVideo) }
end
def extract_items(nodeset, ucid = nil)
# TODO: Make this a 'common', so it makes more sense to be used here
videos = [] of SearchVideo
items = [] of SearchItem
nodeset.each do |node|
anchor = node.xpath_node(%q(.//h3[contains(@class,"yt-lockup-title")]/a))
@ -209,78 +215,147 @@ def extract_videos(nodeset, ucid = nil)
next
end
case node.xpath_node(%q(.//div)).not_nil!["class"]
when .includes? "yt-lockup-playlist"
next
when .includes? "yt-lockup-channel"
next
end
title = anchor.content.strip
id = anchor["href"].lchop("/watch?v=")
if ucid
anchor = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-byline")]/a))
if !anchor
author = ""
author_id = ""
else
anchor = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-byline")]/a))
if !anchor
next
end
author = anchor.content
author_id = anchor["href"].split("/")[-1]
end
metadata = node.xpath_nodes(%q(.//div[contains(@class,"yt-lockup-meta")]/ul/li))
if metadata.empty?
anchor = node.xpath_node(%q(.//h3[contains(@class, "yt-lockup-title")]/a))
if !anchor
next
end
begin
published = decode_date(metadata[0].content.lchop("Streamed ").lchop("Starts "))
rescue ex
end
begin
published ||= Time.epoch(metadata[0].xpath_node(%q(.//span)).not_nil!["data-timestamp"].to_i64)
rescue ex
end
published ||= Time.now
begin
view_count = metadata[0].content.rchop(" watching").delete(",").try &.to_i64?
rescue ex
end
begin
view_count ||= metadata.try &.[1].content.delete("No views,").try &.to_i64?
rescue ex
end
view_count ||= 0_i64
title = anchor.content.strip
id = anchor["href"]
description_html = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-description")]))
description_html, description = html_to_content(description_html)
length_seconds = node.xpath_node(%q(.//span[@class="video-time"]))
if length_seconds
length_seconds = decode_length_seconds(length_seconds.content)
else
length_seconds = -1
end
case node.xpath_node(%q(.//div)).not_nil!["class"]
when .includes? "yt-lockup-playlist"
plid = HTTP::Params.parse(URI.parse(id).query.not_nil!)["list"]
videos << SearchVideo.new(
title,
id,
author,
author_id,
published,
view_count,
description,
description_html,
length_seconds,
)
anchor = node.xpath_node(%q(.//ul[@class="yt-lockup-meta-info"]/li/a))
if anchor
video_count = anchor.content.match(/View full playlist \((?<count>\d+)/).try &.["count"].to_i?
end
video_count ||= 0
videos = [] of SearchPlaylistVideo
node.xpath_nodes(%q(.//ol[contains(@class, "yt-lockup-playlist-items")]/li)).each do |video|
anchor = video.xpath_node(%q(.//a))
if anchor
video_title = anchor.content
id = HTTP::Params.parse(URI.parse(anchor["href"]).query.not_nil!)["v"]
end
video_title ||= ""
id ||= ""
anchor = video.xpath_node(%q(.//span/span))
if anchor
length_seconds = decode_length_seconds(anchor.content)
end
length_seconds ||= 0
videos << SearchPlaylistVideo.new(
video_title,
id,
length_seconds
)
end
items << SearchPlaylist.new(
title,
plid,
author,
author_id,
video_count,
videos
)
when .includes? "yt-lockup-channel"
author = title
ucid = id.split("/")[-1]
author_thumbnail = node.xpath_node(%q(.//div/span/img)).try &.["data-thumb"]?
author_thumbnail ||= node.xpath_node(%q(.//div/span/img)).try &.["src"]
author_thumbnail ||= ""
subscriber_count = node.xpath_node(%q(.//span[contains(@class, "yt-subscriber-count")])).try &.["title"].delete(",").to_i?
subscriber_count ||= 0
video_count = node.xpath_node(%q(.//ul[@class="yt-lockup-meta-info"]/li)).try &.content.split(" ")[0].delete(",").to_i?
video_count ||= 0
items << SearchChannel.new(
author,
ucid,
author_thumbnail,
subscriber_count,
video_count,
description,
description_html
)
else
id = id.lchop("/watch?v=")
metadata = node.xpath_nodes(%q(.//div[contains(@class,"yt-lockup-meta")]/ul/li))
if metadata.empty?
next
end
begin
published = decode_date(metadata[0].content.lchop("Streamed ").lchop("Starts "))
rescue ex
end
begin
published ||= Time.epoch(metadata[0].xpath_node(%q(.//span)).not_nil!["data-timestamp"].to_i64)
rescue ex
end
published ||= Time.now
begin
view_count = metadata[0].content.rchop(" watching").delete(",").try &.to_i64?
rescue ex
end
begin
view_count ||= metadata.try &.[1].content.delete("No views,").try &.to_i64?
rescue ex
end
view_count ||= 0_i64
length_seconds = node.xpath_node(%q(.//span[@class="video-time"]))
if length_seconds
length_seconds = decode_length_seconds(length_seconds.content)
else
length_seconds = -1
end
live_now = node.xpath_node(%q(.//span[contains(@class, "yt-badge-live")]))
if live_now
live_now = true
else
live_now = false
end
items << SearchVideo.new(
title,
id,
author,
author_id,
published,
view_count,
description,
description_html,
length_seconds,
live_now
)
end
end
return videos
return items
end

View file

@ -3,13 +3,17 @@ def crawl_videos(db)
random = Random.new
search(random.base64(3)).as(Tuple)[1].each do |video|
ids << video.id
if video.is_a?(SearchVideo)
ids << video.id
end
end
loop do
if ids.empty?
search(random.base64(3)).as(Tuple)[1].each do |video|
ids << video.id
if video.is_a?(SearchVideo)
ids << video.id
end
end
end

View file

@ -9,9 +9,43 @@ class SearchVideo
description: String,
description_html: String,
length_seconds: Int32,
live_now: Bool,
})
end
class SearchPlaylistVideo
add_mapping({
title: String,
id: String,
length_seconds: Int32,
})
end
class SearchPlaylist
add_mapping({
title: String,
id: String,
author: String,
ucid: String,
video_count: Int32,
videos: Array(SearchPlaylistVideo),
})
end
class SearchChannel
add_mapping({
author: String,
ucid: String,
author_thumbnail: String,
subscriber_count: Int32,
video_count: Int32,
description: String,
description_html: String,
})
end
alias SearchItem = SearchVideo | SearchChannel | SearchPlaylist
def channel_search(query, page, channel)
client = make_client(YT_URL)
@ -26,7 +60,7 @@ def channel_search(query, page, channel)
end
if !canonical
return 0, [] of SearchVideo
return 0, [] of SearchItem
end
ucid = canonical["href"].split("/")[-1]
@ -40,31 +74,31 @@ def channel_search(query, page, channel)
nodeset = document.xpath_nodes(%q(//li[contains(@class, "feed-item-container")]))
count = nodeset.size
videos = extract_videos(nodeset)
items = extract_items(nodeset)
else
count = 0
videos = [] of SearchVideo
items = [] of SearchItem
end
return count, videos
return count, items
end
def search(query, page = 1, search_params = produce_search_params(content_type: "video"))
def search(query, page = 1, search_params = produce_search_params(content_type: "all"))
client = make_client(YT_URL)
if query.empty?
return {0, [] of SearchVideo}
return {0, [] of SearchItem}
end
html = client.get("/results?q=#{URI.escape(query)}&page=#{page}&sp=#{search_params}&disable_polymer=1").body
if html.empty?
return {0, [] of SearchVideo}
return {0, [] of SearchItem}
end
html = XML.parse_html(html)
nodeset = html.xpath_nodes(%q(//ol[@class="item-section"]/li))
videos = extract_videos(nodeset)
items = extract_items(nodeset)
return {nodeset.size, videos}
return {nodeset.size, items}
end
def produce_search_params(sort : String = "relevance", date : String = "", content_type : String = "",
@ -110,8 +144,10 @@ def produce_search_params(sort : String = "relevance", date : String = "", conte
"\x10\x04"
when "show"
"\x10\x05"
else
when "all"
""
else
"\x10\x01"
end
body += case duration

View file

@ -37,8 +37,8 @@
<% videos.each_slice(4) do |slice| %>
<div class="pure-g">
<% slice.each do |video| %>
<%= rendered "components/video" %>
<% slice.each do |item| %>
<%= rendered "components/item" %>
<% end %>
</div>
<% end %>

View file

@ -0,0 +1,54 @@
<div class="pure-u-1 pure-u-md-1-4">
<div class="h-box">
<% case item when %>
<% when SearchChannel %>
<a style="width:100%;" href="/channel/<%= item.ucid %>">
<% if env.get?("user") && env.get("user").as(User).preferences.thin_mode %>
<% else %>
<center>
<img style="width:56.25%;" src="/ggpht<%= URI.parse(item.author_thumbnail).full_path %>"/>
</center>
<% end %>
<p><%= item.author %></p>
</a>
<p><%= number_with_separator(item.subscriber_count) %> subscribers</p>
<h5><%= item.description_html %></h5>
<% when SearchPlaylist %>
<a style="width:100%;" href="/playlist?list=<%= item.id %>">
<% if env.get?("user") && env.get("user").as(User).preferences.thin_mode %>
<% else %>
<img style="width:100%;" src="/vi/<%= item.videos[0].id %>/mqdefault.jpg"/>
<% end %>
<p><%= item.title %></p>
</a>
<p>
<b><a style="width:100%;" href="/channel/<%= item.ucid %>"><%= item.author %></a></b>
</p>
<p><%= number_with_separator(item.video_count) %> videos</p>
<p>PLAYLIST</p>
<% else %>
<% if item.responds_to?(:playlists) && !item.playlists.empty? %>
<% params = "&list=#{item.playlists[0]}" %>
<% else %>
<% params = nil %>
<% end %>
<a style="width:100%;" href="/watch?v=<%= item.id %><%= params %>">
<% if env.get?("user") && env.get("user").as(User).preferences.thin_mode %>
<% else %>
<img style="width:100%;" src="/vi/<%= item.id %>/mqdefault.jpg"/>
<% end %>
<p><%= item.title %></p>
</a>
<% if item.responds_to?(:live_now) && item.live_now %>
<p>LIVE</p>
<% end %>
<p>
<b><a style="width:100%;" href="/channel/<%= item.ucid %>"><%= item.author %></a></b>
</p>
<% if Time.now - item.published > 1.minute %>
<h5>Shared <%= recode_date(item.published) %> ago</h5>
<% end %>
<% end %>
</div>
</div>

View file

@ -1,23 +0,0 @@
<div class="pure-u-1 pure-u-md-1-4">
<div class="h-box">
<% if video.responds_to?(:playlists) && !video.playlists.empty? %>
<% params = "&list=#{video.playlists[0]}" %>
<% else %>
<% params = nil %>
<% end %>
<a style="width:100%;" href="/watch?v=<%= video.id %><%= params %>">
<% if env.get?("user") && env.get("user").as(User).preferences.thin_mode %>
<% else %>
<img style="width:100%;" src="/vi/<%= video.id %>/mqdefault.jpg"/>
<% end %>
<p><%= video.title %></p>
</a>
<p>
<b><a style="width:100%;" href="/channel/<%= video.ucid %>"><%= video.author %></a></b>
</p>
<% if Time.now - video.published > 1.minute %>
<h5>Shared <%= recode_date(video.published) %> ago</h5>
<% end %>
</div>
</div>

View file

@ -4,8 +4,8 @@
<% top_videos.each_slice(4) do |slice| %>
<div class="pure-g">
<% slice.each do |video| %>
<%= rendered "components/video" %>
<% slice.each do |item| %>
<%= rendered "components/item" %>
<% end %>
</div>
<% end %>

View file

@ -26,8 +26,8 @@
<% videos.each_slice(4) do |slice| %>
<div class="pure-g">
<% slice.each do |video| %>
<%= rendered "components/video" %>
<% slice.each do |item| %>
<%= rendered "components/item" %>
<% end %>
</div>
<% end %>

View file

@ -4,8 +4,8 @@
<% videos.each_slice(4) do |slice| %>
<div class="pure-g">
<% slice.each do |video| %>
<%= rendered "components/video" %>
<% slice.each do |item| %>
<%= rendered "components/item" %>
<% end %>
</div>
<% end %>

View file

@ -25,8 +25,8 @@
<% notifications.each_slice(4) do |slice| %>
<div class="pure-g">
<% slice.each do |video| %>
<%= rendered "components/video" %>
<% slice.each do |item| %>
<%= rendered "components/item" %>
<% end %>
</div>
<% end %>
@ -37,8 +37,8 @@
<% videos.each_slice(4) do |slice| %>
<div class="pure-g">
<% slice.each do |video| %>
<%= rendered "components/video" %>
<% slice.each do |item| %>
<%= rendered "components/item" %>
<% end %>
</div>
<% end %>