-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextract2.rb
executable file
·61 lines (53 loc) · 1.29 KB
/
extract2.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env ruby
require 'json'
require 'htmlentities'
# Calculate the top 10 XKCD comics cited by the largest number of distinct
# people and print the result to stdout.
#
# Inputs (read from the current working directory):
#   xkcd.jsonl - one JSON object per line; the "img" and "id" fields are used
#   raw.jsonl  - one JSON object per line; the "text" and "by" fields are used
# Output: a Hash of comic id => number of distinct "by" values, highest first.

# Links to a comic page: xkcd.com/<id>, www.xkcd.com/<id> and m.xkcd.com/<id>.
# The lookbehinds reject other subdomains (e.g. what-if.xkcd.com/<n> or
# blog.xkcd.com/<year>/...), whose numbers are not comic ids. A name that is
# merely glued to "xkcd.com" without a dot is still accepted, as before.
COMIC_URL = %r{(?:(?<![a-z0-9.\-])(?:www|m)\.|(?<![a-z0-9\-]\.))xkcd\.com/([0-9]+)}i

# Various forms of "xkcd #1234".
COMIC_MENTION = /xkcd [^a-z0-9]*([0-9]+)/i

# Direct image links. The dot before the extension is required so the lazy
# match cannot stop early on a file name that merely contains "png", "gif"
# or "jpg" (which would make the lookup by full URL fail).
COMIC_IMAGE = %r{imgs\.xkcd\.com/.*?\.(?:png|gif|jpg)}i

# Yields every parsed JSON object of the JSON Lines file at +path+.
# Blank lines are skipped because JSON.parse raises on an empty document
# (e.g. a trailing empty line at the end of the file).
def each_json_record(path)
  File.foreach(path) do |line|
    next if line.strip.empty?

    yield JSON.parse(line)
  end
end

# Returns the distinct comic ids referenced by +text+, in order of first
# appearance per pattern (page links, then "xkcd #n" mentions, then images).
# +img_index+ maps a full "https://imgs.xkcd.com/..." URL to its comic id.
def cited_comic_ids(text, img_index)
  ids = text.scan(COMIC_URL).flatten.map(&:to_i)
  ids.concat(text.scan(COMIC_MENTION).flatten.map(&:to_i))
  text.scan(COMIC_IMAGE).each do |img|
    # The scheme is not part of the match, so the https:// form used as the
    # index key is rebuilt here. Images missing from the index are ignored.
    id = img_index["https://#{img}"]
    ids << id if id
  end
  ids.uniq
end

# Load xkcd.jsonl: image URL => comic id.
img_index = {}
each_json_record("xkcd.jsonl") { |comic| img_index[comic["img"]] = comic["id"] }

# Load raw.jsonl: comic id => { author => true }, so each author counts once
# per comic no matter how many times they cite it.
decoder = HTMLEntities.new # one decoder reused for every record
citers = Hash.new { |hash, id| hash[id] = {} }
each_json_record("raw.jsonl") do |item|
  text = decoder.decode(item["text"])
  cited_comic_ids(text, img_index).each { |id| citers[id][item["by"]] = true }
end

# Sort and emit top 10
counts = citers.map { |id, people| [id, people.length] }
puts counts.sort_by { |_id, count| count }.last(10).reverse.to_h