-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrubyquest.pl
executable file
·98 lines (85 loc) · 3.33 KB
/
rubyquest.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/perl
# The very definition of a oneshot; I wanted to combine RubyQuest into a single
# page with the 4chan clutter removed, which only needs doing once!
# I scraped a list of thread archive links from the tagged list in the link
# below, put them in a file, and fed that to wget, which is why the files are
# all named index.html.\d+.
# Some header crap
print <<'END';
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<link rel="stylesheet" href="http://stuff.veekun.com/reset.css"/>
<style type="text/css">
body { margin: 1em; }
#title { font-size: 3em; margin-bottom: 0.33em; font-weight: bold; color: #c00000; }
#top { font-size: 0.75em; font-style: italic; }
h1 { font-size: 2em; padding-top: 0.5em; margin-top: 0.5em; border-top: 3px double black; font-weight: bold; }
.panel { overflow: hidden; margin: 1em 0; padding-top: 1em; border-top: 1px dotted #999; }
h1 + .panel { border-top: none; }
.panel > a { display: block; margin-right: 1em; float: left; }
q { color: #8080a8; font-style: italic; }
</style>
</head>
<body>
<div id="title">RubyQuest</div>
<p id="top">Thread archives from <a href="http://4chan.thisisnotatrueending.com/archive.html?tags=Ruby">sup/tg/</a>. Also contains lots of discussion and fanart threads.</p>
END
@ARGV = ('index.html', map { "index.html.$_" } 1 .. 30);
# For tracking the current part
my $last_argv = '';
my $cur_part = 0;
# For tracking the current thread id
my $thread_id = '???';
while (<>) {
if (m{<input [^>]+ name="?resto"? [^>]+ value="(\d+)"}x) {
# No idea what this does, but it's the thread id, which we need for images
$thread_id = $1;
next;
}
# First post is split across several lines
if (m{<form.+delform}) {
# Image is on this line, but the <blockquote> is three lines down
for my $n (1 .. 3) {
$_ .= <>;
}
}
# Skip every line not containing Weaver's tripcodes
next if not m{ class="postertrip" > !!(?:vY8rj3XdjnI|t1PjjQn4qVX)< }x;
# Extract text and image filename
my ($text) = m{ <blockquote>(.+)</blockquote> }x;
my ($img, $thumb) = m{ <a \s href="?((?:images/)?\d+\.\w+)"? [^>]+ ><img [^>]+ src="?((?:thumbs/)?\d+s\.\w+)"? }x;
# Bail?
# Bail.
next if not $text and not $img;
### Clean up text
# Remove links to other posts because who cares
$text =~ s{<font class="unkfunc"><a[^>]+class="quotelink"[^>]+>>>\d+</a></font><br ?/?>}{}g;
# Make <font> tags into css because wtf
$text =~ s{<font class="unkfunc">}{<q>}g;
$text =~ s{</font>}{</q>}g;
# Better <br>s I guess eh
$text =~ s{<br ?/?>}{<br/>\n }g;
# Print header if this is a new page
if ($last_argv ne $ARGV) {
warn $ARGV;
$cur_part++;
print qq{<h1>Part $cur_part</h1>\n};
$last_argv = $ARGV;
}
print qq{<div class="panel">\n};
if ($img) {
print qq{ <a href="http://4chan.thisisnotatrueending.com/archive/$thread_id/$img">\n};
print qq{ <img src="http://4chan.thisisnotatrueending.com/archive/$thread_id/$thumb" alt=""/>\n};
print qq{ </a>\n};
}
print qq{ <p>\n};
print qq{ $text\n};
print qq{ </p>\n};
print qq{</div>\n};
}
# "Footer"
print <<'END'
</body>
</html>
END