<?php
* Bash.org scrapper.
*
* One-liners: SELECT * FROM quotes WHERE quote NOT LIKE '%' || CHAR(13) || '%'
*
* @author Adam Klvač <adam@klva.cz>
*/
define('BASH_BROWSE_URL', 'http://bash.org/?browse&p=%d');
libxml_use_internal_errors(true);
$db = new PDO('sqlite:bash.db', null, null, [
PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_OBJ,
]);
$version = (int) $db->query('PRAGMA user_version')->fetchColumn();
switch (true) {
case $version < 1:
printf("DB migration: 0\n");
$db->exec('CREATE TABLE quotes (id INTEGER PRIMARY KEY, score INTEGER, quote TEXT);');
$db->exec('CREATE TABLE config (key TEXT PRIMARY KEY, value TEXT);');
$db->exec('PRAGMA user_version = 1;');
break;
}
$insert = $db->prepare("INSERT OR IGNORE INTO quotes (id, score, quote) VALUES (?, ?, ?)");
for ($page = get_page();; set_page($page), $page++) {
$url = sprintf(BASH_BROWSE_URL, $page);
printf("GET %s\n", $url);
$file = file_get_contents($url);
while (($file = file_get_contents($url)) === false) {
sleep(60);
}
$dom = new DOMDocument;
if (!$dom->loadHTML(mb_convert_encoding($file, 'ISO-8859-1', 'UTF-8'))) {
exit(1);
}
$xPath = new DOMXPath($dom);
$quotes = [];
foreach ($xPath->query('//p[@class="quote"]') as $element) {
if (!preg_match('~^#(?<id>\d+) \+\((?<score>-?\d+)\)- \[X]$~', $element->nodeValue, $matches)) {
fprintf(STDERR, "p.quote: preg_match() failed [%s]\n", $element->nodeValue);
exit(2);
}
$quotes[] = [
'id' => (int) $matches['id'],
'score' => (int) $matches['score'],
];
}
if (count($quotes) === 0) {
fprintf(STDERR, "no quotes\n");
exit(3);
}
$i = 0;
foreach ($xPath->query('//p[@class="qt"]') as $element) {
if ($i === count($quotes)) {
fprintf(STDERR, "p.qt: count() exceeded\n");
exit(4);
}
$text = $element->nodeValue;
$quotes[$i]['quote'] = $text;
$i++;
}
if ($i !== count($quotes)) {
fprintf(STDERR, "count mismatch\n");
exit(5);
}
foreach ($quotes as $quote) {
$insert->execute([
$quote['id'],
$quote['score'],
$quote['quote'],
]);
}
if (strpos($file, '"qa">End<') === false) {
printf("Done.\n");
exit;
}
sleep(15);
}
function get_page(): int
{
global $db;
return $db->query("SELECT value FROM config WHERE key = 'page'")->fetchColumn() ?: 1;
}
function set_page(int $page): void
{
global $db;
$db->exec("INSERT OR REPLACE INTO config VALUES ('page', $page);");
}