GitGram — hobbesgram-migrate.php — GitGram
Hobbes_OS2_Archive / main / v1.05 / hobbesgram-migrate.php8,916 B↓ Raw
<?php
/**
 * hobbesgram-migrate.php — Data migration & index rebuild script
 *
 * Rebuilds data/index/catalog.json and data/index/search.json in a single
 * pass through data/files/, writing each index file exactly once.
 * Run this after upgrading to a version that adds the file catalog, or after
 * any large SFTP/FTP bulk import.
 *
 * CLI:  php hobbesgram-migrate.php
 * Web:  https://yoursite.com/hobbesgram-migrate.php?run=yes
 *
 * DELETE THIS FILE from the server after running.
 */

// ── Bootstrap ──────────────────────────────────────────────────────────────────
// Marker constant set before the includes are loaded.
// NOTE(review): presumably the included files check HOBBES to refuse direct access — confirm.
define('HOBBES', true);

// Pull in config.php first, then includes/storage.php and includes/search.php.
foreach (['/config.php', '/includes/storage.php', '/includes/search.php'] as $relative_path) {
    require_once __DIR__ . $relative_path;
}

// A full rebuild can be slow and memory-hungry: allow 10 minutes and 256 MB.
ini_set('memory_limit', '256M');
set_time_limit(600);

// True when started from the command line rather than through a web server.
$is_cli = (PHP_SAPI === 'cli');

// ── Web guard ──────────────────────────────────────────────────────────────────
// A browser request must carry ?run=yes; the command line needs no confirmation.
$run_confirmed = $is_cli || ($_GET['run'] ?? '') === 'yes';

if (!$run_confirmed) {
    // Show a static confirmation page describing the job, then stop.
    header('Content-Type: text/html; charset=utf-8');
    echo <<<'HTML'
<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">
<title>Hobbesgram Migration</title>
<style>
body{font-family:monospace;font-size:14px;max-width:700px;margin:40px auto;padding:0 20px;}
h2{border-bottom:2px solid #000;padding-bottom:6px;}
.warn{background:#fffbe6;border:1px solid #cca800;padding:10px 14px;margin:16px 0;}
.btn{display:inline-block;background:#000080;color:#fff;padding:8px 20px;text-decoration:none;font-weight:bold;}
</style></head><body>
<h2>Hobbesgram &mdash; Migration / Index Rebuild</h2>
<div class="warn">
<strong>What this does:</strong><br>
Scans every file in <code>data/files/</code> and rebuilds:<br>
&bull; <code>data/index/catalog.json</code> (fast listing index &mdash; fixes home/browse timeouts)<br>
&bull; <code>data/index/search.json</code> (full-text search index)<br><br>
Safe to run multiple times. Does <strong>not</strong> modify any file metadata.
</div>
<p><a href="?run=yes" class="btn">Run Migration Now</a></p>
<p style="color:#666;font-size:12px;">Delete this file from the server after use.</p>
</body></html>
HTML;
    exit;
}

// ── Output helper ──────────────────────────────────────────────────────────────
if (!$is_cli) {
    // Web runs report progress as plain text.
    header('Content-Type: text/plain; charset=utf-8');

    // Unwind every active output buffer, not just the innermost one, so that
    // progress lines reach the browser as they are printed. Stop as soon as a
    // buffer refuses to be removed, otherwise this loop would never end.
    while (ob_get_level() > 0) {
        if (!ob_end_flush()) {
            break;
        }
    }
}

/**
 * Print one line of progress output, terminated by a newline.
 *
 * Unless the IS_CLI constant is defined and truthy, the system write buffer
 * is flushed afterwards so the line is pushed out immediately.
 *
 * @param string $msg Text to print; defaults to an empty line.
 */
function out(string $msg = ''): void {
    print "{$msg}\n";

    $running_in_cli = defined('IS_CLI') && IS_CLI;
    if ($running_in_cli) {
        return;
    }
    @flush();
}

// out() consults IS_CLI to decide whether to flush; it is only defined for CLI runs.
if ($is_cli) {
    define('IS_CLI', true);
}

out('=== Hobbesgram Migration / Index Rebuild ===');
out('Started: ' . date('Y-m-d H:i:s'));
out();

// ── Step 1: Verify / create index directory ────────────────────────────────────
out('-- Step 1: Checking directories');
$index_dir = dirname(CATALOG_FILE);

if (is_dir($index_dir)) {
    out("  OK: {$index_dir}");
} elseif (mkdir($index_dir, 0755, true)) {
    // Directory was missing; it was created together with any missing parents.
    out("  Created: {$index_dir}");
} else {
    out("  ERROR: Could not create {$index_dir}");
    exit(1);
}

// Without the source directory there is nothing to scan, so abort.
if (!is_dir(FILES_DIR)) {
    out('  ERROR: FILES_DIR does not exist: ' . FILES_DIR);
    exit(1);
}
out();

// ── Step 2: Scan all file JSON records (single pass) ─────────────────────────
out('-- Step 2: Scanning data/files/');

// glob() yields false on error; treat that (and "no matches") as an empty list.
$all_files = glob(FILES_DIR . '/*.json');
if (!$all_files) {
    $all_files = [];
}
$total = count($all_files);
out('  Found ' . $total . ' JSON record(s)');
out();

// Accumulators filled while scanning the records.
$catalog      = [];   // id => lightweight record
$search_index = [];   // keyword => [ids]
$counts       = array_fill_keys(['approved', 'pending', 'skipped', 'total_size'], 0);

// Stop words (must match search_tokenize() in search.php)
$stop_words = explode(' ', 'the a an and or in of to for is it at as by');

/**
 * Split free text into unique, lower-cased search keywords.
 *
 * Every character other than a-z, 0-9 and whitespace is treated as a
 * separator. Tokens shorter than three characters and tokens listed in
 * $stop are dropped.
 *
 * @param string   $text Raw text to tokenize.
 * @param string[] $stop Lower-case stop words to exclude.
 * @return string[] Unique keywords in first-seen order, indexed 0..n-1.
 */
function migrate_tokenize(string $text, array $stop): array {
    $text  = strtolower($text);
    // preg_replace() returns null and preg_split() returns false on a PCRE
    // error; fall back to "no tokens" instead of passing those values on.
    $text  = preg_replace('/[^a-z0-9\s]/', ' ', $text) ?? '';
    $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];
    $words = array_filter($words, fn($w) => strlen($w) >= 3 && !in_array($w, $stop, true));
    // array_unique() keeps the original keys, so re-index AFTER deduplicating
    // to hand back a gap-free list.
    return array_values(array_unique($words));
}

// Print a progress dot roughly every 2% of the records; max(1, ...) keeps the
// modulo below valid when there are fewer than 50 records.
$dot_every = max(1, (int)($total / 50));
$processed = 0;
if (!$is_cli) out('  Progress (one dot per ~2%):');

foreach ($all_files as $json_path) {
    $m = storage_read($json_path);

    // Records that could not be read, or that carry no id, cannot be indexed.
    if (!$m || empty($m['id'])) {
        $counts['skipped']++;
        continue;
    }

    // ── Catalog entry ──────────────────────────────────────────────────────
    // Every record with an id goes into the catalog, approved or not; only
    // the fields named in CATALOG_FIELDS are copied (missing ones become null).
    $entry = [];
    foreach (CATALOG_FIELDS as $k) {
        $entry[$k] = $m[$k] ?? null;
    }
    $catalog[$m['id']] = $entry;

    // ── Stats ──────────────────────────────────────────────────────────────
    if (!empty($m['approved'])) {
        $counts['approved']++;
        $counts['total_size'] += (int)($m['size'] ?? 0);

        // ── Search index (approved files only) ────────────────────────────
        // NOTE(review): assumes each field below is a scalar string; an
        // array-valued field would be imploded as "Array" — confirm against
        // the record schema.
        $corpus = implode(' ', array_filter([
            $m['title']         ?? '',
            $m['description']   ?? '',
            $m['author']        ?? '',
            $m['version']       ?? '',
            $m['tags']          ?? '',
            $m['category']      ?? '',
            $m['original_name'] ?? '',
            $m['uploader']      ?? '',
            $m['os2_version']   ?? '',
            $m['requirements']  ?? '',
            $m['license']       ?? '',
        ]));
        foreach (migrate_tokenize($corpus, $stop_words) as $kw) {
            $search_index[$kw][] = $m['id'];
        }
    } else {
        $counts['pending']++;
    }

    $processed++;
    if ($processed % $dot_every === 0) {
        echo '.';
        if (!$is_cli) @flush();
    }
}

// Deduplicate the id list of every keyword (the same id can be appended more
// than once, e.g. when two record files share an id). array_unique() keeps
// the original keys, so re-index AFTER deduplicating: a list with gaps in its
// keys is no longer a plain 0..n-1 list and — presumably, given the .json
// target written by storage_write() — would be serialized as a JSON object
// instead of a JSON array.
foreach ($search_index as &$ids) {
    $ids = array_values(array_unique($ids));
}
unset($ids); // drop the dangling reference left by the by-reference loop

out();
out();

// ── Step 3: Write catalog ──────────────────────────────────────────────────────
out('-- Step 3: Writing catalog');

// Remember any failed write so the script can end with a non-zero status,
// matching the exit(1) used for the directory errors in Step 1.
$write_failed = false;

// NOTE(review): the meaning of the third argument (false) is defined by
// storage_write() in includes/storage.php, which is not visible here.
if (storage_write(CATALOG_FILE, $catalog, false)) {
    // filesize() results are cached per path; drop the cached entry so the
    // size reported is the size of the file that was just written.
    clearstatcache(true, CATALOG_FILE);
    $size = round(filesize(CATALOG_FILE) / 1024, 1);
    out("  OK: " . CATALOG_FILE . " ({$size} KB, " . count($catalog) . " entries)");
} else {
    $write_failed = true;
    out('  ERROR: Could not write ' . CATALOG_FILE);
    out('  Check directory permissions on ' . dirname(CATALOG_FILE));
}
out();

// ── Step 4: Write search index ─────────────────────────────────────────────────
out('-- Step 4: Writing search index');
if (storage_write(SEARCH_FILE, $search_index)) {
    clearstatcache(true, SEARCH_FILE);
    $size = round(filesize(SEARCH_FILE) / 1024, 1);
    out("  OK: " . SEARCH_FILE . " ({$size} KB, " . count($search_index) . " keywords)");
} else {
    $write_failed = true;
    out('  ERROR: Could not write ' . SEARCH_FILE);
}
out();

// ── Step 5: Report ─────────────────────────────────────────────────────────────
out('-- Results');
out(sprintf('  Total records scanned : %d', $total));
out(sprintf('  Approved (in catalog) : %d', $counts['approved']));
out(sprintf('  Pending               : %d', $counts['pending']));
out(sprintf('  Skipped (bad/empty)   : %d', $counts['skipped']));
out(sprintf('  Total approved size   : %s', format_bytes($counts['total_size'])));
out(sprintf('  Search keywords       : %d', count($search_index)));
out();
out('Finished: ' . date('Y-m-d H:i:s'));
out();
out('DELETE this file from the server: ' . basename(__FILE__));

// The full report is still printed above; only the exit status changes, so
// shell callers can detect that an index file was not written.
if ($write_failed) {
    exit(1);
}

// ── format_bytes helper (format_size may not be loaded) ───────────────────────
/**
 * Render a byte count as a short human-readable string.
 *
 * Uses binary multiples: GB with up to two decimals, MB and KB with up to
 * one decimal, and plain bytes below 1024.
 *
 * @param int $bytes Size in bytes.
 * @return string For example "1.5 MB" or "512 B".
 */
function format_bytes(int $bytes): string {
    // [divisor in bytes, decimal places, suffix], largest unit first.
    $units = [
        [1073741824, 2, ' GB'],
        [1048576,    1, ' MB'],
        [1024,       1, ' KB'],
    ];

    foreach ($units as [$divisor, $decimals, $suffix]) {
        if ($bytes >= $divisor) {
            return round($bytes / $divisor, $decimals) . $suffix;
        }
    }

    return $bytes . ' B';
}
Ready
GitGram