// gekata/server.js — 518 lines, 18 KiB, JavaScript

// server.js
const express = require('express');
const { chromium } = require('playwright');
const Database = require('better-sqlite3');
// Drop the punycode module; use the WHATWG URL API plus domainToASCII instead
const { URL, domainToASCII } = require('node:url');
const app = express();
// ---------- Config ----------

/**
 * Read a non-negative integer setting from the environment.
 *
 * A missing, blank, non-numeric or negative value falls back to `fallback`
 * instead of leaking NaN into timeouts, loop bounds and TTL arithmetic.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unusable.
 * @returns {number} The parsed integer, or `fallback`.
 */
function readIntEnv(name, fallback) {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === '') return fallback;
  const parsed = Number.parseInt(raw, 10);
  return Number.isInteger(parsed) && parsed >= 0 ? parsed : fallback;
}

const PORT = readIntEnv('PORT', 3000);
const CHROMIUM_PATH = process.env.CHROMIUM_PATH || undefined;
const CACHE_TTL_SECONDS = readIntEnv('CACHE_TTL_SECONDS', 21600);
const MAX_REDIRECT_STEPS = readIntEnv('MAX_REDIRECT_STEPS', 20);
const NAV_TIMEOUT_MS = readIntEnv('NAV_TIMEOUT_MS', 30000);
const QUIET_WINDOW_MS = readIntEnv('QUIET_WINDOW_MS', 700);
const PRECHECK_MAX_REDIRECTS = readIntEnv('PRECHECK_MAX_REDIRECTS', 15);
const SQLITE_PATH = process.env.SQLITE_PATH || './cache.db';
// Debug logging is enabled only by the exact value DEBUG=1.
const DEBUG_ENABLED = String(process.env.DEBUG || '').trim() === '1';
// Command-line flags passed to Chromium on launch.
const CHROMIUM_ARGS = [
  '--no-sandbox',
  '--disable-setuid-sandbox',
  '--disable-dev-shm-usage',
  '--disable-gpu',
  '--no-zygote',
];
// ---------- Logging ----------
// Thin wrapper around console; `debug` prints only when DEBUG_ENABLED is true.
const log = {
  info(...args) {
    console.log(...args);
  },
  debug(...args) {
    if (!DEBUG_ENABLED) return;
    console.log(...args);
  },
  warn(...args) {
    console.warn(...args);
  },
  error(...args) {
    console.error(...args);
  },
};
// ---------- DB ----------
// Open the SQLite cache database at SQLITE_PATH, logging the path first.
log.info(`[BOOT] SQLite path: ${SQLITE_PATH}`);
const db = new Database(SQLITE_PATH);
// Switch the journal to write-ahead logging.
db.pragma('journal_mode = WAL');
// One row per domain: JSON-encoded scan output plus update/expiry timestamps
// (putToCache in this file writes both timestamps as Unix seconds).
db.exec(`
CREATE TABLE IF NOT EXISTS domain_cache (
domain TEXT PRIMARY KEY,
result_json TEXT NOT NULL,
final_url TEXT,
redirect_chain_json TEXT,
updated_at INTEGER NOT NULL,
ttl_at INTEGER NOT NULL
);
`);
// Prepared lookup of a cached row by domain (read by getFromCache).
const stmtSelect = db.prepare(`
SELECT result_json, final_url, redirect_chain_json, updated_at, ttl_at
FROM domain_cache WHERE domain = ?
`);
// Prepared upsert: inserts a new row or overwrites every column of the
// existing row for the same domain (used by putToCache).
const stmtUpsert = db.prepare(`
INSERT INTO domain_cache (domain, result_json, final_url, redirect_chain_json, updated_at, ttl_at)
VALUES (@domain, @result_json, @final_url, @redirect_chain_json, @updated_at, @ttl_at)
ON CONFLICT(domain) DO UPDATE SET
result_json = excluded.result_json,
final_url = excluded.final_url,
redirect_chain_json = excluded.redirect_chain_json,
updated_at = excluded.updated_at,
ttl_at = excluded.ttl_at
`);
// Register the JSON body parser for all routes.
app.use(express.json());
// ---------- Helpers ----------
/**
 * Reduce user input (a bare host or an http(s) URL) to an ASCII hostname.
 *
 * @param {*} input - Raw value, typically from the query string.
 * @returns {string|null} Lower-cased IDNA/Punycode hostname, or null when the
 *   input is not a non-empty string or no hostname can be derived from it.
 */
function normalizeDomain(input) {
  if (typeof input !== 'string' || input.length === 0) return null;

  const cleaned = input.trim().toLowerCase();
  // Inputs without an http(s) scheme are treated as a host and given one.
  const hasScheme = /^https?:\/\//i.test(cleaned);

  let ascii;
  try {
    const parsed = new URL(hasScheme ? cleaned : `https://${cleaned}`);
    // Convert to IDNA ASCII (Punycode) through the WHATWG helper.
    ascii = domainToASCII(parsed.hostname || '');
  } catch {
    // URL parsing failed: attempt a direct IDNA conversion of the raw string.
    try {
      ascii = domainToASCII(cleaned);
    } catch {
      return null;
    }
  }
  return ascii || null;
}
/**
 * Extract the lower-cased hostname from an absolute URL.
 *
 * @param {string} url - URL to parse.
 * @returns {string|null} Hostname, or null when `url` cannot be parsed.
 */
function extractDomain(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }
  return parsed.hostname.toLowerCase();
}
/**
 * Heuristic: does the URL's path end in a known downloadable-file extension?
 *
 * Only the pathname is inspected, so query string and fragment are ignored.
 *
 * @param {string} u - Absolute URL.
 * @returns {boolean} True when the path looks like a file; false otherwise or
 *   when `u` cannot be parsed.
 */
function looksLikeFilePath(u) {
  const fileExtension = /\.(?:zip|pdf|png|jpe?g|gif|webp|svg|mp4|mp3|wav|csv|xlsx?|docx?|pptx?|exe|deb|rpm|apk|tar(?:\.gz)?|7z|gz|bz2)$/i;
  let path;
  try {
    path = new URL(u).pathname;
  } catch {
    return false;
  }
  return fileExtension.test(path);
}
/**
 * Canonicalize a URL for redirect-loop detection by dropping its fragment.
 *
 * @param {string} u - URL to canonicalize.
 * @returns {string} Serialized URL without the fragment, or `u` unchanged when
 *   it cannot be parsed.
 */
function normalizeUrlForLoop(u) {
  let parsed;
  try {
    parsed = new URL(u);
  } catch {
    return u;
  }
  parsed.hash = '';
  return parsed.href;
}
// ---------- Precheck: manual redirects & classification ----------

/**
 * Cancel an unread fetch() response body so its connection can be released.
 * Only headers are inspected during the precheck, so bodies are never read.
 *
 * @param {Response} res - Response whose body should be discarded.
 * @returns {Promise<void>}
 */
async function discardResponseBody(res) {
  try {
    await res?.body?.cancel();
  } catch {
    // Best-effort cleanup: the stream may already be closed or errored.
  }
}

/**
 * Follow HTTP redirects by hand (no browser) and classify the target.
 *
 * Each step issues a GET with `redirect: 'manual'`, bounded by NAV_TIMEOUT_MS,
 * and looks only at status and headers.
 *
 * @param {string} startUrl - Absolute URL to begin from.
 * @returns {Promise<{skip: boolean, reason: (string|null), tryBrowser: boolean, finalUrl?: (string|null)}>}
 *   `skip` tells the caller the precheck already classified the URL;
 *   `reason` names the classification; `tryBrowser` hints that a browser scan
 *   may still be worthwhile; `finalUrl` is the last URL examined when known.
 */
async function precheckFollowManually(startUrl) {
  let url = startUrl;
  const visited = new Set();
  let sawHtmlHint = false;

  for (let i = 0; i < PRECHECK_MAX_REDIRECTS; i++) {
    const norm = normalizeUrlForLoop(url);
    if (visited.has(norm)) {
      log.debug(`[PRECHECK] Loop at ${norm}`);
      return { skip: true, reason: 'redirect-loop', tryBrowser: sawHtmlHint };
    }
    visited.add(norm);

    let res;
    try {
      res = await fetch(url, {
        method: 'GET',
        redirect: 'manual',
        // Bound each step so a stalled server cannot hang the precheck.
        signal: AbortSignal.timeout(NAV_TIMEOUT_MS),
      });
    } catch (e) {
      log.debug(`[PRECHECK] GET(manual) failed for ${url}: ${e?.message}`);
      return { skip: false, reason: null, tryBrowser: false };
    }

    const status = res.status;
    const ct = res.headers.get('content-type') || '';
    const cd = res.headers.get('content-disposition') || '';
    const loc = res.headers.get('location') || '';
    // Headers are all that is needed; drop the body immediately.
    await discardResponseBody(res);
    log.debug(`[PRECHECK] step=${i} status=${status} ct="${ct}" cd="${cd || '-'}" loc="${loc || '-'}"`);

    const isHtml = /\btext\/html\b/i.test(ct);
    if (isHtml) sawHtmlHint = true;
    const isAttachment = /attachment/i.test(cd);

    if (status === 403) {
      return { skip: true, reason: 'forbidden', tryBrowser: true };
    }

    if (status >= 300 && status < 400 && loc) {
      let next;
      try {
        next = new URL(loc, url).toString();
      } catch {
        // A malformed Location header must not crash the scan; stop the
        // precheck here and leave the decision to the browser stage.
        log.debug(`[PRECHECK] Unparseable Location "${loc}" at ${url}`);
        return { skip: false, reason: null, tryBrowser: false, finalUrl: url };
      }

      if (looksLikeFilePath(next) || /download|file|export/i.test(next)) {
        return { skip: true, reason: `redirect-to-file(${next})`, tryBrowser: false, finalUrl: next };
      }

      // Probe the redirect target: an HTML answer is reported as a
      // "marketing redirect" so the caller can open it in the browser.
      let isHtmlTarget = false;
      try {
        const probe = await fetch(next, {
          method: 'GET',
          redirect: 'manual',
          signal: AbortSignal.timeout(NAV_TIMEOUT_MS),
        });
        const pct = probe.headers.get('content-type') || '';
        await discardResponseBody(probe);
        isHtmlTarget = /\btext\/html\b/i.test(pct);
      } catch (e) {
        // Probe failures are non-fatal; the next loop iteration retries `next`.
        log.debug(`[PRECHECK] Probe failed for ${next}: ${e?.message}`);
      }
      if (isHtmlTarget) {
        return { skip: true, reason: `marketing-redirect(${next})`, tryBrowser: false, finalUrl: next };
      }

      url = next;
      continue;
    }

    if (isAttachment) return { skip: true, reason: 'attachment', tryBrowser: false, finalUrl: url };
    if (!isHtml && ct) return { skip: true, reason: `non-HTML (${ct})`, tryBrowser: false, finalUrl: url };
    return { skip: false, reason: null, tryBrowser: false, finalUrl: url };
  }

  log.debug(`[PRECHECK] Too many redirects >= ${PRECHECK_MAX_REDIRECTS}`);
  return { skip: true, reason: `redirect-loop(${PRECHECK_MAX_REDIRECTS})`, tryBrowser: sawHtmlHint, finalUrl: null };
}
// ---------- Browser lifecycle ----------
// Shared Chromium instance; also read by the scan error path and signal handlers.
let browser;
// Launch currently in progress, shared by concurrent callers; null otherwise.
let browserLaunchPromise = null;

/**
 * Return a connected shared browser, launching one when needed.
 *
 * Concurrent callers that arrive while a launch is in progress await that same
 * launch instead of each starting (and then orphaning) their own Chromium.
 *
 * @returns {Promise<import('playwright').Browser>} The connected browser.
 * @throws Whatever `chromium.launch` rejects with.
 */
async function ensureBrowser() {
  if (browser && browser.isConnected()) return browser;
  if (browserLaunchPromise) return browserLaunchPromise;

  const launching = (async () => {
    // A stale, disconnected handle is closed best-effort before relaunching.
    if (browser) { try { await browser.close(); } catch {} }
    log.info(`[BROWSER] Launch headless Chromium`);
    browser = await chromium.launch({ executablePath: CHROMIUM_PATH, headless: true, args: CHROMIUM_ARGS });
    return browser;
  })();
  browserLaunchPromise = launching;

  try {
    return await launching;
  } finally {
    // Clear the marker on success or failure so a later call can retry.
    if (browserLaunchPromise === launching) browserLaunchPromise = null;
  }
}
// ---------- Redirect chain builder (document-only) ----------
/**
 * List the redirect hops that led to a document response, oldest first.
 *
 * Walks `redirectedFrom()` backwards from the response's request. Every hop is
 * tagged with the status of `resp` itself, not of the hop's own response.
 *
 * @param {object} resp - Response-like object exposing `request()` and `status()`.
 * @param {number} [maxLen=50] - Stop collecting once this many hops are gathered.
 * @returns {Array<{from: string, to: string, status: number}>} Hops in
 *   chronological order; empty for non-document requests.
 */
function buildRedirectChainForResponse(resp, maxLen = 50) {
  const request = resp.request();
  if (request.resourceType() !== 'document') return [];

  const status = resp.status();
  const hops = [];
  let target = request.url();
  for (let source = request.redirectedFrom(); source; source = source.redirectedFrom()) {
    const sourceUrl = source.url();
    // Prepend so the oldest hop ends up first.
    hops.unshift({ from: sourceUrl, to: target, status });
    target = sourceUrl;
    if (hops.length >= maxLen) break;
  }
  return hops;
}
// ---------- Quiet network window ----------
/**
 * Wait until the network has been idle for `quietMs`, or until `timeoutMs`
 * elapses, polling every 100 ms.
 *
 * @param {object} opts
 * @param {{value: number}} opts.inflightRef - Live count of in-flight requests.
 * @param {{value: number}} opts.lastChangeRef - Epoch ms of the last network activity.
 * @param {number} opts.timeoutMs - Upper bound on the total wait.
 * @param {number} opts.quietMs - Required idle duration.
 * @returns {Promise<void>} Resolves on quiet or on timeout; never rejects.
 */
async function quietWindowWait({ inflightRef, lastChangeRef, timeoutMs, quietMs }) {
  const pollMs = 100;
  const pause = ms => new Promise(resolve => setTimeout(resolve, ms));
  const startedAt = Date.now();
  const isQuiet = () => {
    const idleFor = Date.now() - lastChangeRef.value;
    return inflightRef.value === 0 && idleFor >= quietMs;
  };

  for (;;) {
    // Negated `<` keeps the original semantics for non-numeric timeouts.
    if (!(Date.now() - startedAt < timeoutMs)) return;
    if (isQuiet()) return;
    await pause(pollMs);
  }
}
// ---------- Core scan with Playwright ----------
/**
 * Load `startUrl` in a fresh browser context and record what it contacts.
 *
 * @param {string} originDomain - Domain being scanned (unused here; kept for callers).
 * @param {string} startUrl - URL to navigate to.
 * @param {object} [contextOpts={}] - Extra options merged into `newContext()`.
 * @returns {Promise<{finalUrl: string, relatedDomains: string[], redirectChain: Array<{from: string, to: string, status: (number|null)}>}>}
 *   Final page URL, sorted hostnames of observed requests (minus hosts
 *   containing "doubleclick"/"google"), and document redirect hops in order.
 * @throws {Error} On navigation failure or when the redirect limit is exceeded.
 */
async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) {
  const b = await ensureBrowser();
  const context = await b.newContext({ acceptDownloads: true, ...contextOpts });

  const seenDomains = new Set();
  const redirectLog = [];
  // Status of each redirecting document response, keyed by its request, so a
  // hop can be labelled with the status that actually caused it.
  const redirectStatusByRequest = new WeakMap();
  // Requests with no response or failure yet. A Set keeps the count correct
  // even when one request produces more than one terminal event.
  const pendingRequests = new Set();
  const inflightRef = { value: 0 };
  const lastChangeRef = { value: Date.now() };

  const markActivity = () => {
    inflightRef.value = pendingRequests.size;
    lastChangeRef.value = Date.now();
  };
  const noteDomain = url => {
    const d = extractDomain(url);
    if (d) seenDomains.add(d);
  };

  const onReq = req => {
    pendingRequests.add(req);
    markActivity();
    noteDomain(req.url());
    log.debug(`[REQ] ${req.method()} ${req.url()}`);
    if (req.resourceType() !== 'document') return;
    // Record exactly one hop per redirected document request, so the chain
    // contains no duplicates and its length equals the number of redirects.
    const prev = req.redirectedFrom();
    if (prev) {
      redirectLog.push({
        from: prev.url(),
        to: req.url(),
        status: redirectStatusByRequest.get(prev) ?? null,
      });
    }
  };
  const onResp = resp => {
    const req = resp.request();
    pendingRequests.delete(req);
    markActivity();
    noteDomain(resp.url());
    const status = resp.status();
    log.debug(`[RESP] ${status} ${resp.url()}`);
    if (status >= 300 && status < 400 && req.resourceType() === 'document') {
      redirectStatusByRequest.set(req, status);
    }
  };
  // Failed requests never produce a response; without this they would keep
  // the in-flight count above zero and force the quiet wait to time out.
  const onFailed = req => {
    pendingRequests.delete(req);
    markActivity();
  };

  let page = null;
  try {
    // Safe redirect limiter for document navigations.
    // NOTE(review): route.fetch() follows redirects itself before fulfill(),
    // so the page presumably never observes those hops — confirm finalUrl and
    // redirectChain behave as intended for server-side redirects.
    await context.route('**', async route => {
      const request = route.request();
      const isDoc = request.resourceType() === 'document';
      const isNav = request.isNavigationRequest();
      if (!(isDoc && isNav)) return route.continue();
      try {
        const resp = await route.fetch({ maxRedirects: MAX_REDIRECT_STEPS });
        const status = resp.status();
        const headers = await resp.headers();
        const body = await resp.body().catch(() => null);
        try {
          await route.fulfill({ status, headers, body });
        } catch (e) {
          log.debug(`[ROUTE] fulfill failed for ${request.url()}: ${e?.message || e}`);
          await route.continue();
        }
      } catch (e) {
        const msg = String(e?.message || '');
        if (/redirect/i.test(msg) || /too many/i.test(msg)) {
          // Surface the redirect overflow to goto() as a synthetic 508.
          try {
            await route.fulfill({
              status: 508,
              contentType: 'text/plain',
              body: 'Loop Detected: too many redirects'
            });
          } catch (e2) {
            log.debug(`[ROUTE] fulfill(508) failed for ${request.url()}: ${e2?.message || e2}`);
            await route.continue();
          }
        } else {
          log.debug(`[ROUTE] fetch failed for ${request.url()}: ${msg}`);
          await route.continue();
        }
      }
    });

    page = await context.newPage();
    if (DEBUG_ENABLED) {
      page.on('console', msg => log.debug(`[PAGE.CONSOLE] ${msg.type()}: ${msg.text()}`));
      page.on('pageerror', err => log.debug(`[PAGE.ERROR] ${err?.message}`));
      page.on('requestfailed', req => log.debug(`[REQ.FAIL] ${req.url()} reason=${req.failure()?.errorText}`));
    }
    page.on('download', async dl => {
      try { await dl.failure().catch(() => {}); } catch {}
      log.debug(`[SCAN] Download ignored: ${dl.url()}`);
    });
    page.on('request', onReq);
    page.on('response', onResp);
    page.on('requestfailed', onFailed);

    log.info(`[SCAN] goto(${startUrl}) domcontentloaded timeout=${NAV_TIMEOUT_MS}`);
    let response;
    try {
      response = await page.goto(startUrl, { waitUntil: 'domcontentloaded', timeout: NAV_TIMEOUT_MS });
    } catch (e) {
      const msg = String(e?.message || '');
      if (/Download is starting/i.test(msg)) {
        log.info(`[SCAN] goto triggered download; continue as non-HTML`);
      } else {
        throw e;
      }
    }
    if (response && response.status && response.status() === 508) {
      throw new Error(`Too many redirects (${MAX_REDIRECT_STEPS})`);
    }

    await quietWindowWait({ inflightRef, lastChangeRef, timeoutMs: NAV_TIMEOUT_MS, quietMs: QUIET_WINDOW_MS });

    const finalUrl = page.url();
    const steps = redirectLog.length;
    if (steps > MAX_REDIRECT_STEPS) throw new Error(`Too many redirects (${steps})`);

    const relatedDomains = Array.from(seenDomains)
      .filter(d => !d.includes('doubleclick') && !d.includes('google'))
      .sort();
    log.info(`[SCAN] Done finalUrl=${finalUrl} domains=${relatedDomains.length} redirects=${steps}`);
    return { finalUrl, relatedDomains, redirectChain: redirectLog };
  } catch (e) {
    log.error(`[SCAN] Error: ${e?.message}`);
    throw e;
  } finally {
    if (page) {
      page.off('request', onReq);
      page.off('response', onResp);
      page.off('requestfailed', onFailed);
    }
    // Always release the context, including when route()/newPage() failed.
    try { await context.close(); } catch {}
    // Drop a dead browser handle so the next scan relaunches it.
    try {
      if (browser && typeof browser.isConnected === 'function' && !browser.isConnected()) {
        await browser.close(); browser = null;
      }
    } catch {}
  }
}
// ---------- High-level scan with precheck and escalation ----------
/**
 * Scan one domain: run the HTTP precheck, then the browser scan unless the
 * precheck classified the target as an attachment or non-HTML content.
 *
 * @param {string} originDomain - Normalized hostname to scan.
 * @returns {Promise<object>} `{ finalUrl, relatedDomains, redirectChain }`,
 *   plus `precheck` (a reason string) when the browser scan was skipped or failed.
 */
async function scanDomainOnce(originDomain) {
  const startUrl = `https://${originDomain}`;
  log.info(`[SCAN] Start domain="${originDomain}" url=${startUrl}`);

  const pre = await precheckFollowManually(startUrl);
  const reason = pre.reason || '';
  const skipped = Boolean(pre.skip);

  // Attachments and non-HTML targets are reported without opening a browser.
  const isNonHtml = pre.reason === 'attachment' || reason.startsWith('non-HTML');
  if (skipped && isNonHtml) {
    log.info(`[SCAN] Skip non-HTML/attachment: ${pre.reason}`);
    return {
      finalUrl: pre.finalUrl || startUrl,
      relatedDomains: [originDomain],
      redirectChain: [],
      precheck: pre.reason,
    };
  }

  // A "marketing redirect" sends the browser directly to the redirect target.
  const followRedirect = skipped && reason.startsWith('marketing-redirect') && Boolean(pre.finalUrl);
  const targetUrl = followRedirect ? pre.finalUrl : startUrl;
  if (followRedirect) {
    log.info(`[SCAN] Marketing redirect -> follow target in browser: ${pre.finalUrl}`);
  } else if (skipped && pre.tryBrowser) {
    log.info(`[SCAN] Escalation to browser due to ${pre.reason}`);
  }

  const contextOpts = {
    userAgent:
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    locale: 'en-US',
    timezoneId: 'UTC',
  };

  try {
    const scanned = await scanWithBrowser(originDomain, targetUrl, contextOpts);
    // The scanned domain always appears in the list, at the front when added here.
    if (scanned.relatedDomains.indexOf(originDomain) === -1) {
      scanned.relatedDomains.unshift(originDomain);
    }
    return scanned;
  } catch (e) {
    log.warn(`[SCAN] Browser escalation failed: ${e?.message}`);
    return {
      finalUrl: targetUrl,
      relatedDomains: [originDomain],
      redirectChain: [],
      precheck: pre.reason || 'blocked',
    };
  }
}
// ---------- Cache helpers ----------
/**
 * Look up a non-expired cached scan for `domain`.
 *
 * @param {string} domain - Cache key (normalized hostname).
 * @returns {object|null} `{ relatedDomains, finalUrl, redirectChain, cached,
 *   cachedAt, ttlAt }`, or null when the row is missing, expired or unparsable.
 */
function getFromCache(domain) {
  const row = stmtSelect.get(domain);
  if (!row) return null;

  const nowSec = Math.floor(Date.now() / 1000);
  // The entry is fresh only while its expiry lies strictly in the future.
  if (!(row.ttl_at > nowSec)) return null;

  try {
    const relatedDomains = JSON.parse(row.result_json);
    const redirectChain = row.redirect_chain_json ? JSON.parse(row.redirect_chain_json) : [];
    return {
      relatedDomains,
      finalUrl: row.final_url || null,
      redirectChain,
      cached: true,
      cachedAt: row.updated_at,
      ttlAt: row.ttl_at,
    };
  } catch (e) {
    log.warn(`[CACHE] Parse error: ${e?.message}`);
    return null;
  }
}
/**
 * Store (insert or overwrite) a scan result for `domain`, expiring
 * CACHE_TTL_SECONDS from now. Failures are logged and not rethrown.
 *
 * @param {string} domain - Cache key (normalized hostname).
 * @param {object} result - Scan result; `relatedDomains`, `finalUrl` and
 *   `redirectChain` are persisted.
 * @returns {void}
 */
function putToCache(domain, result) {
  const updatedAt = Math.floor(Date.now() / 1000);
  const expiresAt = updatedAt + CACHE_TTL_SECONDS;
  try {
    const record = {
      domain,
      result_json: JSON.stringify(result.relatedDomains || []),
      final_url: result.finalUrl || null,
      redirect_chain_json: JSON.stringify(result.redirectChain || []),
      updated_at: updatedAt,
      ttl_at: expiresAt,
    };
    stmtUpsert.run(record);
    log.info(`[CACHE] Upsert ${domain} ttlAt=${expiresAt}`);
  } catch (e) {
    log.warn(`[CACHE] Upsert error: ${e?.message}`);
  }
}
// ---------- Routes ----------
/**
 * GET /domains?domain=<host or URL>
 *
 * Responds with the cached scan when one is fresh; otherwise scans the domain,
 * caches a successful browser result and returns it. A hard timer answers 504
 * when the scan outlasts HARD_TIMEOUT_MS (default 70000). Every later reply
 * goes through `reply`, which is a no-op once a response has been sent, so a
 * scan finishing after the 504 can no longer throw on already-sent headers.
 */
app.get('/domains', async (req, res) => {
  res.type('application/json');
  const raw = req.query.domain;
  log.info(`[HTTP] /domains?domain=${raw}`);
  const domain = normalizeDomain(raw);
  if (!domain) {
    res.status(400).json({ error: '"domain" must be a valid hostname', code: 'BAD_DOMAIN' });
    return;
  }

  // Send a JSON reply unless one has already gone out (e.g. the 504 below).
  const reply = (status, body) => {
    if (res.headersSent) {
      log.debug(`[HTTP] Reply ${status} for ${domain} dropped: response already sent`);
      return;
    }
    res.status(status).json(body);
  };

  // A non-numeric HARD_TIMEOUT_MS would parse to NaN and fire the timer at once.
  const parsedTimeout = parseInt(process.env.HARD_TIMEOUT_MS || '70000', 10);
  const HARD_TIMEOUT = Number.isFinite(parsedTimeout) && parsedTimeout > 0 ? parsedTimeout : 70000;
  const hardTimer = setTimeout(() => {
    try { reply(504, { error: 'Gateway Timeout', code: 'TIMEOUT' }); } catch {}
  }, HARD_TIMEOUT);

  try {
    const cached = getFromCache(domain);
    if (cached) {
      log.info(`[HTTP] Cache HIT ${domain}`);
      reply(200, {
        domain,
        finalUrl: cached.finalUrl,
        relatedDomains: cached.relatedDomains,
        redirectChain: cached.redirectChain,
        cached: true,
        cachedAt: cached.cachedAt,
        ttlAt: cached.ttlAt,
        status: 'ok'
      });
      return;
    }

    const result = await scanDomainOnce(domain);

    // A `precheck` reason means no browser result exists; such answers are
    // returned but not cached.
    if (result.precheck) {
      if ((result.precheck || '').startsWith('marketing-redirect')) {
        reply(200, {
          domain,
          finalUrl: result.finalUrl || `https://${domain}`,
          relatedDomains: [domain],
          redirectChain: [],
          cached: false,
          status: 'ok',
          note: result.precheck
        });
        return;
      }
      reply(200, {
        domain,
        finalUrl: result.finalUrl || `https://${domain}`,
        relatedDomains: [domain],
        redirectChain: [],
        cached: false,
        status: (result.precheck === 'forbidden' || result.precheck === 'blocked') ? 'blocked' : 'skipped',
        reason: result.precheck
      });
      return;
    }

    // Cache even if the client already received a 504: the next request hits.
    putToCache(domain, result);
    reply(200, {
      domain,
      finalUrl: result.finalUrl,
      relatedDomains: result.relatedDomains,
      redirectChain: result.redirectChain,
      cached: false,
      status: 'ok'
    });
  } catch (e) {
    const msg = String(e?.message || 'Internal error');
    log.error(`[HTTP] Error for ${domain}: ${msg}`);
    const forbidden = /403|forbidden|blocked/i.test(msg);
    reply(forbidden ? 403 : 500, {
      error: forbidden ? 'Forbidden' : 'Internal server error',
      code: forbidden ? 'FORBIDDEN' : 'INTERNAL',
      details: msg
    });
  } finally {
    clearTimeout(hardTimer);
  }
});
// Health probe: replies with the JSON body {"ok": true}.
app.get('/health', (_req, res) => {
  res.type('application/json').json({ ok: true });
});
// ---------- Signals ----------
// On SIGTERM or SIGINT: log the signal, close the shared browser
// (best-effort), then exit with code 0.
for (const signalName of ['SIGTERM', 'SIGINT']) {
  process.on(signalName, async () => {
    log.info(`[SIGNAL] ${signalName}`);
    if (browser) {
      try { await browser.close(); } catch {}
    }
    process.exit(0);
  });
}
// ---------- Start ----------
// Log the bound port once the HTTP server is listening.
function announceListening() {
  log.info(`Domain scanner service listening on port ${PORT}`);
}
app.listen(PORT, announceListening);