Compare commits

...
Sign in to create a new pull request.

16 commits

Author SHA1 Message Date
d6d6027a17 Add yandex to ignore domains
Some checks failed
Build and Push Docker Image / build-and-push (push) Has been cancelled
2025-09-19 13:47:36 +00:00
a038862553 Remove punycode from server code, unlink old pcode library 2025-09-15 16:27:37 +00:00
35b0ff1cd5 Update README.md
Some checks failed
Build and Push Docker Image / build-and-push (push) Has been cancelled
2025-09-13 13:26:58 +00:00
06929de8f6 New logic of redirect. Add DEBUG var
Some checks are pending
Build and Push Docker Image / build-and-push (push) Waiting to run
2025-09-13 12:17:29 +00:00
710c9d6b34 Old server logic
Some checks are pending
Build and Push Docker Image / build-and-push (push) Waiting to run
2025-09-13 09:55:07 +00:00
d3d848884e Add node dep
Some checks are pending
Build and Push Docker Image / build-and-push (push) Waiting to run
2025-09-12 16:41:02 +00:00
72752b1a0b Add tini as INIT in container. Rewrite server.js
Some checks are pending
Build and Push Docker Image / build-and-push (push) Waiting to run
2025-09-12 16:21:33 +00:00
a5a0ed828d Add sqlite dep
Some checks are pending
Build and Push Docker Image / build-and-push (push) Waiting to run
2025-09-12 09:39:06 +00:00
361fa4fafe Add 301 logic and cache
Some checks are pending
Build and Push Docker Image / build-and-push (push) Waiting to run
2025-09-12 09:13:47 +00:00
09f054b888 Use headless chromium
Some checks are pending
Build and Push Docker Image / build-and-push (push) Waiting to run
2025-09-11 09:59:28 +00:00
cdd36afd33 Change base image for OSI to trixie:slim 2025-09-11 09:23:02 +00:00
46b32d6aa7 Change base image for OCI 2025-09-11 08:08:43 +00:00
7c2a58c894 Add env setting to workflow 2025-09-11 07:57:09 +00:00
e1911bf5e8 Fix dockerhub image name 2025-09-11 07:45:35 +00:00
root
fb133dff6c Add Github Workflow 2025-09-11 07:31:31 +00:00
root
ff1330868e Add Github Workflow 2025-09-11 07:31:02 +00:00
9 changed files with 1119 additions and 112 deletions

13
.dockerignore Normal file
View file

@ -0,0 +1,13 @@
*
!package.json
!package-lock.json
!server.js
!ignore-domains.txt
!LICENSE
!README.md

View file

@ -1,42 +0,0 @@
name: Build and Push Docker Image
on:
push:
branches:
- main
paths:
- '**/*' # Trigger on any change in the repository
jobs:
build-and-push:
runs-on: docker
container:
image: docker:24.0.1
steps:
# Install the Docker CLI (in case it is not in the base image)
- name: Setup Docker CLI
run: |
apk add --no-cache docker-cli
# Log in to Docker Hub - the token must be added to Secrets
- name: Login to Docker Hub
env:
DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}
DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
run: |
echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
# Cloning the sources is built into Forgejo Actions
# Build the Docker image
# NOTE(review): DOCKER_USERNAME appears to be set only in the `env` of the
# login step above; if so, ${DOCKER_USERNAME} expands to an empty string in
# the build and push steps below - confirm against the indented original.
- name: Build Docker Image
run: |
docker build -t ${DOCKER_USERNAME}/playwright-domain-scanner:latest .
# Push the image to Docker Hub
- name: Push Docker Image
run: |
docker push ${DOCKER_USERNAME}/playwright-domain-scanner:latest

View file

@ -0,0 +1,28 @@
# Builds the Docker image from the repository and pushes it to Docker Hub
# on every push to the main branch.
name: Build and Push Docker Image
on:
  push:
    branches:
      - main
jobs:
  build-and-push:
    runs-on: ubuntu-latest
    # Deployment environment; secrets configured on it are available to the job.
    environment: dockerhub
    steps:
      - name: Checkout the repository
        uses: actions/checkout@v4
      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
      - name: Build and push Docker image
        uses: docker/build-push-action@v6
        with:
          # Build from the checked-out working tree; without an explicit
          # context the action builds from the Git ref and ignores the checkout.
          context: .
          push: true
          tags: ${{ secrets.DOCKER_USERNAME }}/gekata:latest

View file

@ -1,46 +1,65 @@
# Use official Node.js LTS base image
FROM node:20-slim
# -------- Builder stage --------
FROM debian:bookworm-slim AS builder
ENV DEBIAN_FRONTEND=noninteractive
# Install dependencies for running Chromium
RUN apt-get update && apt-get install -y \
ca-certificates \
fonts-liberation \
libasound2 \
libatk-bridge2.0-0 \
libatk1.0-0 \
libcups2 \
libdbus-1-3 \
libdrm2 \
libgbm1 \
libgtk-3-0 \
libnspr4 \
libnss3 \
libx11-xcb1 \
libxcomposite1 \
libxdamage1 \
libxrandr2 \
xdg-utils \
wget \
--no-install-recommends && \
rm -rf /var/lib/apt/lists/*
# Node + build tools for native modules (better-sqlite3)
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates curl gnupg \
nodejs npm \
python3 make g++ pkg-config libsqlite3-dev \
&& rm -rf /var/lib/apt/lists/*
# Set working directory
WORKDIR /usr/src/app
WORKDIR /app
# Copy package files and install dependencies
# Copy only manifests first to leverage Docker cache
COPY package*.json ./
COPY ignore-domains.txt ./
RUN npm ci
# Install Playwright browsers (Chromium)
RUN npx playwright install chromium
# Install production deps (build native modules here)
ENV CI=true
RUN npm ci --omit=dev
# Copy app sources
# Copy source
COPY . .
# Expose port
# -------- Runtime stage --------
# Final image: same Debian base as the builder stage, with the Node.js runtime,
# system Chromium and tini installed from the distribution packages.
FROM debian:bookworm-slim
ENV DEBIAN_FRONTEND=noninteractive
# Install tini for proper PID 1 and signal handling
# Install Node.js runtime, Chromium and minimal libs
# NOTE(review): curl, gnupg and npm do not appear to be needed at runtime in
# the visible part of this file (the service is started with `node server.js`)
# - confirm before trimming them from the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates curl gnupg \
tini \
nodejs npm \
chromium \
libx11-6 libxcomposite1 libxdamage1 libxrandr2 libxkbcommon0 \
libgtk-3-0 libnss3 libdrm2 libgbm1 libasound2 fonts-liberation \
fonts-dejavu-core \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy node_modules and app from builder
COPY --from=builder /app/node_modules /app/node_modules
COPY --from=builder /app/package*.json /app/
# NOTE(review): this COPY reads the application sources from the build context
# (as filtered by .dockerignore), not from the builder stage.
COPY . .
# Security: drop root
# Create an unprivileged user, hand /app over to it and run everything below as that user.
RUN useradd -ms /bin/bash nodeuser && chown -R nodeuser:nodeuser /app
USER nodeuser
# Environment
# PORT, CHROMIUM_PATH and CACHE_TTL_SECONDS are read by server.js; the
# PLAYWRIGHT_* variables are Playwright settings - presumably they keep it
# from downloading its own browser, since the system Chromium is used (confirm).
ENV PORT=3000 \
PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 \
PLAYWRIGHT_BROWSERS_PATH=0 \
CHROMIUM_PATH=/usr/bin/chromium \
CACHE_TTL_SECONDS=21600
EXPOSE 3000
# Run the service
CMD ["npm", "start"]
# Use tini as PID 1 so we don't need `--init`
ENTRYPOINT ["/usr/bin/tini", "--"]
# Start the service
CMD ["node", "server.js"]

118
README.md
View file

@ -1,3 +1,117 @@
# gekata
**Gekata** — это легковесный сервис на Node.js для извлечения «связных доменов» с веб-страниц, запускаемый в контейнере Debian с Chromium; он сначала делает предзапросы с ручным следованием редиректам, затем при необходимости поднимает безголовый браузер, ограничивает глубину редиректов и кэширует результаты в SQLite через better-sqlite3. Сервис предоставляет HTTP API /domains, возвращающее финальный URL, цепочку редиректов и список связанных доменов, а также /health для проверки готовности.
Extract all domains from site
### Назначение
Gekata сканирует заданный домен, разрешает маркетинговые и другие редиректы до целевой HTML-страницы, загружает её в безголовом Chromium и собирает множество доменных имён из всех сетевых запросов страницы, формируя список «связных доменов» для анализа интеграций, трекинга и CDN. Такой подход работает и для динамических сайтов с клиентским рендерингом.
### Архитектура
- Веб-сервер на Express предоставляет REST-маршруты, принимает домен, валидирует его и инициирует сканирование, обрабатывая таймауты и коды ошибок.
- Предпроверка делает GET с ручным управлением редиректами, классифицируя сценарии: форс‑редиректы «маркетинга», запреты 403, скачивания и не-HTML-контент, чтобы экономить запуск браузера.
- Эскалация в безголовый Chromium (через Playwright) выполняет навигацию, применяя ограничение глубины редиректов только для документных переходов и ожидая «тихое окно» сети для стабильного сбора доменов.
- Кэширование результатов в SQLite с TTL ускоряет повторные запросы; используется подходящая для продакшена библиотека better-sqlite3 и WAL-журналирование для устойчивости.
### Потоки данных
- Вход: GET /domains?domain=<host> — принимает хост или URL, нормализует до ASCII/Punycode и формирует стартовый https:// URL.
- Предобработка: ручной обход 3xx с ограничением шагов; детекция «похожих на файл» ссылок и не-HTML-контента; маркетинговый редирект помечается и может быть целевым.
- Сканирование браузером: навигация на целевой URL, слежение за запросами/ответами страницы, сбор доменов из всех сетевых событий, исключая шум (google/doubleclick по эвристике), построение цепочки редиректов для документной навигации.
- Выход: JSON с finalUrl, relatedDomains[], redirectChain[], статусами ok/skipped/blocked и служебными пометками (cached, ttl).
### API
- GET /domains
Параметры: domain — доменное имя или URL.
Ответ 200 ok:
- domain: нормализованный запрошенный домен.
- finalUrl: конечный URL после редиректов/навигации.
- relatedDomains: уникальные домены, замеченные при загрузке страницы.
- redirectChain: массив { from, to, status } для документных 3xx.
- cached: true/false, cachedAt, ttlAt.
- status: ok | skipped | blocked; дополнительные note/reason при skip/blocked.
- GET /health — простой JSON { ok: true } для readiness/liveness.
### Обработка редиректов
- На этапе предпроверки ограничение PRECHECK_MAX_REDIRECTS предотвращает бесконечные цепочки до запуска браузера; 403 заставляет эскалировать в браузер, а не-HTML/attachment возвращают немедленный ответ.
- В браузере включён маршрут‑ограничитель только для документной навигации: запросы навигации обрабатываются с maxRedirects, ассеты идут без ограничений, чтобы не ломать рендеринг.
- Если лимит превышен, навигация завершается контролируемо и возвращается ошибка «Too many redirects», переводимая в понятный статус ответа API.
### Кэш и TTL
- SQLite таблица domain_cache хранит: домен, JSON списка доменов, финальный URL, цепочку редиректов, время обновления и ttl_at.
- Повторные обращения до истечения TTL возвращают сохранённый результат без запуска браузера, снижая задержки и нагрузку.
### Контейнеризация
- Образ состоит из двух стадий: builder и runtime, обе на debian:bookworm-slim.
- Стадия builder устанавливает Node.js, компилятор и заголовки SQLite для сборки native-модуля better-sqlite3, затем выполняет npm ci с пропуском dev-зависимостей и копирует исходники.
- Стадия runtime устанавливает tini как корректный PID 1, Node.js runtime, системный Chromium и минимальный набор X/GTK/NSS/GBM/шрифтов, необходимых для безголового режима; копируются node_modules и исходники из builder.
- Создаётся непривилегированный пользователь nodeuser; директория приложения принадлежит ему; сервис запускается не от root.
### Переменные окружения
- PORT — порт HTTP сервера (по умолчанию 3000).
- CHROMIUM_PATH — путь к системному Chromium (/usr/bin/chromium в контейнере).
- CACHE_TTL_SECONDS — срок жизни кэша (по умолчанию 6 часов).
- HARD_TIMEOUT_MS — жёсткий таймаут обработки HTTP-запроса (по умолчанию 70 секунд).
- MAX_REDIRECT_STEPS — максимальная глубина редиректов для документной навигации (по умолчанию 20).
- NAV_TIMEOUT_MS, QUIET_WINDOW_MS — таймауты навигации и «тихого окна» сети.
- DEBUG — включает подробные логи страницы/сетевых событий при значении 1.
### Безопасность и устойчивость
- tini как init обрабатывает сигналы и «зомби» процессы; контейнер корректно завершает Chromium по SIGTERM/SIGINT, предотвращая утечки.
- Запуск под непривилегированным пользователем снижает риск компрометации; Chromium стартует с флагами --no-sandbox/--disable-setuid-sandbox, что совместимо с непривилегированным окружением контейнеров.
- Ограничение редиректов для документных переходов устраняет зацикливание «маркетинговых» и неверных конфигураций, не влияя на загрузку ассетов.
### Производительность
- npm ci в builder-стадии плюс копирование package*.json до исходников задействуют кэш слоёв Docker, ускоряя сборки.
- better-sqlite3 с синхронными подготовленными выражениями обеспечивает быстрый локальный кэш без отдельного сервиса БД.
- Предпроверка HTTP избавляет от лишних подъёмов браузера для не-HTML или «прикреплённых» ответов.
### Сборка и запуск
- Сборка образа:
- docker build -t gekata:latest .
- Запуск контейнера:
- docker run --rm -p 3000:3000 -e CACHE_TTL_SECONDS=21600 -e MAX_REDIRECT_STEPS=20 gekata:latest
- Примеры запросов:
- curl -s "http://localhost:3000/health"
- curl -s "http://localhost:3000/domains?domain=forum.xda-developers.com"
### Журналирование и диагностика
- Лог‑метки [BOOT], [HTTP], [SCAN], [BROWSER], [CACHE], [SIGNAL] позволяют быстро локализовать этап и тип события.
- При включённом DEBUG=1 логируются консоль страницы, ошибки, неудавшиеся запросы и сетевые эвенты, что помогает анализировать блокировки, CORS, антибот‑защиту и таймауты.
### Ограничения
- Сайты с жёсткими антибот‑мерами (403/JS-челленджи) могут быть помечены как blocked или потребовать дополнительной эмуляции (например, иные user-agent/locale/timezone/proxy).
- Сбор связанных доменов базируется на фактически выполненных сетевых запросах и может меняться при A/B-тестах, гео‑таргетинге или различиях по user-agent.
### Расширения и доработки
- Добавить белый/чёрный список доменов, тонкую фильтрацию трекеров и интеграций.
- Вынести кэш во внешний SQLite-файл через volume для сохранения между рестартами, настроить резервное копирование.
- Параметризовать user-agent/locale/timezone и добавить поддержку прокси для региональных сценариев.
- Экспортировать полный сетевой журнал и тайминги (HAR-подобный формат) как опциональную выгрузку.
### Файлы проекта
- server.js — основной сервис, логика API, предобработка, сканирование браузером, кэш, ограничения редиректов, завершение по сигналам.
- Dockerfile — двухстадийная сборка, системный Chromium в рантайме, tini, непривилегированный пользователь, переменные окружения и запуск службы.

View file

@ -1,3 +1,3 @@
doubleclick
google
yandex

414
package-lock.json generated
View file

@ -8,7 +8,9 @@
"name": "playwright-domain-scanner",
"version": "1.0.0",
"dependencies": {
"better-sqlite3": "^9.0.0",
"express": "^4.18.2",
"express-rate-limit": "^8.1.0",
"playwright": "^1.42.0"
}
},
@ -29,6 +31,53 @@
"resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz",
"integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg=="
},
"node_modules/base64-js": {
"version": "1.5.1",
"resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz",
"integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
]
},
"node_modules/better-sqlite3": {
"version": "9.6.0",
"resolved": "https://registry.npmjs.org/better-sqlite3/-/better-sqlite3-9.6.0.tgz",
"integrity": "sha512-yR5HATnqeYNVnkaUTf4bOP2dJSnyhP4puJN/QPRyx4YkBEEUxib422n2XzPqDEHjQQqazoYoADdAm5vE15+dAQ==",
"hasInstallScript": true,
"dependencies": {
"bindings": "^1.5.0",
"prebuild-install": "^7.1.1"
}
},
"node_modules/bindings": {
"version": "1.5.0",
"resolved": "https://registry.npmjs.org/bindings/-/bindings-1.5.0.tgz",
"integrity": "sha512-p2q/t/mhvuOj/UeLlV6566GD/guowlr0hHxClI0W9m7MWYkL1F0hLo+0Aexs9HSPCtR1SXQ0TD3MMKrXZajbiQ==",
"dependencies": {
"file-uri-to-path": "1.0.0"
}
},
"node_modules/bl": {
"version": "4.1.0",
"resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz",
"integrity": "sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==",
"dependencies": {
"buffer": "^5.5.0",
"inherits": "^2.0.4",
"readable-stream": "^3.4.0"
}
},
"node_modules/body-parser": {
"version": "1.20.3",
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.3.tgz",
@ -52,6 +101,29 @@
"npm": "1.2.8000 || >= 1.4.16"
}
},
"node_modules/buffer": {
"version": "5.7.1",
"resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz",
"integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
],
"dependencies": {
"base64-js": "^1.3.1",
"ieee754": "^1.1.13"
}
},
"node_modules/bytes": {
"version": "3.1.2",
"resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz",
@ -87,6 +159,11 @@
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/chownr": {
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz",
"integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg=="
},
"node_modules/content-disposition": {
"version": "0.5.4",
"resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.4.tgz",
@ -127,6 +204,28 @@
"ms": "2.0.0"
}
},
"node_modules/decompress-response": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz",
"integrity": "sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==",
"dependencies": {
"mimic-response": "^3.1.0"
},
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/deep-extend": {
"version": "0.6.0",
"resolved": "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz",
"integrity": "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==",
"engines": {
"node": ">=4.0.0"
}
},
"node_modules/depd": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz",
@ -144,6 +243,14 @@
"npm": "1.2.8000 || >= 1.4.16"
}
},
"node_modules/detect-libc": {
"version": "2.0.4",
"resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.4.tgz",
"integrity": "sha512-3UDv+G9CsCKO1WKMGw9fwq/SWJYbI0c5Y7LU1AXYoDdbhE2AHQ6N6Nb34sG8Fj7T5APy8qXDCKuuIHd1BR0tVA==",
"engines": {
"node": ">=8"
}
},
"node_modules/dunder-proto": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
@ -170,6 +277,14 @@
"node": ">= 0.8"
}
},
"node_modules/end-of-stream": {
"version": "1.4.5",
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
"integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==",
"dependencies": {
"once": "^1.4.0"
}
},
"node_modules/es-define-property": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz",
@ -210,6 +325,14 @@
"node": ">= 0.6"
}
},
"node_modules/expand-template": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz",
"integrity": "sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==",
"engines": {
"node": ">=6"
}
},
"node_modules/express": {
"version": "4.21.2",
"resolved": "https://registry.npmjs.org/express/-/express-4.21.2.tgz",
@ -255,6 +378,28 @@
"url": "https://opencollective.com/express"
}
},
"node_modules/express-rate-limit": {
"version": "8.1.0",
"resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.1.0.tgz",
"integrity": "sha512-4nLnATuKupnmwqiJc27b4dCFmB/T60ExgmtDD7waf4LdrbJ8CPZzZRHYErDYNhoz+ql8fUdYwM/opf90PoPAQA==",
"dependencies": {
"ip-address": "10.0.1"
},
"engines": {
"node": ">= 16"
},
"funding": {
"url": "https://github.com/sponsors/express-rate-limit"
},
"peerDependencies": {
"express": ">= 4.11"
}
},
"node_modules/file-uri-to-path": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz",
"integrity": "sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw=="
},
"node_modules/finalhandler": {
"version": "1.3.1",
"resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.3.1.tgz",
@ -288,6 +433,11 @@
"node": ">= 0.6"
}
},
"node_modules/fs-constants": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz",
"integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow=="
},
"node_modules/fsevents": {
"version": "2.3.2",
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
@ -344,6 +494,11 @@
"node": ">= 0.4"
}
},
"node_modules/github-from-package": {
"version": "0.0.0",
"resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz",
"integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw=="
},
"node_modules/gopd": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz",
@ -403,11 +558,43 @@
"node": ">=0.10.0"
}
},
"node_modules/ieee754": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
"integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
]
},
"node_modules/inherits": {
"version": "2.0.4",
"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
"integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="
},
"node_modules/ini": {
"version": "1.3.8",
"resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz",
"integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew=="
},
"node_modules/ip-address": {
"version": "10.0.1",
"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.0.1.tgz",
"integrity": "sha512-NWv9YLW4PoW2B7xtzaS3NCot75m6nK7Icdv0o3lfMceJVRfSoQwqD4wEH5rLwoKJwUiZ/rfpiVBhnaF0FK4HoA==",
"engines": {
"node": ">= 12"
}
},
"node_modules/ipaddr.js": {
"version": "1.9.1",
"resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz",
@ -478,11 +665,40 @@
"node": ">= 0.6"
}
},
"node_modules/mimic-response": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz",
"integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/minimist": {
"version": "1.2.8",
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
"integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==",
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/mkdirp-classic": {
"version": "0.5.3",
"resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
"integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A=="
},
"node_modules/ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A=="
},
"node_modules/napi-build-utils": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-2.0.0.tgz",
"integrity": "sha512-GEbrYkbfF7MoNaoh2iGG84Mnf/WZfB0GdGEsM8wz7Expx/LlWf5U8t9nvJKXSp3qr5IsEbK04cBGhol/KwOsWA=="
},
"node_modules/negotiator": {
"version": "0.6.3",
"resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.3.tgz",
@ -491,6 +707,17 @@
"node": ">= 0.6"
}
},
"node_modules/node-abi": {
"version": "3.77.0",
"resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.77.0.tgz",
"integrity": "sha512-DSmt0OEcLoK4i3NuscSbGjOf3bqiDEutejqENSplMSFA/gmB8mkED9G4pKWnPl7MDU4rSHebKPHeitpDfyH0cQ==",
"dependencies": {
"semver": "^7.3.5"
},
"engines": {
"node": ">=10"
}
},
"node_modules/object-inspect": {
"version": "1.13.4",
"resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz",
@ -513,6 +740,14 @@
"node": ">= 0.8"
}
},
"node_modules/once": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
"integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
"dependencies": {
"wrappy": "1"
}
},
"node_modules/parseurl": {
"version": "1.3.3",
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
@ -554,6 +789,31 @@
"node": ">=18"
}
},
"node_modules/prebuild-install": {
"version": "7.1.3",
"resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.3.tgz",
"integrity": "sha512-8Mf2cbV7x1cXPUILADGI3wuhfqWvtiLA1iclTDbFRZkgRQS0NqsPZphna9V+HyTEadheuPmjaJMsbzKQFOzLug==",
"dependencies": {
"detect-libc": "^2.0.0",
"expand-template": "^2.0.3",
"github-from-package": "0.0.0",
"minimist": "^1.2.3",
"mkdirp-classic": "^0.5.3",
"napi-build-utils": "^2.0.0",
"node-abi": "^3.3.0",
"pump": "^3.0.0",
"rc": "^1.2.7",
"simple-get": "^4.0.0",
"tar-fs": "^2.0.0",
"tunnel-agent": "^0.6.0"
},
"bin": {
"prebuild-install": "bin.js"
},
"engines": {
"node": ">=10"
}
},
"node_modules/proxy-addr": {
"version": "2.0.7",
"resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz",
@ -566,6 +826,15 @@
"node": ">= 0.10"
}
},
"node_modules/pump": {
"version": "3.0.3",
"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.3.tgz",
"integrity": "sha512-todwxLMY7/heScKmntwQG8CXVkWUOdYxIvY2s0VWAAMh/nd8SoYiRaKjlr7+iCs984f2P8zvrfWcDDYVb73NfA==",
"dependencies": {
"end-of-stream": "^1.1.0",
"once": "^1.3.1"
}
},
"node_modules/qs": {
"version": "6.13.0",
"resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz",
@ -602,6 +871,33 @@
"node": ">= 0.8"
}
},
"node_modules/rc": {
"version": "1.2.8",
"resolved": "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz",
"integrity": "sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==",
"dependencies": {
"deep-extend": "^0.6.0",
"ini": "~1.3.0",
"minimist": "^1.2.0",
"strip-json-comments": "~2.0.1"
},
"bin": {
"rc": "cli.js"
}
},
"node_modules/readable-stream": {
"version": "3.6.2",
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz",
"integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==",
"dependencies": {
"inherits": "^2.0.3",
"string_decoder": "^1.1.1",
"util-deprecate": "^1.0.1"
},
"engines": {
"node": ">= 6"
}
},
"node_modules/safe-buffer": {
"version": "5.2.1",
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
@ -626,6 +922,17 @@
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="
},
"node_modules/semver": {
"version": "7.7.2",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz",
"integrity": "sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==",
"bin": {
"semver": "bin/semver.js"
},
"engines": {
"node": ">=10"
}
},
"node_modules/send": {
"version": "0.19.0",
"resolved": "https://registry.npmjs.org/send/-/send-0.19.0.tgz",
@ -749,6 +1056,49 @@
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/simple-concat": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz",
"integrity": "sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
]
},
"node_modules/simple-get": {
"version": "4.0.1",
"resolved": "https://registry.npmjs.org/simple-get/-/simple-get-4.0.1.tgz",
"integrity": "sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/feross"
},
{
"type": "patreon",
"url": "https://www.patreon.com/feross"
},
{
"type": "consulting",
"url": "https://feross.org/support"
}
],
"dependencies": {
"decompress-response": "^6.0.0",
"once": "^1.3.1",
"simple-concat": "^1.0.0"
}
},
"node_modules/statuses": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz",
@ -757,6 +1107,48 @@
"node": ">= 0.8"
}
},
"node_modules/string_decoder": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",
"integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==",
"dependencies": {
"safe-buffer": "~5.2.0"
}
},
"node_modules/strip-json-comments": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz",
"integrity": "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/tar-fs": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.3.tgz",
"integrity": "sha512-090nwYJDmlhwFwEW3QQl+vaNnxsO2yVsd45eTKRBzSzu+hlb1w2K9inVq5b0ngXuLVqQ4ApvsUHHnu/zQNkWAg==",
"dependencies": {
"chownr": "^1.1.1",
"mkdirp-classic": "^0.5.2",
"pump": "^3.0.0",
"tar-stream": "^2.1.4"
}
},
"node_modules/tar-stream": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz",
"integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==",
"dependencies": {
"bl": "^4.0.3",
"end-of-stream": "^1.4.1",
"fs-constants": "^1.0.0",
"inherits": "^2.0.3",
"readable-stream": "^3.1.1"
},
"engines": {
"node": ">=6"
}
},
"node_modules/toidentifier": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.1.tgz",
@ -765,6 +1157,17 @@
"node": ">=0.6"
}
},
"node_modules/tunnel-agent": {
"version": "0.6.0",
"resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz",
"integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==",
"dependencies": {
"safe-buffer": "^5.0.1"
},
"engines": {
"node": "*"
}
},
"node_modules/type-is": {
"version": "1.6.18",
"resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz",
@ -785,6 +1188,11 @@
"node": ">= 0.8"
}
},
"node_modules/util-deprecate": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
"integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw=="
},
"node_modules/utils-merge": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz",
@ -800,7 +1208,11 @@
"engines": {
"node": ">= 0.8"
}
},
"node_modules/wrappy": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="
}
}
}

View file

@ -7,8 +7,9 @@
"start": "node server.js"
},
"dependencies": {
"better-sqlite3": "^9.0.0",
"express": "^4.18.2",
"express-rate-limit": "^8.1.0",
"playwright": "^1.42.0"
}
}

514
server.js
View file

@ -1,56 +1,518 @@
// server.js
const express = require('express');
const { chromium } = require('playwright');
const Database = require('better-sqlite3');
// Убираем punycode; используем WHATWG URL + domainToASCII
const { URL, domainToASCII } = require('node:url');
const app = express();
const port = process.env.PORT || 3000;
// ---------- Config ----------
/**
 * Read an integer setting from the environment.
 *
 * An unset or empty variable yields `fallback`. A value that does not start
 * with a base-10 integer would make parseInt return NaN, which would then
 * leak into timeouts, redirect limits and cache TTL arithmetic; in that case
 * a warning is printed and `fallback` is used instead.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is missing or invalid.
 * @returns {number} The parsed integer, or `fallback`.
 */
function intFromEnv(name, fallback) {
  const raw = process.env[name];
  if (!raw) return fallback;
  const parsed = parseInt(raw, 10);
  if (Number.isNaN(parsed)) {
    // console is used directly: the `log` helper is declared further down.
    console.warn(`[BOOT] Ignoring non-numeric ${name}=${JSON.stringify(raw)}; using ${fallback}`);
    return fallback;
  }
  return parsed;
}

// HTTP port; falls back to 3000 when PORT is missing or not a finite number.
const requestedPort = Number(process.env.PORT || 3000);
const PORT = Number.isFinite(requestedPort) ? requestedPort : 3000;
// Path to the Chromium binary; undefined when CHROMIUM_PATH is not set.
const CHROMIUM_PATH = process.env.CHROMIUM_PATH || undefined;
const CACHE_TTL_SECONDS = intFromEnv('CACHE_TTL_SECONDS', 21600);
const MAX_REDIRECT_STEPS = intFromEnv('MAX_REDIRECT_STEPS', 20);
const NAV_TIMEOUT_MS = intFromEnv('NAV_TIMEOUT_MS', 30000);
const QUIET_WINDOW_MS = intFromEnv('QUIET_WINDOW_MS', 700);
const PRECHECK_MAX_REDIRECTS = intFromEnv('PRECHECK_MAX_REDIRECTS', 15);
const SQLITE_PATH = process.env.SQLITE_PATH || './cache.db';
// Verbose logging is enabled only when DEBUG is exactly "1" (after trimming).
const DEBUG_ENABLED = String(process.env.DEBUG || '').trim() === '1';
// Command-line flags for Chromium.
const CHROMIUM_ARGS = [
  '--no-sandbox',
  '--disable-setuid-sandbox',
  '--disable-dev-shm-usage',
  '--disable-gpu',
  '--no-zygote',
];
// ---------- Logging ----------
// Thin console wrapper; debug output is emitted only when DEBUG_ENABLED is set.
const log = {
  info(...args) {
    console.log(...args);
  },
  debug(...args) {
    if (!DEBUG_ENABLED) return;
    console.log(...args);
  },
  warn(...args) {
    console.warn(...args);
  },
  error(...args) {
    console.error(...args);
  },
};
// ---------- DB ----------
// Open the SQLite cache database at SQLITE_PATH and log which file is used.
log.info(`[BOOT] SQLite path: ${SQLITE_PATH}`);
const db = new Database(SQLITE_PATH);
// Switch the journal to write-ahead logging.
db.pragma('journal_mode = WAL');
// One row per domain. result_json and redirect_chain_json hold JSON text;
// updated_at and ttl_at are written by putToCache as epoch seconds.
db.exec(`
CREATE TABLE IF NOT EXISTS domain_cache (
domain TEXT PRIMARY KEY,
result_json TEXT NOT NULL,
final_url TEXT,
redirect_chain_json TEXT,
updated_at INTEGER NOT NULL,
ttl_at INTEGER NOT NULL
);
`);
// Prepared lookup of a single cache row by domain.
const stmtSelect = db.prepare(`
SELECT result_json, final_url, redirect_chain_json, updated_at, ttl_at
FROM domain_cache WHERE domain = ?
`);
// Prepared insert-or-replace: an existing row for the domain is overwritten in full.
const stmtUpsert = db.prepare(`
INSERT INTO domain_cache (domain, result_json, final_url, redirect_chain_json, updated_at, ttl_at)
VALUES (@domain, @result_json, @final_url, @redirect_chain_json, @updated_at, @ttl_at)
ON CONFLICT(domain) DO UPDATE SET
result_json = excluded.result_json,
final_url = excluded.final_url,
redirect_chain_json = excluded.redirect_chain_json,
updated_at = excluded.updated_at,
ttl_at = excluded.ttl_at
`);
// Parse JSON request bodies.
app.use(express.json());
// ---------- Helpers ----------
/**
 * Normalize user input (a bare host or an http(s) URL) to an ASCII hostname.
 *
 * The input is trimmed and lower-cased, parsed with the WHATWG URL parser
 * (a missing scheme is treated as https), and the hostname is converted to
 * its IDNA ASCII (Punycode) form.
 *
 * @param {*} input  Raw value, typically the `domain` query parameter.
 * @returns {string|null} ASCII hostname, or null when the input is not a
 *                        non-empty string or does not yield a valid hostname.
 */
function normalizeDomain(input) {
  if (!input || typeof input !== 'string') return null;
  const s = input.trim().toLowerCase();
  try {
    // An http(s) URL is used as-is; anything else is treated as a bare host.
    const asUrl = /^https?:\/\//i.test(s) ? s : `https://${s}`;
    const u = new URL(asUrl);
    // domainToASCII returns '' for an invalid domain, which maps to null here.
    const ascii = domainToASCII(u.hostname || '');
    return ascii || null;
  } catch {
    // URL parsing failed: try a direct IDNA conversion of the raw string.
    try {
      const ascii = domainToASCII(s);
      return ascii || null;
    } catch {
      return null;
    }
  }
}
/**
 * Return the lower-cased hostname of an absolute URL.
 *
 * @param {string} url  Absolute URL.
 * @returns {string|null} Hostname, or null when `url` cannot be parsed.
 */
function extractDomain(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }
  return parsed.hostname.toLowerCase();
}
/**
 * Heuristic: does the URL path end in a well-known file extension?
 *
 * Only the pathname is inspected, so query strings and fragments are ignored.
 *
 * @param {string} u  Absolute URL.
 * @returns {boolean} true for a file-like path, false otherwise or when `u` cannot be parsed.
 */
function looksLikeFilePath(u) {
  const fileExtension = /\.(?:zip|pdf|png|jpe?g|gif|webp|svg|mp4|mp3|wav|csv|xlsx?|docx?|pptx?|exe|deb|rpm|apk|tar(?:\.gz)?|7z|gz|bz2)$/i;
  let parsed;
  try {
    parsed = new URL(u);
  } catch {
    return false;
  }
  return fileExtension.test(parsed.pathname);
}
/**
 * Canonicalize a URL for redirect-loop detection by dropping its fragment.
 *
 * @param {string} u  URL to canonicalize.
 * @returns {string} Serialized URL without the hash, or `u` unchanged when it cannot be parsed.
 */
function normalizeUrlForLoop(u) {
  let parsed;
  try {
    parsed = new URL(u);
  } catch {
    return u;
  }
  parsed.hash = '';
  return parsed.href;
}
// ---------- Precheck: manual redirects & classification ----------
/**
 * Follow redirects by hand with plain HTTP GETs and classify the target
 * before a browser is spent on it.
 *
 * Each hop is fetched with `redirect: 'manual'`; only the status line and the
 * Content-Type / Content-Disposition / Location headers are inspected. Every
 * request is bounded by NAV_TIMEOUT_MS and its body is released right away.
 *
 * @param {string} startUrl  Absolute URL to start from.
 * @returns {Promise<{skip: boolean, reason: (string|null), tryBrowser: boolean, finalUrl: (string|null|undefined)}>}
 *   `skip`       – the precheck classified the target (loop, forbidden, file, attachment, ...);
 *   `reason`     – short classification label, null when nothing was detected;
 *   `tryBrowser` – a browser scan may still succeed;
 *   `finalUrl`   – last URL looked at (absent on some early exits).
 *   Network failures are not fatal: they resolve to `{ skip: false }`.
 */
async function precheckFollowManually(startUrl) {
  // Guard against an unparsable NAV_TIMEOUT_MS: AbortSignal.timeout rejects NaN.
  const timeoutMs = Number.isFinite(NAV_TIMEOUT_MS) && NAV_TIMEOUT_MS > 0 ? NAV_TIMEOUT_MS : 30000;
  // One GET without automatic redirect handling, bounded in time.
  const fetchManual = target => fetch(target, {
    method: 'GET',
    redirect: 'manual',
    signal: AbortSignal.timeout(timeoutMs),
  });
  // Only headers are needed; cancelling the body releases the connection.
  const discardBody = async response => {
    try { await response.body?.cancel(); } catch {}
  };

  let url = startUrl;
  const visited = new Set();
  let sawHtmlHint = false;
  for (let i = 0; i < PRECHECK_MAX_REDIRECTS; i++) {
    const norm = normalizeUrlForLoop(url);
    if (visited.has(norm)) {
      log.debug(`[PRECHECK] Loop at ${norm}`);
      return { skip: true, reason: 'redirect-loop', tryBrowser: sawHtmlHint };
    }
    visited.add(norm);

    let res;
    try {
      res = await fetchManual(url);
    } catch (e) {
      // Includes timeouts: let the browser scan decide.
      log.debug(`[PRECHECK] GET(manual) failed for ${url}: ${e?.message}`);
      return { skip: false, reason: null, tryBrowser: false };
    }
    const status = res.status;
    const ct = res.headers.get('content-type') || '';
    const cd = res.headers.get('content-disposition') || '';
    const loc = res.headers.get('location') || '';
    await discardBody(res);
    log.debug(`[PRECHECK] step=${i} status=${status} ct="${ct}" cd="${cd || '-'}" loc="${loc || '-'}"`);

    const isHtml = /\btext\/html\b/i.test(ct);
    if (isHtml) sawHtmlHint = true;
    const isAttachment = /attachment/i.test(cd);

    if (status === 403) {
      return { skip: true, reason: 'forbidden', tryBrowser: true };
    }

    if (status >= 300 && status < 400 && loc) {
      let next;
      try {
        next = new URL(loc, url).toString();
      } catch {
        // A malformed Location header must not abort the whole scan.
        log.debug(`[PRECHECK] Unparseable Location "${loc}" from ${url}`);
        return { skip: false, reason: null, tryBrowser: false, finalUrl: url };
      }
      if (looksLikeFilePath(next) || /download|file|export/i.test(next)) {
        return { skip: true, reason: `redirect-to-file(${next})`, tryBrowser: false, finalUrl: next };
      }
      // Probe the redirect target: an HTML page ends the manual walk here.
      try {
        const probe = await fetchManual(next);
        const pct = probe.headers.get('content-type') || '';
        await discardBody(probe);
        if (/\btext\/html\b/i.test(pct)) {
          return { skip: true, reason: `marketing-redirect(${next})`, tryBrowser: false, finalUrl: next };
        }
      } catch {}
      url = next;
      continue;
    }

    if (isAttachment) return { skip: true, reason: 'attachment', tryBrowser: false, finalUrl: url };
    if (!isHtml && ct) return { skip: true, reason: `non-HTML (${ct})`, tryBrowser: false, finalUrl: url };
    return { skip: false, reason: null, tryBrowser: false, finalUrl: url };
  }
  log.debug(`[PRECHECK] Too many redirects >= ${PRECHECK_MAX_REDIRECTS}`);
  return { skip: true, reason: `redirect-loop(${PRECHECK_MAX_REDIRECTS})`, tryBrowser: sawHtmlHint, finalUrl: null };
}
// ---------- Browser lifecycle ----------
// Shared Chromium instance, reused across scans.
let browser;
// Launch currently in progress, shared by concurrent callers; null when idle.
let browserLaunch = null;
/**
 * Return a connected Chromium instance, launching one when needed.
 *
 * Concurrent callers share a single launch, so parallel requests cannot each
 * start their own browser and orphan the earlier one.
 *
 * @returns {Promise<object>} The connected Playwright browser.
 * @throws whatever `chromium.launch` rejects with; the next call retries.
 */
async function ensureBrowser() {
  if (browser && browser.isConnected()) return browser;
  if (!browserLaunch) {
    browserLaunch = (async () => {
      // Dispose of a stale, disconnected instance before replacing it.
      if (browser) { try { await browser.close(); } catch {} }
      log.info(`[BROWSER] Launch headless Chromium`);
      browser = await chromium.launch({ executablePath: CHROMIUM_PATH, headless: true, args: CHROMIUM_ARGS });
      return browser;
    })().finally(() => { browserLaunch = null; });
  }
  return browserLaunch;
}
// ---------- Redirect chain builder (document-only) ----------
/**
 * List the redirect hops that led to the request behind `resp`.
 *
 * Walks `redirectedFrom()` backwards from the response's request and returns
 * the hops oldest-first. Every hop carries the status of `resp` itself.
 *
 * @param {object} resp      Response exposing `request()` and `status()`.
 * @param {number} [maxLen]  Stop after this many hops (at least one hop is recorded when any exists).
 * @returns {Array<{from: string, to: string, status: number}>} Empty for non-document requests.
 */
function buildRedirectChainForResponse(resp, maxLen = 50) {
  const request = resp.request();
  if (request.resourceType() !== 'document') return [];
  const code = resp.status();
  const hops = [];
  let target = request.url();
  for (let source = request.redirectedFrom(); source; source = source.redirectedFrom()) {
    const sourceUrl = source.url();
    // Prepending keeps the list in oldest-first order without a final reverse.
    hops.unshift({ from: sourceUrl, to: target, status: code });
    target = sourceUrl;
    if (hops.length >= maxLen) break;
  }
  return hops;
}
// ---------- Quiet network window ----------
/**
 * Resolve once the network has been idle for `quietMs`, or after `timeoutMs`.
 *
 * Polls every 100 ms. "Idle" means `inflightRef.value` is 0 and
 * `lastChangeRef.value` (a Date.now() timestamp) is at least `quietMs` old.
 *
 * @param {object} opts
 * @param {{value: number}} opts.inflightRef    Number of requests in flight.
 * @param {{value: number}} opts.lastChangeRef  Timestamp of the last network event.
 * @param {number} opts.timeoutMs               Upper bound for the wait.
 * @param {number} opts.quietMs                 Required idle duration.
 * @returns {Promise<void>}
 */
async function quietWindowWait({ inflightRef, lastChangeRef, timeoutMs, quietMs }) {
  const pause = ms => new Promise(resolve => setTimeout(resolve, ms));
  const startedAt = Date.now();
  for (;;) {
    if (!(Date.now() - startedAt < timeoutMs)) return;
    const idleFor = Date.now() - lastChangeRef.value;
    const settled = idleFor >= quietMs && inflightRef.value === 0;
    if (settled) return;
    await pause(100);
  }
}
// ---------- Core scan with Playwright ----------
/**
 * Load `startUrl` in a fresh browser context and collect every hostname the
 * page sends a request to.
 *
 * Document navigations are intercepted so the number of HTTP redirects is
 * capped at MAX_REDIRECT_STEPS (an over-long chain is turned into a synthetic
 * 508 response). After `domcontentloaded` the function waits until the network
 * has been quiet for QUIET_WINDOW_MS, bounded by NAV_TIMEOUT_MS.
 *
 * @param {string} originDomain   Domain being scanned (not read here; kept for callers).
 * @param {string} startUrl       URL to navigate to.
 * @param {object} [contextOpts]  Extra options merged into `browser.newContext()`.
 * @returns {Promise<{finalUrl: string, relatedDomains: string[], redirectChain: object[]}>}
 * @throws when navigation fails or the redirect limit is exceeded. The
 *         browser context is closed on every path.
 */
async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) {
  const b = await ensureBrowser();
  const context = await b.newContext({ acceptDownloads: true, ...contextOpts });

  // route.continue() rejects once a route is already handled or the page is gone;
  // that must never escape the route handler as an unhandled rejection.
  const continueQuietly = async route => {
    try {
      await route.continue();
    } catch (e) {
      log.debug(`[ROUTE] continue failed for ${route.request().url()}: ${e?.message || e}`);
    }
  };

  const seenDomains = new Set();
  const redirectLog = [];
  // Requests that have started but neither answered nor failed yet.
  const pending = new Set();
  const inflightRef = { value: 0 };
  const lastChangeRef = { value: Date.now() };

  const settle = req => {
    pending.delete(req);
    inflightRef.value = pending.size;
    lastChangeRef.value = Date.now();
  };
  const onReq = req => {
    pending.add(req);
    inflightRef.value = pending.size;
    lastChangeRef.value = Date.now();
    const d = extractDomain(req.url());
    if (d) seenDomains.add(d);
    log.debug(`[REQ] ${req.method()} ${req.url()}`);
  };
  const onResp = resp => {
    settle(resp.request());
    const d = extractDomain(resp.url());
    if (d) seenDomains.add(d);
    const status = resp.status();
    log.debug(`[RESP] ${status} ${resp.url()}`);
    if (status >= 300 && status < 400 && resp.request().resourceType() === 'document') {
      const piece = buildRedirectChainForResponse(resp, MAX_REDIRECT_STEPS + 5);
      redirectLog.push(...piece);
    }
  };
  // Failed requests never produce a 'response' event; without this the in-flight
  // count would stay above zero and the quiet-window wait would run to its timeout.
  const onReqFailed = req => {
    settle(req);
    log.debug(`[REQ.FAIL] ${req.url()} reason=${req.failure()?.errorText}`);
  };

  let page;
  try {
    // Redirect limiter for document navigations; everything else passes through.
    await context.route('**', async route => {
      const request = route.request();
      if (!(request.resourceType() === 'document' && request.isNavigationRequest())) {
        return continueQuietly(route);
      }
      let resp;
      try {
        resp = await route.fetch({ maxRedirects: MAX_REDIRECT_STEPS });
      } catch (e) {
        const msg = String(e?.message || '');
        if (/redirect/i.test(msg) || /too many/i.test(msg)) {
          try {
            await route.fulfill({
              status: 508,
              contentType: 'text/plain',
              body: 'Loop Detected: too many redirects'
            });
            return;
          } catch (e2) {
            log.debug(`[ROUTE] fulfill(508) failed for ${request.url()}: ${e2?.message || e2}`);
          }
        } else {
          log.debug(`[ROUTE] fetch failed for ${request.url()}: ${msg}`);
        }
        return continueQuietly(route);
      }
      try {
        const status = resp.status();
        const headers = resp.headers();
        const body = await resp.body().catch(() => null);
        await route.fulfill({ status, headers, body });
      } catch (e) {
        log.debug(`[ROUTE] fulfill failed for ${request.url()}: ${e?.message || e}`);
        await continueQuietly(route);
      }
    });

    page = await context.newPage();
    if (DEBUG_ENABLED) {
      page.on('console', msg => log.debug(`[PAGE.CONSOLE] ${msg.type()}: ${msg.text()}`));
      page.on('pageerror', err => log.debug(`[PAGE.ERROR] ${err?.message}`));
    }
    // Downloads are of no interest: cancel them instead of waiting for completion.
    page.on('download', async dl => {
      log.debug(`[SCAN] Download ignored: ${dl.url()}`);
      try { await dl.cancel(); } catch {}
    });
    page.on('request', onReq);
    page.on('response', onResp);
    page.on('requestfailed', onReqFailed);

    log.info(`[SCAN] goto(${startUrl}) domcontentloaded timeout=${NAV_TIMEOUT_MS}`);
    let response;
    try {
      response = await page.goto(startUrl, { waitUntil: 'domcontentloaded', timeout: NAV_TIMEOUT_MS });
    } catch (e) {
      const msg = String(e?.message || '');
      if (/Download is starting/i.test(msg)) {
        log.info(`[SCAN] goto triggered download; continue as non-HTML`);
      } else {
        throw e;
      }
    }
    // 508 is the synthetic status produced by the redirect limiter above.
    if (response && response.status && response.status() === 508) {
      throw new Error(`Too many redirects (${MAX_REDIRECT_STEPS})`);
    }

    await quietWindowWait({ inflightRef, lastChangeRef, timeoutMs: NAV_TIMEOUT_MS, quietMs: QUIET_WINDOW_MS });

    const finalUrl = page.url();
    const steps = redirectLog.length;
    if (steps > MAX_REDIRECT_STEPS) throw new Error(`Too many redirects (${steps})`);

    const relatedDomains = Array.from(seenDomains)
      .filter(d => !d.includes('doubleclick') && !d.includes('google'))
      .sort();
    log.info(`[SCAN] Done finalUrl=${finalUrl} domains=${relatedDomains.length} redirects=${steps}`);
    return { finalUrl, relatedDomains, redirectChain: redirectLog };
  } catch (e) {
    // A crashed browser is dropped so the next scan launches a fresh one.
    try {
      if (browser && typeof browser.isConnected === 'function' && !browser.isConnected()) {
        await browser.close(); browser = null;
      }
    } catch {}
    log.error(`[SCAN] Error: ${e?.message}`);
    throw e;
  } finally {
    if (page) {
      page.off('request', onReq);
      page.off('response', onResp);
      page.off('requestfailed', onReqFailed);
    }
    try { await context.close(); } catch {}
  }
}
// ---------- High-level scan with precheck and escalation ----------
/**
 * Scan one domain: run the HTTP precheck, then the browser scan unless the
 * precheck already classified the target as an attachment / non-HTML resource.
 *
 * @param {string} originDomain  Normalized hostname to scan.
 * @returns {Promise<object>} `{ finalUrl, relatedDomains, redirectChain }`, plus a
 *   `precheck` label when the browser scan was skipped or failed. Never rejects
 *   because of a browser failure; that case yields a fallback result.
 */
async function scanDomainOnce(originDomain) {
  const startUrl = `https://${originDomain}`;
  log.info(`[SCAN] Start domain="${originDomain}" url=${startUrl}`);

  // Result used whenever no browser data is available.
  const fallback = (finalUrl, precheck) => ({
    finalUrl,
    relatedDomains: [originDomain],
    redirectChain: [],
    precheck,
  });

  const pre = await precheckFollowManually(startUrl);
  const reason = pre.reason || '';

  const isPlainFile = pre.reason === 'attachment' || reason.startsWith('non-HTML');
  if (pre.skip && isPlainFile) {
    log.info(`[SCAN] Skip non-HTML/attachment: ${pre.reason}`);
    return fallback(pre.finalUrl || startUrl, pre.reason);
  }

  let targetUrl = startUrl;
  const isMarketing = Boolean(pre.skip && reason.startsWith('marketing-redirect') && pre.finalUrl);
  if (isMarketing) {
    log.info(`[SCAN] Marketing redirect -> follow target in browser: ${pre.finalUrl}`);
    targetUrl = pre.finalUrl;
  } else if (pre.skip && pre.tryBrowser) {
    log.info(`[SCAN] Escalation to browser due to ${pre.reason}`);
  }

  const contextOpts = {
    userAgent:
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    locale: 'en-US',
    timezoneId: 'UTC',
  };

  try {
    const scanned = await scanWithBrowser(originDomain, targetUrl, contextOpts);
    // The scanned domain always leads the list, even when the filter dropped it.
    const alreadyListed = scanned.relatedDomains.includes(originDomain);
    if (!alreadyListed) scanned.relatedDomains.unshift(originDomain);
    return scanned;
  } catch (e) {
    log.warn(`[SCAN] Browser escalation failed: ${e?.message}`);
    return fallback(targetUrl, pre.reason || 'blocked');
  }
}
// ---------- Cache helpers ----------
/**
 * Look up a non-expired cache entry for `domain`.
 *
 * @param {string} domain  Normalized hostname (cache key).
 * @returns {object|null} `{ relatedDomains, finalUrl, redirectChain, cached, cachedAt, ttlAt }`,
 *   or null when there is no row, the row has expired, or its JSON cannot be parsed.
 */
function getFromCache(domain) {
  const row = stmtSelect.get(domain);
  if (!row) return null;

  const nowSec = Math.floor(Date.now() / 1000);
  const stillFresh = row.ttl_at > nowSec;
  if (!stillFresh) return null;

  try {
    const relatedDomains = JSON.parse(row.result_json);
    const redirectChain = row.redirect_chain_json ? JSON.parse(row.redirect_chain_json) : [];
    return {
      relatedDomains,
      finalUrl: row.final_url || null,
      redirectChain,
      cached: true,
      cachedAt: row.updated_at,
      ttlAt: row.ttl_at,
    };
  } catch (e) {
    log.warn(`[CACHE] Parse error: ${e?.message}`);
    return null;
  }
}
/**
 * Store a scan result for `domain`, valid for CACHE_TTL_SECONDS from now.
 *
 * Best effort: serialization or database errors are logged, never thrown.
 *
 * @param {string} domain  Normalized hostname (cache key).
 * @param {object} result  Scan result with `relatedDomains`, `finalUrl` and `redirectChain`.
 */
function putToCache(domain, result) {
  const writtenAt = Math.floor(Date.now() / 1000);
  const expiresAt = writtenAt + CACHE_TTL_SECONDS;
  try {
    const row = {
      domain,
      result_json: JSON.stringify(result.relatedDomains || []),
      final_url: result.finalUrl || null,
      redirect_chain_json: JSON.stringify(result.redirectChain || []),
      updated_at: writtenAt,
      ttl_at: expiresAt,
    };
    stmtUpsert.run(row);
    log.info(`[CACHE] Upsert ${domain} ttlAt=${expiresAt}`);
  } catch (e) {
    log.warn(`[CACHE] Upsert error: ${e?.message}`);
  }
}
// ---------- Routes ----------
/**
 * GET /domains?domain=<host or URL>
 *
 * Responds with the hostnames contacted while loading the domain.
 *   400 BAD_DOMAIN – missing or invalid `domain` parameter
 *   200            – result from cache or a fresh scan (`status`: ok | skipped | blocked)
 *   403 / 500      – scan error
 *   504 TIMEOUT    – no answer within HARD_TIMEOUT_MS (default 70000 ms)
 */
app.get('/domains', async (req, res) => {
  res.type('application/json');
  const raw = req.query.domain;
  log.info(`[HTTP] /domains?domain=${raw}`);
  const domain = normalizeDomain(raw);
  if (!domain) {
    res.status(400).json({ error: '"domain" must be a valid hostname', code: 'BAD_DOMAIN' });
    return;
  }

  // The hard timeout may already have answered; a second send would throw
  // "headers already sent", so every reply goes through this guard.
  const reply = (status, payload) => {
    if (res.headersSent) return;
    res.status(status).json(payload);
  };

  const parsedTimeout = parseInt(process.env.HARD_TIMEOUT_MS || '70000', 10);
  // A NaN delay would make setTimeout fire almost immediately.
  const HARD_TIMEOUT = Number.isFinite(parsedTimeout) && parsedTimeout > 0 ? parsedTimeout : 70000;
  const hardTimer = setTimeout(() => {
    try { reply(504, { error: 'Gateway Timeout', code: 'TIMEOUT' }); } catch {}
  }, HARD_TIMEOUT);

  try {
    const cached = getFromCache(domain);
    if (cached) {
      log.info(`[HTTP] Cache HIT ${domain}`);
      reply(200, {
        domain,
        finalUrl: cached.finalUrl,
        relatedDomains: cached.relatedDomains,
        redirectChain: cached.redirectChain,
        cached: true,
        cachedAt: cached.cachedAt,
        ttlAt: cached.ttlAt,
        status: 'ok'
      });
      return;
    }

    const result = await scanDomainOnce(domain);

    // A precheck label means no browser data is available; such results are not cached.
    if (result.precheck) {
      const base = {
        domain,
        finalUrl: result.finalUrl || `https://${domain}`,
        relatedDomains: [domain],
        redirectChain: [],
        cached: false,
      };
      if ((result.precheck || '').startsWith('marketing-redirect')) {
        reply(200, { ...base, status: 'ok', note: result.precheck });
        return;
      }
      reply(200, {
        ...base,
        status: (result.precheck === 'forbidden' || result.precheck === 'blocked') ? 'blocked' : 'skipped',
        reason: result.precheck
      });
      return;
    }

    // Cache even when the client already got a 504: the next request benefits.
    putToCache(domain, result);
    reply(200, {
      domain,
      finalUrl: result.finalUrl,
      relatedDomains: result.relatedDomains,
      redirectChain: result.redirectChain,
      cached: false,
      status: 'ok'
    });
  } catch (e) {
    const msg = String(e?.message || 'Internal error');
    log.error(`[HTTP] Error for ${domain}: ${msg}`);
    const forbidden = /403|forbidden|blocked/i.test(msg);
    reply(forbidden ? 403 : 500, {
      error: forbidden ? 'Forbidden' : 'Internal server error',
      code: forbidden ? 'FORBIDDEN' : 'INTERNAL',
      details: msg
    });
  } finally {
    clearTimeout(hardTimer);
  }
});
/**
 * GET /health – liveness probe; always answers `{"ok": true}`.
 */
app.get('/health', (_req, res) => {
  res.type('application/json');
  res.json({ ok: true });
});
// ---------- Signals ----------
// On SIGTERM / SIGINT: log the signal, close the shared browser (best effort), exit with code 0.
for (const signal of ['SIGTERM', 'SIGINT']) {
  process.on(signal, async () => {
    log.info(`[SIGNAL] ${signal}`);
    try {
      if (browser) await browser.close();
    } catch {}
    process.exit(0);
  });
}
// ---------- Start ----------
// Bind the HTTP server to PORT and announce it once listening.
app.listen(PORT, function onListening() {
  log.info(`Domain scanner service listening on port ${PORT}`);
});