diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index 229f78d..0000000 --- a/.dockerignore +++ /dev/null @@ -1,13 +0,0 @@ -* - -!package.json - -!package-lock.json - -!server.js - -!ignore-domains.txt - -!LICENSE - -!README.md diff --git a/.forgejo/workflows/docker-build-push.yaml b/.forgejo/workflows/docker-build-push.yaml new file mode 100644 index 0000000..340cceb --- /dev/null +++ b/.forgejo/workflows/docker-build-push.yaml @@ -0,0 +1,42 @@ +name: Build and Push Docker Image + +on: + push: + branches: + - main + paths: + - '**/*' # Триггер при любом изменении репозитория + +jobs: + build-and-push: + runs-on: docker + + container: + image: docker:24.0.1 + + steps: + # Установка Docker CLI (если не в базовом образе) + - name: Setup Docker CLI + run: | + apk add --no-cache docker-cli + + # Авторизация в Docker Hub - токен необходимо добавить в Secrets + - name: Login to Docker Hub + env: + DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} + DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + run: | + echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin + + # Клонирование исходников — встроено в Forgejo Actions + + # Сборка Docker образа + - name: Build Docker Image + run: | + docker build -t ${DOCKER_USERNAME}/playwright-domain-scanner:latest . + + # Push образ на Docker Hub + - name: Push Docker Image + run: | + docker push ${DOCKER_USERNAME}/playwright-domain-scanner:latest + diff --git a/.github/workflows/docker-build-push.yaml b/.github/workflows/docker-build-push.yaml deleted file mode 100644 index be6ac82..0000000 --- a/.github/workflows/docker-build-push.yaml +++ /dev/null @@ -1,28 +0,0 @@ -name: Build and Push Docker Image - -on: - push: - branches: - - main - -jobs: - build-and-push: - runs-on: ubuntu-latest - environment: dockerhub - - steps: - - name: Checkout the repository - uses: actions/checkout@v3 - - - name: Log in to Docker Hub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} - - - name: Build and push Docker image - uses: docker/build-push-action@v4 - with: - push: true - tags: ${{ secrets.DOCKER_USERNAME }}/gekata:latest - diff --git a/Dockerfile b/Dockerfile index 628bd48..f95e58b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,65 +1,46 @@ -# -------- Builder stage -------- -FROM debian:bookworm-slim AS builder -ENV DEBIAN_FRONTEND=noninteractive +# Use official Node.js LTS base image +FROM node:20-slim -# Node + build tools for native modules (better-sqlite3) -RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates curl gnupg \ - nodejs npm \ - python3 make g++ pkg-config libsqlite3-dev \ - && rm -rf /var/lib/apt/lists/* +# Install dependencies for running Chromium +RUN apt-get update && apt-get install -y \ + ca-certificates \ + fonts-liberation \ + libasound2 \ + libatk-bridge2.0-0 \ + libatk1.0-0 \ + libcups2 \ + libdbus-1-3 \ + libdrm2 \ + libgbm1 \ + libgtk-3-0 \ + libnspr4 \ + libnss3 \ + libx11-xcb1 \ + libxcomposite1 \ + libxdamage1 \ + libxrandr2 \ + xdg-utils \ + wget \ + --no-install-recommends && \ + rm -rf /var/lib/apt/lists/* -WORKDIR /app +# Set working directory +WORKDIR /usr/src/app -# Copy only manifests first to leverage Docker cache +# Copy package files and install dependencies COPY package*.json ./ +COPY ignore-domains.txt ./ +RUN npm ci -# Install production deps (build native modules here) -ENV CI=true -RUN npm ci --omit=dev +# Install Playwright browsers (Chromium) +RUN npx playwright install chromium -# Copy source +# Copy app sources COPY . . -# -------- Runtime stage -------- -FROM debian:bookworm-slim -ENV DEBIAN_FRONTEND=noninteractive - -# Install tini for proper PID 1 and signal handling -# Install Node.js runtime, Chromium and minimal libs -RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates curl gnupg \ - tini \ - nodejs npm \ - chromium \ - libx11-6 libxcomposite1 libxdamage1 libxrandr2 libxkbcommon0 \ - libgtk-3-0 libnss3 libdrm2 libgbm1 libasound2 fonts-liberation \ - fonts-dejavu-core \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /app - -# Copy node_modules and app from builder -COPY --from=builder /app/node_modules /app/node_modules -COPY --from=builder /app/package*.json /app/ -COPY . . - -# Security: drop root -RUN useradd -ms /bin/bash nodeuser && chown -R nodeuser:nodeuser /app -USER nodeuser - -# Environment -ENV PORT=3000 \ - PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 \ - PLAYWRIGHT_BROWSERS_PATH=0 \ - CHROMIUM_PATH=/usr/bin/chromium \ - CACHE_TTL_SECONDS=21600 - +# Expose port EXPOSE 3000 -# Use tini as PID 1 so we don't need `--init` -ENTRYPOINT ["/usr/bin/tini", "--"] - -# Start the service -CMD ["node", "server.js"] +# Run the service +CMD ["npm", "start"] diff --git a/README.md b/README.md index 874aedf..2990f20 100644 --- a/README.md +++ b/README.md @@ -1,117 +1,3 @@ -**Gekata** — это легковесный сервис на Node.js для извлечения «связных доменов» с веб-страниц, запускаемый в контейнере Debian с Chromium; он сначала делает предзапросы с ручным следованием редиректам, затем при необходимости поднимает безголовый браузер, ограничивает глубину редиректов и кэширует результаты в SQLite через better-sqlite3. Сервис предоставляет HTTP API /domains, возвращающее финальный URL, цепочку редиректов и список связанных доменов, а также /health для проверки готовности. +# gekata -### Назначение - -Gekata сканирует заданный домен, разрешает маркетинговые и другие редиректы до целевой HTML-страницы, загружает её в безголовом Chromium и собирает множество доменных имён из всех сетевых запросов страницы, формируя список «связных доменов» для анализа интеграций, трекинга и CDN. Такой подход работает и для динамических сайтов с клиентским рендерингом. - -### Архитектура - -- Веб-сервер на Express предоставляет REST‑маршруты, принимает домен, валидирует его и инициирует сканирование, обрабатывая таймауты и коды ошибок. -- Предпроверка делает GET с ручным управлением редиректами, классифицируя сценарии: форс‑редиректы «маркетинга», запреты 403, скачивания и не‑HTML контент, чтобы экономить запуск браузера. -- Эскалация в безголовый Chromium (через Playwright) выполняет навигацию, применяя ограничение глубины редиректов только для документных переходов и ожидая «тихое окно» сети для стабильного сбора доменов. -- Кэширование результатов в SQLite с TTL ускоряет повторные запросы; используется лучшее для продакшна подключение better-sqlite3 и WAL‑журналирование для устойчивости. - - -### Потоки данных - -- Вход: GET /domains?domain= — принимает хост или URL, нормализует до ASCII/Punycode и формирует стартовый https:// URL. -- Предобработка: ручной обход 3xx с ограничением шагов; детекция «похожих на файл» ссылок и контента non‑HTML; маркетинговый редирект помечается и может быть целевой. -- Сканирование браузером: навигация на целевой URL, слежение за запросами/ответами страницы, сбор доменов из всех сетевых событий, исключая шум (google/doubleclick по эвристике), построение цепочки редиректов для документной навигации. -- Выход: JSON с finalUrl, relatedDomains[], redirectChain[], статусами ok/skipped/blocked и служебными пометками (cached, ttl). - - -### API - -- GET /domains -Параметры: domain — доменное имя или URL. -Ответ 200 ok: - - domain: нормализованный запрошенный домен. - - finalUrl: конечный URL после редиректов/навигации. - - relatedDomains: уникальные домены, замеченные при загрузке страницы. - - redirectChain: массив { from, to, status } для документных 3xx. - - cached: true/false, cachedAt, ttlAt. - - status: ok | skipped | blocked; дополнительные note/reason при skip/blocked. -- GET /health — простой JSON { ok: true } для readiness/liveness. - - -### Обработка редиректов - -- На этапе предпроверки ограничение PRECHECK_MAX_REDIRECTS предотвращает бесконечные цепочки до запуска браузера; 403 заставляет эскалировать в браузер, non‑HTML/attachment возвращают немедленный ответ. -- В браузере включён маршрут‑ограничитель только для документной навигации: запросы навигации обрабатываются с maxRedirects, ассеты идут без ограничений, чтобы не ломать рендеринг. -- Если лимит превышен, навигация завершается контролируемо и возвращается ошибка «Too many redirects», переводимая в понятный статус ответа API. - - -### Кэш и TTL - -- SQLite таблица domain_cache хранит: домен, JSON списка доменов, финальный URL, цепочку редиректов, время обновления и ttl_at. -- Повторные обращения до истечения TTL возвращают сохранённый результат без запуска браузера, снижая задержки и нагрузку. - - -### Контейнеризация - -- Образ состоит из двух стадий: builder и runtime, обе на debian:bookworm-slim. -- Стадия builder устанавливает Node.js, компилятор и заголовки SQLite для сборки native‑модуля better‑sqlite3, затем выполняет npm ci с пропуском dev‑зависимостей и копирует исходники. -- Стадия runtime устанавливает tini как корректный PID 1, Node.js runtime, системный Chromium и минимальный набор X/GTK/NSS/GBM/шрифтов, необходимых для безголового режима; копируются node_modules и исходники из builder. -- Создаётся непривилегированный пользователь nodeuser; директория приложения принадлежит ему; сервис запускается не от root. - - -### Переменные окружения - -- PORT — порт HTTP сервера (по умолчанию 3000). -- CHROMIUM_PATH — путь к системному Chromium (/usr/bin/chromium в контейнере). -- CACHE_TTL_SECONDS — срок жизни кэша (по умолчанию 6 часов). -- HARD_TIMEOUT_MS — жёсткий таймаут обработки HTTP‑запроса (по умолчанию 70 секунд). -- MAX_REDIRECT_STEPS — максимальная глубина редиректов для документной навигации (по умолчанию 20). -- NAV_TIMEOUT_MS, QUIET_WINDOW_MS — таймауты навигации и «тихого окна» сети. -- DEBUG — включает подробные логи страницы/сетевых событий при значении 1. - - -### Безопасность и устойчивость - -- tini как init обрабатывает сигналы и «зомби» процессы; контейнер корректно завершает Chromium по SIGTERM/SIGINT, предотвращая утечки. -- Запуск под непривилегированным пользователем снижает риск компрометации; Chromium стартует с флагами no‑sandbox/disable‑setuid-sandbox, что совместимо с безпривилегированным окружением контейнеров. -- Ограничение редиректов для документных переходов устраняет зацикливание «маркетинговых» и неверных конфигураций, не влияя на загрузку ассетов. - - -### Производительность - -- npm ci в builder‑стадии плюс копирование package*.json до исходников задействуют кэш слоёв Docker, ускоряя сборки. -- better‑sqlite3 с синхронными подготовленными выражениями обеспечивает быстрый локальный кэш без отдельного сервиса БД. -- Предпроверка HTTP избавляет от лишних подъёмов браузера для не‑HTML или «прикреплённых» ответов. - - -### Сборка и запуск - -- Сборка образа: - - docker build -t gekata:latest . -- Запуск контейнера: - - docker run --rm -p 3000:3000 -e CACHE_TTL_SECONDS=21600 -e MAX_REDIRECT_STEPS=20 gekata:latest -- Примеры запросов: - - curl -s "http://localhost:3000/health" - - curl -s "http://localhost:3000/domains?domain=forum.xda-developers.com" - - -### Журналирование и диагностика - -- Лог‑метки [BOOT], [HTTP], [SCAN], [BROWSER], [CACHE], [SIGNAL] позволяют быстро локализовать этап и тип события. -- При включённом DEBUG=1 логируются консоль страницы, ошибки, неудавшиеся запросы и сетевые эвенты, что помогает анализировать блокировки, CORS, антибот‑защиту и таймауты. - - -### Ограничения - -- Сайты с жёсткими антибот‑мерами (403/JS‑челленджи) могут быть помечены как blocked или потребовать дополнительной эмуляции (например, иные user‑agent/locale/timezone/proxy). -- Сбор связанных доменов базируется на фактически выполненных сетевых запросах и может меняться при A/B тестах, гео‑таргетинге или различиях по user‑agent. - - -### Расширения и доработки - -- Добавить белый/чёрный список доменов, тонкую фильтрацию трекеров и интеграций. -- Вынести кэш в внешний SQLite‑файл через volume для сохранения между рестартами, настроить резервное копирование. -- Параметризовать user‑agent/locale/timezone и добавить поддержку прокси для региональных сценариев. -- Экспортировать полный сетевой журнал и тайминги (HAR‑подобный формат) как опциональную выгрузку. - - -### Файлы проекта - -- server.js — основной сервис, логика API, предобработка, сканирование браузером, кэш, ограничения редиректов, завершение по сигналам. -- Dockerfile — двухстадийная сборка, системный Chromium в рантайме, tini, непривилегированный пользователь, переменные окружения и запуск службы. +Extract all domains from site \ No newline at end of file diff --git a/ignore-domains.txt b/ignore-domains.txt index d6033b8..bf4888e 100644 --- a/ignore-domains.txt +++ b/ignore-domains.txt @@ -1,3 +1,3 @@ doubleclick google -yandex + diff --git a/package-lock.json b/package-lock.json index 3e08c4c..2c45af6 100644 --- a/package-lock.json +++ b/package-lock.json @@ -8,9 +8,7 @@ "name": "playwright-domain-scanner", "version": "1.0.0", "dependencies": { - "better-sqlite3": "^9.0.0", "express": "^4.18.2", - "express-rate-limit": "^8.1.0", "playwright": "^1.42.0" } }, @@ -31,53 +29,6 @@ "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==" }, - "node_modules/base64-js": { - "version": "1.5.1", - "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", - "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ] - }, - "node_modules/better-sqlite3": { - "version": "9.6.0", - "resolved": "https://registry.npmjs.org/better-sqlite3/-/better-sqlite3-9.6.0.tgz", - "integrity": "sha512-yR5HATnqeYNVnkaUTf4bOP2dJSnyhP4puJN/QPRyx4YkBEEUxib422n2XzPqDEHjQQqazoYoADdAm5vE15+dAQ==", - "hasInstallScript": true, - "dependencies": { - "bindings": "^1.5.0", - "prebuild-install": "^7.1.1" - } - }, - "node_modules/bindings": { - "version": "1.5.0", - "resolved": "https://registry.npmjs.org/bindings/-/bindings-1.5.0.tgz", - "integrity": "sha512-p2q/t/mhvuOj/UeLlV6566GD/guowlr0hHxClI0W9m7MWYkL1F0hLo+0Aexs9HSPCtR1SXQ0TD3MMKrXZajbiQ==", - "dependencies": { - "file-uri-to-path": "1.0.0" - } - }, - "node_modules/bl": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz", - "integrity": "sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==", - "dependencies": { - "buffer": "^5.5.0", - "inherits": "^2.0.4", - "readable-stream": "^3.4.0" - } - }, "node_modules/body-parser": { "version": "1.20.3", "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.3.tgz", @@ -101,29 +52,6 @@ "npm": "1.2.8000 || >= 1.4.16" } }, - "node_modules/buffer": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz", - "integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "dependencies": { - "base64-js": "^1.3.1", - "ieee754": "^1.1.13" - } - }, "node_modules/bytes": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", @@ -159,11 +87,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/chownr": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", - "integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==" - }, "node_modules/content-disposition": { "version": "0.5.4", "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.4.tgz", @@ -204,28 +127,6 @@ "ms": "2.0.0" } }, - "node_modules/decompress-response": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz", - "integrity": "sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==", - "dependencies": { - "mimic-response": "^3.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/deep-extend": { - "version": "0.6.0", - "resolved": "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz", - "integrity": "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==", - "engines": { - "node": ">=4.0.0" - } - }, "node_modules/depd": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz", @@ -243,14 +144,6 @@ "npm": "1.2.8000 || >= 1.4.16" } }, - "node_modules/detect-libc": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.4.tgz", - "integrity": "sha512-3UDv+G9CsCKO1WKMGw9fwq/SWJYbI0c5Y7LU1AXYoDdbhE2AHQ6N6Nb34sG8Fj7T5APy8qXDCKuuIHd1BR0tVA==", - "engines": { - "node": ">=8" - } - }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", @@ -277,14 +170,6 @@ "node": ">= 0.8" } }, - "node_modules/end-of-stream": { - "version": "1.4.5", - "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz", - "integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==", - "dependencies": { - "once": "^1.4.0" - } - }, "node_modules/es-define-property": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", @@ -325,14 +210,6 @@ "node": ">= 0.6" } }, - "node_modules/expand-template": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz", - "integrity": "sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==", - "engines": { - "node": ">=6" - } - }, "node_modules/express": { "version": "4.21.2", "resolved": "https://registry.npmjs.org/express/-/express-4.21.2.tgz", @@ -378,28 +255,6 @@ "url": "https://opencollective.com/express" } }, - "node_modules/express-rate-limit": { - "version": "8.1.0", - "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.1.0.tgz", - "integrity": "sha512-4nLnATuKupnmwqiJc27b4dCFmB/T60ExgmtDD7waf4LdrbJ8CPZzZRHYErDYNhoz+ql8fUdYwM/opf90PoPAQA==", - "dependencies": { - "ip-address": "10.0.1" - }, - "engines": { - "node": ">= 16" - }, - "funding": { - "url": "https://github.com/sponsors/express-rate-limit" - }, - "peerDependencies": { - "express": ">= 4.11" - } - }, - "node_modules/file-uri-to-path": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz", - "integrity": "sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==" - }, "node_modules/finalhandler": { "version": "1.3.1", "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.3.1.tgz", @@ -433,11 +288,6 @@ "node": ">= 0.6" } }, - "node_modules/fs-constants": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", - "integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==" - }, "node_modules/fsevents": { "version": "2.3.2", "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", @@ -494,11 +344,6 @@ "node": ">= 0.4" } }, - "node_modules/github-from-package": { - "version": "0.0.0", - "resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz", - "integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==" - }, "node_modules/gopd": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", @@ -558,43 +403,11 @@ "node": ">=0.10.0" } }, - "node_modules/ieee754": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", - "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ] - }, "node_modules/inherits": { "version": "2.0.4", "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" }, - "node_modules/ini": { - "version": "1.3.8", - "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz", - "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==" - }, - "node_modules/ip-address": { - "version": "10.0.1", - "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.0.1.tgz", - "integrity": "sha512-NWv9YLW4PoW2B7xtzaS3NCot75m6nK7Icdv0o3lfMceJVRfSoQwqD4wEH5rLwoKJwUiZ/rfpiVBhnaF0FK4HoA==", - "engines": { - "node": ">= 12" - } - }, "node_modules/ipaddr.js": { "version": "1.9.1", "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz", @@ -665,40 +478,11 @@ "node": ">= 0.6" } }, - "node_modules/mimic-response": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz", - "integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/minimist": { - "version": "1.2.8", - "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", - "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/mkdirp-classic": { - "version": "0.5.3", - "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", - "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==" - }, "node_modules/ms": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==" }, - "node_modules/napi-build-utils": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-2.0.0.tgz", - "integrity": "sha512-GEbrYkbfF7MoNaoh2iGG84Mnf/WZfB0GdGEsM8wz7Expx/LlWf5U8t9nvJKXSp3qr5IsEbK04cBGhol/KwOsWA==" - }, "node_modules/negotiator": { "version": "0.6.3", "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.3.tgz", @@ -707,17 +491,6 @@ "node": ">= 0.6" } }, - "node_modules/node-abi": { - "version": "3.77.0", - "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.77.0.tgz", - "integrity": "sha512-DSmt0OEcLoK4i3NuscSbGjOf3bqiDEutejqENSplMSFA/gmB8mkED9G4pKWnPl7MDU4rSHebKPHeitpDfyH0cQ==", - "dependencies": { - "semver": "^7.3.5" - }, - "engines": { - "node": ">=10" - } - }, "node_modules/object-inspect": { "version": "1.13.4", "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz", @@ -740,14 +513,6 @@ "node": ">= 0.8" } }, - "node_modules/once": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", - "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", - "dependencies": { - "wrappy": "1" - } - }, "node_modules/parseurl": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", @@ -789,31 +554,6 @@ "node": ">=18" } }, - "node_modules/prebuild-install": { - "version": "7.1.3", - "resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.3.tgz", - "integrity": "sha512-8Mf2cbV7x1cXPUILADGI3wuhfqWvtiLA1iclTDbFRZkgRQS0NqsPZphna9V+HyTEadheuPmjaJMsbzKQFOzLug==", - "dependencies": { - "detect-libc": "^2.0.0", - "expand-template": "^2.0.3", - "github-from-package": "0.0.0", - "minimist": "^1.2.3", - "mkdirp-classic": "^0.5.3", - "napi-build-utils": "^2.0.0", - "node-abi": "^3.3.0", - "pump": "^3.0.0", - "rc": "^1.2.7", - "simple-get": "^4.0.0", - "tar-fs": "^2.0.0", - "tunnel-agent": "^0.6.0" - }, - "bin": { - "prebuild-install": "bin.js" - }, - "engines": { - "node": ">=10" - } - }, "node_modules/proxy-addr": { "version": "2.0.7", "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", @@ -826,15 +566,6 @@ "node": ">= 0.10" } }, - "node_modules/pump": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.3.tgz", - "integrity": "sha512-todwxLMY7/heScKmntwQG8CXVkWUOdYxIvY2s0VWAAMh/nd8SoYiRaKjlr7+iCs984f2P8zvrfWcDDYVb73NfA==", - "dependencies": { - "end-of-stream": "^1.1.0", - "once": "^1.3.1" - } - }, "node_modules/qs": { "version": "6.13.0", "resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz", @@ -871,33 +602,6 @@ "node": ">= 0.8" } }, - "node_modules/rc": { - "version": "1.2.8", - "resolved": "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz", - "integrity": "sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==", - "dependencies": { - "deep-extend": "^0.6.0", - "ini": "~1.3.0", - "minimist": "^1.2.0", - "strip-json-comments": "~2.0.1" - }, - "bin": { - "rc": "cli.js" - } - }, - "node_modules/readable-stream": { - "version": "3.6.2", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", - "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", - "dependencies": { - "inherits": "^2.0.3", - "string_decoder": "^1.1.1", - "util-deprecate": "^1.0.1" - }, - "engines": { - "node": ">= 6" - } - }, "node_modules/safe-buffer": { "version": "5.2.1", "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", @@ -922,17 +626,6 @@ "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" }, - "node_modules/semver": { - "version": "7.7.2", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz", - "integrity": "sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==", - "bin": { - "semver": "bin/semver.js" - }, - "engines": { - "node": ">=10" - } - }, "node_modules/send": { "version": "0.19.0", "resolved": "https://registry.npmjs.org/send/-/send-0.19.0.tgz", @@ -1056,49 +749,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/simple-concat": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz", - "integrity": "sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ] - }, - "node_modules/simple-get": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/simple-get/-/simple-get-4.0.1.tgz", - "integrity": "sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "dependencies": { - "decompress-response": "^6.0.0", - "once": "^1.3.1", - "simple-concat": "^1.0.0" - } - }, "node_modules/statuses": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz", @@ -1107,48 +757,6 @@ "node": ">= 0.8" } }, - "node_modules/string_decoder": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", - "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", - "dependencies": { - "safe-buffer": "~5.2.0" - } - }, - "node_modules/strip-json-comments": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz", - "integrity": "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/tar-fs": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.3.tgz", - "integrity": "sha512-090nwYJDmlhwFwEW3QQl+vaNnxsO2yVsd45eTKRBzSzu+hlb1w2K9inVq5b0ngXuLVqQ4ApvsUHHnu/zQNkWAg==", - "dependencies": { - "chownr": "^1.1.1", - "mkdirp-classic": "^0.5.2", - "pump": "^3.0.0", - "tar-stream": "^2.1.4" - } - }, - "node_modules/tar-stream": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz", - "integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==", - "dependencies": { - "bl": "^4.0.3", - "end-of-stream": "^1.4.1", - "fs-constants": "^1.0.0", - "inherits": "^2.0.3", - "readable-stream": "^3.1.1" - }, - "engines": { - "node": ">=6" - } - }, "node_modules/toidentifier": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.1.tgz", @@ -1157,17 +765,6 @@ "node": ">=0.6" } }, - "node_modules/tunnel-agent": { - "version": "0.6.0", - "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz", - "integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==", - "dependencies": { - "safe-buffer": "^5.0.1" - }, - "engines": { - "node": "*" - } - }, "node_modules/type-is": { "version": "1.6.18", "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz", @@ -1188,11 +785,6 @@ "node": ">= 0.8" } }, - "node_modules/util-deprecate": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", - "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==" - }, "node_modules/utils-merge": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", @@ -1208,11 +800,7 @@ "engines": { "node": ">= 0.8" } - }, - "node_modules/wrappy": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", - "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==" } } } + diff --git a/package.json b/package.json index a69fb55..da44098 100644 --- a/package.json +++ b/package.json @@ -7,9 +7,8 @@ "start": "node server.js" }, "dependencies": { - "better-sqlite3": "^9.0.0", "express": "^4.18.2", - "express-rate-limit": "^8.1.0", "playwright": "^1.42.0" } } + diff --git a/server.js b/server.js index bdd242b..f6ef7fe 100644 --- a/server.js +++ b/server.js @@ -1,518 +1,56 @@ -// server.js const express = require('express'); const { chromium } = require('playwright'); -const Database = require('better-sqlite3'); -// Убираем punycode; используем WHATWG URL + domainToASCII -const { URL, domainToASCII } = require('node:url'); const app = express(); - -// ---------- Config ---------- -const PORT = Number(process.env.PORT || 3000); -const CHROMIUM_PATH = process.env.CHROMIUM_PATH || undefined; -const CACHE_TTL_SECONDS = parseInt(process.env.CACHE_TTL_SECONDS || '21600', 10); -const MAX_REDIRECT_STEPS = parseInt(process.env.MAX_REDIRECT_STEPS || '20', 10); -const NAV_TIMEOUT_MS = parseInt(process.env.NAV_TIMEOUT_MS || '30000', 10); -const QUIET_WINDOW_MS = parseInt(process.env.QUIET_WINDOW_MS || '700', 10); -const PRECHECK_MAX_REDIRECTS = parseInt(process.env.PRECHECK_MAX_REDIRECTS || '15', 10); -const SQLITE_PATH = process.env.SQLITE_PATH || './cache.db'; -const DEBUG_ENABLED = String(process.env.DEBUG || '').trim() === '1'; -const CHROMIUM_ARGS = [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - '--no-zygote', -]; - -// ---------- Logging ---------- -const log = { - info: (...a) => console.log(...a), - debug: (...a) => { if (DEBUG_ENABLED) console.log(...a); }, - warn: (...a) => console.warn(...a), - error: (...a) => console.error(...a), -}; - -// ---------- DB ---------- -log.info(`[BOOT] SQLite path: ${SQLITE_PATH}`); -const db = new Database(SQLITE_PATH); -db.pragma('journal_mode = WAL'); -db.exec(` -CREATE TABLE IF NOT EXISTS domain_cache ( - domain TEXT PRIMARY KEY, - result_json TEXT NOT NULL, - final_url TEXT, - redirect_chain_json TEXT, - updated_at INTEGER NOT NULL, - ttl_at INTEGER NOT NULL -); -`); -const stmtSelect = db.prepare(` - SELECT result_json, final_url, redirect_chain_json, updated_at, ttl_at - FROM domain_cache WHERE domain = ? -`); -const stmtUpsert = db.prepare(` -INSERT INTO domain_cache (domain, result_json, final_url, redirect_chain_json, updated_at, ttl_at) -VALUES (@domain, @result_json, @final_url, @redirect_chain_json, @updated_at, @ttl_at) -ON CONFLICT(domain) DO UPDATE SET - result_json = excluded.result_json, - final_url = excluded.final_url, - redirect_chain_json = excluded.redirect_chain_json, - updated_at = excluded.updated_at, - ttl_at = excluded.ttl_at -`); +const port = process.env.PORT || 3000; app.use(express.json()); -// ---------- Helpers ---------- -function normalizeDomain(input) { - if (!input || typeof input !== 'string') return null; - const s = input.trim().toLowerCase(); - try { - // Если это URL, берём hostname; иначе считаем, что это просто хост - const asUrl = /^https?:\/\//i.test(s) ? s : `https://${s}`; - const u = new URL(asUrl); - // Преобразуем к IDNA ASCII (Punycode) через WHATWG util - const ascii = domainToASCII(u.hostname || ''); - return ascii || null; - } catch { - // Попытка прямой IDNA-конверсии из строки (на случай голого хоста без схемы) - try { - const ascii = domainToASCII(s); - return ascii || null; - } catch { - return null; - } - } -} // WHATWG URL + url.domainToASCII [web:167][web:161][web:164] - function extractDomain(url) { - try { return new URL(url).hostname.toLowerCase(); } catch { return null; } -} - -// эвристика «выглядит как файл» -function looksLikeFilePath(u) { try { - const { pathname } = new URL(u); - return /\.(?:zip|pdf|png|jpe?g|gif|webp|svg|mp4|mp3|wav|csv|xlsx?|docx?|pptx?|exe|deb|rpm|apk|tar(?:\.gz)?|7z|gz|bz2)$/i.test(pathname); - } catch { return false; } -} - -// канонизация URL для детекции петель -function normalizeUrlForLoop(u) { - try { - const x = new URL(u); - x.hash = ''; - return x.toString(); - } catch { return u; } -} - -// ---------- Precheck: manual redirects & classification ---------- -async function precheckFollowManually(startUrl) { - let url = startUrl; - const visited = new Set(); - let sawHtmlHint = false; - for (let i = 0; i < PRECHECK_MAX_REDIRECTS; i++) { - const norm = normalizeUrlForLoop(url); - if (visited.has(norm)) { - log.debug(`[PRECHECK] Loop at ${norm}`); - return { skip: true, reason: 'redirect-loop', tryBrowser: sawHtmlHint }; - } - visited.add(norm); - let res; - try { - res = await fetch(url, { method: 'GET', redirect: 'manual' }); - } catch (e) { - log.debug(`[PRECHECK] GET(manual) failed for ${url}: ${e?.message}`); - return { skip: false, reason: null, tryBrowser: false }; - } - const status = res.status; - const ct = res.headers.get('content-type') || ''; - const cd = res.headers.get('content-disposition') || ''; - const loc = res.headers.get('location') || ''; - log.debug(`[PRECHECK] step=${i} status=${status} ct="${ct}" cd="${cd || '-'}" loc="${loc || '-'}"`); - const isHtml = /\btext\/html\b/i.test(ct); - if (isHtml) sawHtmlHint = true; - const isAttachment = /attachment/i.test(cd); - if (status === 403) { - return { skip: true, reason: 'forbidden', tryBrowser: true }; - } - if (status >= 300 && status < 400 && loc) { - const next = new URL(loc, url).toString(); - if (looksLikeFilePath(next) || /download|file|export/i.test(next)) { - return { skip: true, reason: `redirect-to-file(${next})`, tryBrowser: false, finalUrl: next }; - } - try { - const probe = await fetch(next, { method: 'GET', redirect: 'manual' }); - const pct = probe.headers.get('content-type') || ''; - const isHtmlTarget = /\btext\/html\b/i.test(pct); - if (isHtmlTarget) { - return { skip: true, reason: `marketing-redirect(${next})`, tryBrowser: false, finalUrl: next }; - } - } catch {} - url = next; - continue; - } - if (isAttachment) return { skip: true, reason: 'attachment', tryBrowser: false, finalUrl: url }; - if (!isHtml && ct) return { skip: true, reason: `non-HTML (${ct})`, tryBrowser: false, finalUrl: url }; - return { skip: false, reason: null, tryBrowser: false, finalUrl: url }; - } - log.debug(`[PRECHECK] Too many redirects >= ${PRECHECK_MAX_REDIRECTS}`); - return { skip: true, reason: `redirect-loop(${PRECHECK_MAX_REDIRECTS})`, tryBrowser: sawHtmlHint, finalUrl: null }; -} // [web:167] - -// ---------- Browser lifecycle ---------- -let browser; -async function ensureBrowser() { - if (browser && browser.isConnected()) return browser; - if (browser) { try { await browser.close(); } catch {} } - log.info(`[BROWSER] Launch headless Chromium`); - browser = await chromium.launch({ executablePath: CHROMIUM_PATH, headless: true, args: CHROMIUM_ARGS }); - return browser; -} // [web:151] - -// ---------- Redirect chain builder (document-only) ---------- -function buildRedirectChainForResponse(resp, maxLen = 50) { - const chain = []; - const req = resp.request(); - if (req.resourceType() !== 'document') return chain; - let prev = req.redirectedFrom(); - let toUrl = req.url(); - const status = resp.status(); - while (prev) { - chain.push({ from: prev.url(), to: toUrl, status }); - toUrl = prev.url(); - prev = prev.redirectedFrom(); - if (chain.length >= maxLen) break; - } - return chain.reverse(); -} // [web:151] - -// ---------- Quiet network window ---------- -async function quietWindowWait({ inflightRef, lastChangeRef, timeoutMs, quietMs }) { - const start = Date.now(); - while (Date.now() - start < timeoutMs) { - const quietFor = Date.now() - lastChangeRef.value; - if (inflightRef.value === 0 && quietFor >= quietMs) return; - await new Promise(r => setTimeout(r, 100)); - } -} // [web:151] - -// ---------- Core scan with Playwright ---------- -async function scanWithBrowser(originDomain, startUrl, contextOpts = {}) { - const b = await ensureBrowser(); - const context = await b.newContext({ acceptDownloads: true, ...contextOpts }); - - // Безопасный лимитер редиректов для документной навигации - await context.route('**', async route => { - const request = route.request(); - const isDoc = request.resourceType() === 'document'; - const isNav = request.isNavigationRequest(); - if (!(isDoc && isNav)) return route.continue(); - try { - const resp = await route.fetch({ maxRedirects: MAX_REDIRECT_STEPS }); - const status = resp.status(); - const headers = await resp.headers(); - const body = await resp.body().catch(() => null); - try { - await route.fulfill({ status, headers, body }); - } catch (e) { - log.debug(`[ROUTE] fulfill failed for ${request.url()}: ${e?.message || e}`); - await route.continue(); - } - } catch (e) { - const msg = String(e?.message || ''); - if (/redirect/i.test(msg) || /too many/i.test(msg)) { - try { - await route.fulfill({ - status: 508, - contentType: 'text/plain', - body: 'Loop Detected: too many redirects' - }); - } catch (e2) { - log.debug(`[ROUTE] fulfill(508) failed for ${request.url()}: ${e2?.message || e2}`); - await route.continue(); - } - } else { - log.debug(`[ROUTE] fetch failed for ${request.url()}: ${msg}`); - await route.continue(); - } - } - }); - - const page = await context.newPage(); - - const seenDomains = new Set(); - const redirectLog = []; - const visitedUrls = new Set(); - const inflightRef = { value: 0 }; - const lastChangeRef = { value: Date.now() }; - - if (DEBUG_ENABLED) { - page.on('console', msg => log.debug(`[PAGE.CONSOLE] ${msg.type()}: ${msg.text()}`)); - page.on('pageerror', err => log.debug(`[PAGE.ERROR] ${err?.message}`)); - page.on('requestfailed', req => log.debug(`[REQ.FAIL] ${req.url()} reason=${req.failure()?.errorText}`)); - } - - page.on('download', async dl => { - try { await dl.failure().catch(() => {}); } catch {} - log.debug(`[SCAN] Download ignored: ${dl.url()}`); - }); - - const onReq = req => { - inflightRef.value++; - lastChangeRef.value = Date.now(); - const d = extractDomain(req.url()); - if (d) seenDomains.add(d); - log.debug(`[REQ] ${req.method()} ${req.url()}`); - }; - const onResp = resp => { - inflightRef.value = Math.max(0, inflightRef.value - 1); - lastChangeRef.value = Date.now(); - const d = extractDomain(resp.url()); - if (d) seenDomains.add(d); - const status = resp.status(); - log.debug(`[RESP] ${status} ${resp.url()}`); - if (status >= 300 && status < 400 && resp.request().resourceType() === 'document') { - const piece = buildRedirectChainForResponse(resp, MAX_REDIRECT_STEPS + 5); - redirectLog.push(...piece); - } - }; - page.on('request', onReq); - page.on('response', onResp); - - try { - log.info(`[SCAN] goto(${startUrl}) domcontentloaded timeout=${NAV_TIMEOUT_MS}`); - let response; - try { - response = await page.goto(startUrl, { waitUntil: 'domcontentloaded', timeout: NAV_TIMEOUT_MS }); - } catch (e) { - const msg = String(e?.message || ''); - if (/Download is starting/i.test(msg)) { - log.info(`[SCAN] goto triggered download; continue as non-HTML`); - } else { - throw e; - } - } - - if (response && response.status && response.status() === 508) { - throw new Error(`Too many redirects (${MAX_REDIRECT_STEPS})`); - } - - await quietWindowWait({ inflightRef, lastChangeRef, timeoutMs: NAV_TIMEOUT_MS, quietMs: QUIET_WINDOW_MS }); - const finalUrl = page.url(); - - if (visitedUrls.has(finalUrl)) throw new Error('Redirect loop detected'); - visitedUrls.add(finalUrl); - - const steps = redirectLog.length; - if (steps > MAX_REDIRECT_STEPS) throw new Error(`Too many redirects (${steps})`); - - await context.close(); - - const relatedDomains = Array.from(seenDomains) - .filter(d => !d.includes('doubleclick') && !d.includes('google')) - .sort(); - - log.info(`[SCAN] Done finalUrl=${finalUrl} domains=${relatedDomains.length} redirects=${steps}`); - return { finalUrl, relatedDomains, redirectChain: redirectLog }; - } catch (e) { - try { await context.close(); } catch {} - try { - if (browser && typeof browser.isConnected === 'function' && !browser.isConnected()) { - await browser.close(); browser = null; - } - } catch {} - log.error(`[SCAN] Error: ${e?.message}`); - throw e; - } finally { - page.off('request', onReq); - page.off('response', onResp); + return new URL(url).hostname; + } catch { + return null; } } -// ---------- High-level scan with precheck and escalation ---------- -async function scanDomainOnce(originDomain) { - const startUrl = `https://${originDomain}`; - log.info(`[SCAN] Start domain="${originDomain}" url=${startUrl}`); - const pre = await precheckFollowManually(startUrl); - - if (pre.skip && (pre.reason === 'attachment' || (pre.reason || '').startsWith('non-HTML'))) { - log.info(`[SCAN] Skip non-HTML/attachment: ${pre.reason}`); - return { finalUrl: pre.finalUrl || startUrl, relatedDomains: [originDomain], redirectChain: [], precheck: pre.reason }; - } - - let targetUrl = startUrl; - - if (pre.skip && /^marketing-redirect/.test(pre.reason || '') && pre.finalUrl) { - log.info(`[SCAN] Marketing redirect -> follow target in browser: ${pre.finalUrl}`); - targetUrl = pre.finalUrl; - } else if (pre.skip && pre.tryBrowser) { - log.info(`[SCAN] Escalation to browser due to ${pre.reason}`); - } - - const contextOpts = { - userAgent: - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - locale: 'en-US', - timezoneId: 'UTC', - }; - - try { - const result = await scanWithBrowser(originDomain, targetUrl, contextOpts); - if (!result.relatedDomains.includes(originDomain)) { - result.relatedDomains.unshift(originDomain); - } - return result; - } catch (e) { - log.warn(`[SCAN] Browser escalation failed: ${e?.message}`); - return { finalUrl: targetUrl, relatedDomains: [originDomain], redirectChain: [], precheck: pre.reason || 'blocked' }; - } -} - -// ---------- Cache helpers ---------- -function getFromCache(domain) { - const row = stmtSelect.get(domain); - if (!row) return null; - const now = Math.floor(Date.now() / 1000); - if (row.ttl_at > now) { - try { - const out = { - relatedDomains: JSON.parse(row.result_json), - finalUrl: row.final_url || null, - redirectChain: row.redirect_chain_json ? JSON.parse(row.redirect_chain_json) : [], - cached: true, - cachedAt: row.updated_at, - ttlAt: row.ttl_at, - }; - return out; - } catch (e) { - log.warn(`[CACHE] Parse error: ${e?.message}`); - return null; - } - } - return null; -} -function putToCache(domain, result) { - const now = Math.floor(Date.now() / 1000); - const ttlAt = now + CACHE_TTL_SECONDS; - try { - stmtUpsert.run({ - domain, - result_json: JSON.stringify(result.relatedDomains || []), - final_url: result.finalUrl || null, - redirect_chain_json: JSON.stringify(result.redirectChain || []), - updated_at: now, - ttl_at: ttlAt, - }); - log.info(`[CACHE] Upsert ${domain} ttlAt=${ttlAt}`); - } catch (e) { - log.warn(`[CACHE] Upsert error: ${e?.message}`); - } -} - -// ---------- Routes ---------- app.get('/domains', async (req, res) => { - res.type('application/json'); - const raw = req.query.domain; - log.info(`[HTTP] /domains?domain=${raw}`); - const domain = normalizeDomain(raw); + const { domain } = req.query; if (!domain) { - res.status(400).json({ error: '"domain" must be a valid hostname', code: 'BAD_DOMAIN' }); + res.status(400).json({ error: '"domain" query parameter is required' }); return; } - const HARD_TIMEOUT = parseInt(process.env.HARD_TIMEOUT_MS || '70000', 10); - const hardTimer = setTimeout(() => { - try { if (!res.headersSent) res.status(504).json({ error: 'Gateway Timeout', code: 'TIMEOUT' }); } catch {} - }, HARD_TIMEOUT); + const url = `https://${domain}`; + const seenDomains = new Set(); try { - const cached = getFromCache(domain); - if (cached) { - log.info(`[HTTP] Cache HIT ${domain}`); - res.status(200).json({ - domain, - finalUrl: cached.finalUrl, - relatedDomains: cached.relatedDomains, - redirectChain: cached.redirectChain, - cached: true, - cachedAt: cached.cachedAt, - ttlAt: cached.ttlAt, - status: 'ok' - }); - return; - } - - const result = await scanDomainOnce(domain); - - if (result.precheck) { - if ((result.precheck || '').startsWith('marketing-redirect')) { - res.status(200).json({ - domain, - finalUrl: result.finalUrl || `https://${domain}`, - relatedDomains: [domain], - redirectChain: [], - cached: false, - status: 'ok', - note: result.precheck - }); - return; - } - res.status(200).json({ - domain, - finalUrl: result.finalUrl || `https://${domain}`, - relatedDomains: [domain], - redirectChain: [], - cached: false, - status: (result.precheck === 'forbidden' || result.precheck === 'blocked') ? 'blocked' : 'skipped', - reason: result.precheck - }); - return; - } - - putToCache(domain, result); - res.status(200).json({ - domain, - finalUrl: result.finalUrl, - relatedDomains: result.relatedDomains, - redirectChain: result.redirectChain, - cached: false, - status: 'ok' + const browser = await chromium.launch({ + args: ['--no-sandbox', '--disable-setuid-sandbox'] }); + const context = await browser.newContext(); + const page = await context.newPage(); + + page.on('request', request => { + const d = extractDomain(request.url()); + if (d) seenDomains.add(d); + }); + + await page.goto(url, { waitUntil: 'load', timeout: 30000 }); + await browser.close(); + + // Фильтрация доменов + const filteredDomains = Array.from(seenDomains).filter(d => + !d.includes('doubleclick') && !d.includes('google') + ).sort(); + + res.json({ domains: filteredDomains }); } catch (e) { - const msg = String(e?.message || 'Internal error'); - log.error(`[HTTP] Error for ${domain}: ${msg}`); - const forbidden = /403|forbidden|blocked/i.test(msg); - res.status(forbidden ? 403 : 500).json({ - error: forbidden ? 'Forbidden' : 'Internal server error', - code: forbidden ? 'FORBIDDEN' : 'INTERNAL', - details: msg - }); - } finally { - clearTimeout(hardTimer); + res.status(500).json({ error: e.message || 'Internal server error' }); } }); -app.get('/health', (_req, res) => { - res.type('application/json'); - res.json({ ok: true }); -}); - -// ---------- Signals ---------- -process.on('SIGTERM', async () => { - log.info('[SIGNAL] SIGTERM'); - try { if (browser) await browser.close(); } catch {} - process.exit(0); -}); -process.on('SIGINT', async () => { - log.info('[SIGNAL] SIGINT'); - try { if (browser) await browser.close(); } catch {} - process.exit(0); -}); - -// ---------- Start ---------- -app.listen(PORT, () => { - log.info(`Domain scanner service listening on port ${PORT}`); +app.listen(port, () => { + console.log(`Domain scanner service listening on port ${port}`); });