From 27f6a953ce4a5866ff80667b4fabb3041f42b28f Mon Sep 17 00:00:00 2001
From: Luna <luna@example.com>
Date: Fri, 13 Feb 2026 23:14:48 +0100
Subject: [PATCH] Switch proxy pool source to free-proxy-list.net

---
 README.md               |  24 +++------
 scripts/test-search.mjs |  43 +++++++++++++++
 src/config.js           |   8 +--
 src/search.js           | 114 ++++++++++++++++++++++++----------------
 4 files changed, 119 insertions(+), 70 deletions(-)
 create mode 100644 scripts/test-search.mjs

diff --git a/README.md b/README.md
index c910910..e317c32 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,8 @@ Nova is a friendly, slightly witty Discord companion that chats naturally in DMs
 - Dynamic per-message prompt directives that tune Nova's tone (empathetic, hype, roleplay, etc.) before every OpenAI call.
 - Lightweight DuckDuckGo scraping for "Google-like" answers without paid APIs (locally cached).
 - Guard rails that refuse "ignore previous instructions"-style jailbreak attempts plus a configurable search blacklist.
-- All DuckDuckGo requests are relayed through rotating ProxyScrape HTTP proxies so Nova never hits the web from its real IP.
+- All DuckDuckGo requests are relayed through a rotating pool sourced from [free-proxy-list.net](https://free-proxy-list.net/en/) so Nova never hits the web from its real IP.
+- The same blacklist applies to everyday conversation—if a user message contains a banned term, Nova declines the topic outright.
 
 ## Prerequisites
 - Node.js 18+
@@ -36,10 +37,8 @@ Nova is a friendly, slightly witty Discord companion that chats naturally in DMs
    - `BOT_CHANNEL_ID`: Optional guild channel ID where the bot can reply without mentions
    - `CODER_USER_ID`: Optional Discord user ID to receive surprise DMs every 0–6 hours
    - `ENABLE_WEB_SEARCH`: Set to `false` to disable DuckDuckGo lookups (default `true`)
-   - `ENABLE_PROXY_SCRAPE`: Set to `false` only if you want to bypass ProxyScrape and hit DuckDuckGo directly (default `true`)
-   - `PROXYSCRAPE_ENDPOINT`: Optional override for the proxy list endpoint (defaults to elite HTTPS-capable HTTP proxies)
-   - `PROXYSCRAPE_REFRESH_MS`: How long to cache the proxy list locally (default 600000 ms)
-   - `PROXYSCRAPE_ATTEMPTS`: Max proxy retries per search request (default 5)
+   - `PROXY_POOL_REFRESH_MS`: Optional override for how long to cache the scraped and filtered proxy list locally (default 600000 ms, i.e. 10 minutes)
+   - `PROXY_POOL_ATTEMPTS`: Max proxy attempts per search request (default 5)
 
 ## Running
 - Development: `npm run dev`
@@ -93,8 +92,8 @@ README.md
 - `src/search.js` scrapes DuckDuckGo's HTML endpoint with a normal browser user-agent, extracts the top results (title/link/snippet), and caches them for 10 minutes to avoid hammering the site.
 - `bot.js` detects when a question sounds “live” (mentions today/news/google/etc.) and injects the formatted snippets into the prompt as "Live intel". No paid APIs involved—it’s just outbound HTTPS from your machine.
 - Toggle this via `ENABLE_WEB_SEARCH=false` if you don’t want Nova to look things up.
-- DuckDuckGo traffic is routed through the free ProxyScrape list (HTTP proxies with HTTPS support). The bot downloads a fresh pool every `PROXYSCRAPE_REFRESH_MS`, rotates through them, and refuses to search if no proxy is available so your origin IP never touches suspicious sites directly. Tune the endpoint/refresh/attempt knobs with the env vars above if you need different regions or paid pools.
-- Edit `data/filter.txt` to maintain a newline-delimited list of banned search keywords/phrases; matching queries are blocked before hitting DuckDuckGo and Nova is instructed to refuse them.
+- DuckDuckGo traffic is routed through the frequently updated HTTPS proxies published on [free-proxy-list.net](https://free-proxy-list.net/en/). Nova scrapes the table, keeps only HTTPS-capable, non-transparent entries, and refreshes the pool every `PROXY_POOL_REFRESH_MS`. If no proxy is available, Nova refuses to search, so your origin IP never touches suspicious sites directly. Tune the refresh interval and the attempt limit with the env vars above if the defaults don't suit you.
+- Edit `data/filter.txt` to maintain a newline-delimited list of banned keywords/phrases; matching queries are blocked before hitting DuckDuckGo *and* Nova refuses to discuss them in normal chat.
 - Every entry in `data/search.log` records which proxy (or cache) served the lookup so you can audit traffic paths quickly.
 
 ## Proactive Pings
@@ -102,17 +101,6 @@ README.md
 - Each ping goes through OpenAI with the prompt "you havent messaged your coder in a while, and you wanna chat with him!" so responses stay playful and unscripted.
 - The ping gets typed out (`sendTyping`) for realism and is stored back into the memory layers so the next incoming reply has context.
 
-## Update Log
-- **2026-02-13 — Dynamic personality + multi-message riffs:** Added the instinctive persona prompt with tone mirroring, `<SPLIT>`-based multi-bubble replies, and proactive coder pings so Nova feels alive in DMs.
-- **2026-02-13 — Memory intelligence:** Implemented embeddings-backed long-term memory, short-term buffers, transcript summarization, and heuristic importance pruning stored in `data/memory.json`.
-- **2026-02-13 — Live intel & directives:** Introduced DuckDuckGo scraping, per-turn dynamic prompt directives (tone, roleplay, instruction compliance), and env toggles (`ENABLE_WEB_SEARCH`, `CODER_USER_ID`).
-- **2026-02-13 — UX polish:** Added typing indicators, persona-aware fallback replies, mention cleaning, and README/docs covering setup, memory internals, web search, and deployment tips.
-- **2026-02-13 — Conversational control:** Tuned system prompt to avoid forced follow-up questions, raised temperature for looser banter, and reinforced Nova's awareness of DuckDuckGo lookups plus `<SPLIT>` usage.
-- **2026-02-13 — Statement-first vibes:** Reworked persona to favor bold statements over reflexive questions and dialed back temperature so Nova keeps the vibe without interrogating users.
-- **2026-02-13 — Search logging:** Every DuckDuckGo lookup now appends a line to `data/search.log` with timestamp, query, and the snippets shared with Nova.
-- **2026-02-13 — Safeguards:** Added prompt bypass detection and a file-based DuckDuckGo filter (`data/filter.txt`) to keep Nova from honoring jailbreak requests or searching off-limits topics.
-- **2026-02-13 — Proxy-based search:** DuckDuckGo scraping now tunnels through ProxyScrape relays with automatic rotation/retries and clear prompts when the proxy pool is down, plus new env toggles for tuning the proxy source.
-
 ## Notes
 - The bot retries OpenAI requests up to 3 times with incremental backoff when rate limited.
 - `data/memory.json` is ignored by git but will grow with usage; back it up if you want persistent personality.
diff --git a/scripts/test-search.mjs b/scripts/test-search.mjs
new file mode 100644
index 0000000..370184e
--- /dev/null
+++ b/scripts/test-search.mjs
@@ -0,0 +1,43 @@
+#!/usr/bin/env node
+import { searchWeb } from '../src/search.js';
+
+const sampleQueries = [
+  'latest space telescope discoveries',
+  'ocean temperature anomalies',
+  'history of the marimba',
+  'why do cats trill',
+  'how do lichens survive',
+  'largest desert bloom',
+  'roman concrete durability',
+  'origami engineering projects',
+  'fossil fuel phaseout updates',
+  'tree ring climate data'
+];
+
+const providedQuery = process.argv.slice(2).join(' ').trim();
+const query = providedQuery || sampleQueries[Math.floor(Math.random() * sampleQueries.length)];
+
+console.log(`Searching DuckDuckGo for: "${query}"\n`);
+
+try {
+  const { results, proxy, fromCache } = await searchWeb(query, 5);
+  console.log(`Proxy used: ${proxy || 'none'}${fromCache ? ' (cache hit)' : ''}`);
+  if (!results.length) {
+    console.log('No results returned.');
+    process.exit(0);
+  }
+  results.forEach((entry, idx) => {
+    console.log(`${idx + 1}. ${entry.title}`);
+    console.log(`   ${entry.url}`);
+    if (entry.snippet) {
+      console.log(`   ${entry.snippet}`);
+    }
+    console.log('');
+  });
+} catch (error) {
+  console.error('Search failed:', error.message);
+  if (error.code) {
+    console.error('Error code:', error.code);
+  }
+  process.exit(1);
+}
diff --git a/src/config.js b/src/config.js
index be38879..fc2ebe3 100644
--- a/src/config.js
+++ b/src/config.js
@@ -17,12 +17,8 @@ export const config = {
   embedModel: process.env.OPENAI_EMBED_MODEL || 'text-embedding-3-small',
   preferredChannel: process.env.BOT_CHANNEL_ID || null,
   enableWebSearch: process.env.ENABLE_WEB_SEARCH !== 'false',
-  proxyScrapeEnabled: process.env.ENABLE_PROXY_SCRAPE !== 'false',
-  proxyScrapeEndpoint:
-    process.env.PROXYSCRAPE_ENDPOINT
-    || 'https://api.proxyscrape.com/v4/free-proxy-list/get?request=getproxies&protocol=http&timeout=8000&country=all&ssl=yes&anonymity=elite&limit=200',
-  proxyScrapeRefreshMs: Number(process.env.PROXYSCRAPE_REFRESH_MS || 10 * 60 * 1000),
-  proxyScrapeMaxAttempts: Number(process.env.PROXYSCRAPE_ATTEMPTS || 5),
+  proxyPoolRefreshMs: Number(process.env.PROXY_POOL_REFRESH_MS || 10 * 60 * 1000),
+  proxyPoolMaxAttempts: Number(process.env.PROXY_POOL_ATTEMPTS || 5),
   coderUserId: process.env.CODER_USER_ID || null,
   maxCoderPingIntervalMs: 6 * 60 * 60 * 1000,
   shortTermLimit: 10,
diff --git a/src/search.js b/src/search.js
index c34e52a..ec7723e 100644
--- a/src/search.js
+++ b/src/search.js
@@ -10,6 +10,10 @@ const filterFile = path.resolve('data', 'filter.txt');
 const cache = new Map();
 const CACHE_TTL_MS = 10 * 60 * 1000; // 10 minutes
 const FILTER_CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
+const FREE_PROXY_LIST_URL = 'https://free-proxy-list.net/en/';
+const PROXY_LINE_REGEX = /^\d{1,3}(?:\.\d{1,3}){3}:\d{2,5}$/;
+const PROXY_REFRESH_MS = config.proxyPoolRefreshMs || 10 * 60 * 1000;
+const PROXY_MAX_ATTEMPTS = Math.max(1, config.proxyPoolMaxAttempts || 5);
 
 let cachedFilters = { terms: [], expires: 0 };
 let proxyPool = [];
@@ -68,6 +72,10 @@ async function loadBlockedTerms() {
   }
 }
 
+export async function detectFilteredPhrase(text) {
+  return findBlockedTerm(text);
+}
+
 async function findBlockedTerm(query) {
   if (!query) return null;
   const lowered = query.toLowerCase();
@@ -88,12 +96,17 @@ function createProxyUnavailableError(reason) {
   return error;
 }
 
-function parseProxyList(raw) {
-  if (!raw) return [];
-  return raw
-    .split(/\r?\n/)
+function normalizeProxyEntries(entries) {
+  if (!entries?.length) return [];
+  const seen = new Set();
+  entries
     .map((line) => line.trim())
-    .filter((line) => line && !line.startsWith('#'));
+    .forEach((line) => {
+      if (PROXY_LINE_REGEX.test(line) && !seen.has(line)) {
+        seen.add(line);
+      }
+    });
+  return Array.from(seen);
 }
 
 function removeProxyFromPool(proxy) {
@@ -105,35 +118,57 @@ function removeProxyFromPool(proxy) {
   }
 }
 
-async function hydrateProxyPool() {
-  if (!config.proxyScrapeEnabled) {
-    proxyPool = [];
-    proxyPoolExpires = 0;
-    proxyCursor = 0;
-    return;
-  }
-  const endpoint = config.proxyScrapeEndpoint;
-  const response = await fetch(endpoint, {
+async function fetchFreeProxyList() {
+  const response = await fetch(FREE_PROXY_LIST_URL, {
     headers: {
-      Accept: 'text/plain',
-      'User-Agent': 'NovaBot/1.0 (+https://github.com/) ProxyScrape client',
+      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0 Safari/537.36',
+      Accept: 'text/html',
     },
   });
   if (!response.ok) {
-    throw createProxyUnavailableError(`Failed to fetch proxy list (HTTP ${response.status})`);
+    throw new Error(`Failed to fetch free-proxy-list.net feed (HTTP ${response.status})`);
   }
-  const text = await response.text();
-  const proxies = parseProxyList(text);
-  if (!proxies.length) {
-    throw createProxyUnavailableError('Proxy list came back empty');
+  const html = await response.text();
+  const $ = loadHtml(html);
+  const table = $('table.table.table-striped.table-bordered').first();
+  const entries = [];
+  table.find('tbody tr').each((_, row) => {
+    const cells = $(row).find('td');
+    if (!cells?.length) return undefined;
+    const ip = $(cells[0]).text().trim();
+    const port = $(cells[1]).text().trim();
+    const anonymity = $(cells[4]).text().trim().toLowerCase();
+    const https = $(cells[6]).text().trim().toLowerCase();
+    if (ip && port && https === 'yes' && !anonymity.includes('transparent')) {
+      entries.push(`${ip}:${port}`);
+    }
+    return undefined;
+  });
+  return normalizeProxyEntries(entries);
+}
+
+async function hydrateProxyPool() {
+  let lastError = null;
+
+  try {
+    const verifiedProxies = await fetchFreeProxyList();
+    if (!verifiedProxies.length) {
+      throw new Error('free-proxy-list.net returned zero usable entries');
+    }
+    proxyPool = verifiedProxies;
+    proxyPoolExpires = Date.now() + PROXY_REFRESH_MS;
+    proxyCursor = 0;
+    console.info(`[search] Loaded ${verifiedProxies.length} proxies from free-proxy-list.net`);
+    return;
+  } catch (error) {
+    lastError = error;
+    console.warn(`[search] Free proxy source failed: ${error.message}`);
   }
-  proxyPool = proxies;
-  proxyPoolExpires = Date.now() + (config.proxyScrapeRefreshMs || 10 * 60 * 1000);
-  proxyCursor = 0;
+
+  throw createProxyUnavailableError(lastError?.message || 'Proxy list unavailable');
 }
 
 async function ensureProxyPool() {
-  if (!config.proxyScrapeEnabled) return;
   if (proxyPool.length && Date.now() < proxyPoolExpires) {
     return;
   }
@@ -142,8 +177,8 @@ async function ensureProxyPool() {
 
 async function getProxyInfo() {
   await ensureProxyPool();
-  if (!config.proxyScrapeEnabled || !proxyPool.length) {
-    return null;
+  if (!proxyPool.length) {
+    throw createProxyUnavailableError('Proxy pool empty');
   }
   const proxy = proxyPool[proxyCursor % proxyPool.length];
   proxyCursor = (proxyCursor + 1) % proxyPool.length;
@@ -154,22 +189,15 @@ async function getProxyInfo() {
 }
 
 async function fetchDuckDuckGoHtml(url, headers) {
-  const maxAttempts = config.proxyScrapeEnabled
-    ? Math.max(1, config.proxyScrapeMaxAttempts || 5)
-    : 1;
+  const maxAttempts = PROXY_MAX_ATTEMPTS;
   let lastError = null;
 
   for (let attempt = 0; attempt < maxAttempts; attempt += 1) {
     let proxyInfo = null;
     try {
       const options = { headers };
-      if (config.proxyScrapeEnabled) {
-        proxyInfo = await getProxyInfo();
-        if (!proxyInfo) {
-          throw createProxyUnavailableError('No proxies available');
-        }
-        options.dispatcher = proxyInfo.agent;
-      }
+      proxyInfo = await getProxyInfo();
+      options.dispatcher = proxyInfo.agent;
       const response = await fetch(url, options);
       if (!response.ok) {
         throw new Error(`DuckDuckGo request failed (${response.status})`);
@@ -181,19 +209,13 @@ async function fetchDuckDuckGoHtml(url, headers) {
       };
     } catch (error) {
       lastError = error;
-      if (!config.proxyScrapeEnabled) {
-        break;
-      }
       if (proxyInfo?.proxy) {
         removeProxyFromPool(proxyInfo.proxy);
       }
     }
   }
 
-  if (config.proxyScrapeEnabled) {
-    throw createProxyUnavailableError(lastError?.message || 'All proxies failed');
-  }
-  throw lastError || new Error('DuckDuckGo fetch failed');
+  throw createProxyUnavailableError(lastError?.message || 'All proxies failed');
 }
 
 export async function searchWeb(query, limit = 3) {
@@ -220,7 +242,7 @@ export async function searchWeb(query, limit = 3) {
   try {
     const { html: fetchedHtml, proxy } = await fetchDuckDuckGoHtml(`https://duckduckgo.com/html/?${params.toString()}`, headers);
     html = fetchedHtml;
-    proxyLabel = config.proxyScrapeEnabled ? proxy || 'proxy-unknown' : 'direct';
+    proxyLabel = proxy || 'proxy-unknown';
   } catch (error) {
     if (error?.code === 'SEARCH_PROXY_UNAVAILABLE') {
       throw error;
@@ -243,7 +265,7 @@ export async function searchWeb(query, limit = 3) {
   });
 
   setCache(query, results);
-  return { results, proxy: proxyLabel || (config.proxyScrapeEnabled ? 'proxy-unknown' : 'direct'), fromCache: false };
+  return { results, proxy: proxyLabel || 'proxy-unknown', fromCache: false };
 }
 
 export async function appendSearchLog({ userId, query, results, proxy }) {