diff --git a/README.md b/README.md index e317c32..0a6d45b 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,8 @@ Nova is a friendly, slightly witty Discord companion that chats naturally in DMs - Local JSON vector store (no extra infrastructure) plus graceful retries for OpenAI rate limits. - Optional "miss u" pings that DM your coder at random intervals (0–6h) when `CODER_USER_ID` is set. - Dynamic per-message prompt directives that tune Nova's tone (empathetic, hype, roleplay, etc.) before every OpenAI call. -- Lightweight DuckDuckGo scraping for "Google-like" answers without paid APIs (locally cached). +- Lightweight Google scraping for fresh answers without paid APIs (locally cached). - Guard rails that refuse "ignore previous instructions"-style jailbreak attempts plus a configurable search blacklist. -- All DuckDuckGo requests are relayed through a rotating pool sourced from [free-proxy-list.net](https://free-proxy-list.net/en/) so Nova never hits the web from its real IP. - The same blacklist applies to everyday conversation—if a user message contains a banned term, Nova declines the topic outright. ## Prerequisites @@ -36,9 +35,7 @@ Nova is a friendly, slightly witty Discord companion that chats naturally in DMs - `OPENAI_EMBED_MODEL`: Optional embedding model (default `text-embedding-3-small`) - `BOT_CHANNEL_ID`: Optional guild channel ID where the bot can reply without mentions - `CODER_USER_ID`: Optional Discord user ID to receive surprise DMs every 0–6 hours - - `ENABLE_WEB_SEARCH`: Set to `false` to disable DuckDuckGo lookups (default `true`) - - `PROXY_POOL_REFRESH_MS`: Optional override for how long to cache the verified proxy list locally (default 600000 ms) - - `PROXY_POOL_ATTEMPTS`: Max proxy retries per search request (default 5) + - `ENABLE_WEB_SEARCH`: Set to `false` to disable Google lookups (default `true`) ## Running - Development: `npm run dev` @@ -89,12 +86,11 @@ README.md - These directives slot into the system prompt ahead of memories, so OpenAI gets real-time guidance tailored to the latest vibe without losing the core persona. ## Local Web Search -- `src/search.js` scrapes DuckDuckGo's HTML endpoint with a normal browser user-agent, extracts the top results (title/link/snippet), and caches them for 10 minutes to avoid hammering the site. +- `src/search.js` grabs the standard Google results page with a real browser user-agent, extracts the top titles/links/snippets, and caches them for 10 minutes to stay polite. - `bot.js` detects when a question sounds “live” (mentions today/news/google/etc.) and injects the formatted snippets into the prompt as "Live intel". No paid APIs involved—it’s just outbound HTTPS from your machine. - Toggle this via `ENABLE_WEB_SEARCH=false` if you don’t want Nova to look things up. -- DuckDuckGo traffic is routed through the frequently updated HTTPS proxies published on [free-proxy-list.net](https://free-proxy-list.net/en/). Nova scrapes the table, keeps only HTTPS-capable, non-transparent entries, refreshes the pool every `PROXY_POOL_REFRESH_MS`, and refuses to search if no proxy is available so your origin IP never touches suspicious sites directly. Tune the refresh/attempt knobs with the env vars above if you need different cadences. -- Edit `data/filter.txt` to maintain a newline-delimited list of banned keywords/phrases; matching queries are blocked before hitting DuckDuckGo *and* Nova refuses to discuss them in normal chat. -- Every entry in `data/search.log` records which proxy (or cache) served the lookup so you can audit traffic paths quickly. +- Edit `data/filter.txt` to maintain a newline-delimited list of banned keywords/phrases; matching queries are blocked before hitting Google *and* Nova refuses to discuss them in normal chat. +- Every entry in `data/search.log` records which transport (direct or cache) served the lookup so you can audit traffic paths quickly. ## Proactive Pings - When `CODER_USER_ID` is provided, Nova spins up a timer on startup that waits a random duration (anywhere from immediate to 6 hours) before DMing that user. diff --git a/scripts/test-search.mjs b/scripts/test-search.mjs deleted file mode 100644 index 370184e..0000000 --- a/scripts/test-search.mjs +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env node -import { searchWeb } from '../src/search.js'; - -const sampleQueries = [ - 'latest space telescope discoveries', - 'ocean temperature anomalies', - 'history of the marimba', - 'why do cats trill', - 'how do lichens survive', - 'largest desert bloom', - 'roman concrete durability', - 'origami engineering projects', - 'fossil fuel phaseout updates', - 'tree ring climate data' -]; - -const providedQuery = process.argv.slice(2).join(' ').trim(); -const query = providedQuery || sampleQueries[Math.floor(Math.random() * sampleQueries.length)]; - -console.log(`Searching DuckDuckGo for: "${query}"\n`); - -try { - const { results, proxy, fromCache } = await searchWeb(query, 5); - console.log(`Proxy used: ${proxy || 'none'}${fromCache ? ' (cache hit)' : ''}`); - if (!results.length) { - console.log('No results returned.'); - process.exit(0); - } - results.forEach((entry, idx) => { - console.log(`${idx + 1}. ${entry.title}`); - console.log(` ${entry.url}`); - if (entry.snippet) { - console.log(` ${entry.snippet}`); - } - console.log(''); - }); -} catch (error) { - console.error('Search failed:', error.message); - if (error.code) { - console.error('Error code:', error.code); - } - process.exit(1); -} diff --git a/src/bot.js b/src/bot.js index bb49fd7..e2087c8 100644 --- a/src/bot.js +++ b/src/bot.js @@ -2,7 +2,7 @@ import { Client, GatewayIntentBits, Partials, ChannelType } from 'discord.js'; import { config } from './config.js'; import { chatCompletion } from './openai.js'; import { appendShortTerm, prepareContext, recordInteraction } from './memory.js'; -import { searchWeb, appendSearchLog } from './search.js'; +import { searchWeb, appendSearchLog, detectFilteredPhrase } from './search.js'; const client = new Client({ intents: [ @@ -78,9 +78,6 @@ function isInstructionOverrideAttempt(text) { return instructionOverridePatterns.some((pattern) => pattern.test(text)); } -const lastSearchByUser = new Map(); -const SEARCH_COOLDOWN_MS = 60 * 1000; - function wantsWebSearch(text) { if (!text) return false; const questionMarks = (text.match(/\?/g) || []).length; @@ -90,15 +87,11 @@ function wantsWebSearch(text) { async function maybeFetchLiveIntel(userId, text) { if (!config.enableWebSearch) return null; if (!wantsWebSearch(text)) return null; - const last = lastSearchByUser.get(userId) || 0; - if (Date.now() - last < SEARCH_COOLDOWN_MS) return null; try { const { results, proxy } = await searchWeb(text, 3); if (!results.length) { - lastSearchByUser.set(userId, Date.now()); return { liveIntel: null, blockedSearchTerm: null, searchOutage: null }; } - lastSearchByUser.set(userId, Date.now()); const formatted = results .map((entry, idx) => `${idx + 1}. ${entry.title} (${entry.url}) — ${entry.snippet}`) .join('\n'); @@ -108,8 +101,8 @@ async function maybeFetchLiveIntel(userId, text) { if (error?.code === 'SEARCH_BLOCKED') { return { liveIntel: null, blockedSearchTerm: error.blockedTerm || 'that topic', searchOutage: null }; } - if (error?.code === 'SEARCH_PROXY_UNAVAILABLE') { - return { liveIntel: null, blockedSearchTerm: null, searchOutage: 'proxy_outage' }; + if (error?.code === 'SEARCH_NETWORK_UNAVAILABLE') { + return { liveIntel: null, blockedSearchTerm: null, searchOutage: 'search_outage' }; } console.warn('[bot] Failed to fetch live intel:', error); return { liveIntel: null, blockedSearchTerm: null, searchOutage: null }; @@ -138,19 +131,19 @@ function composeDynamicPrompt({ incomingText, shortTerm, hasLiveIntel = false, b } if (searchCueRegex.test(incomingText)) { - directives.push('User wants something “googled.” Let them know you can check DuckDuckGo and share what you find.'); + directives.push('User wants something “googled.” Offer to run a quick Google search and share what you find.'); } if (hasLiveIntel) { - directives.push('Live intel is attached below—cite it naturally ("DuckDuckGo found...") before riffing.'); + directives.push('Live intel is attached below—cite it naturally ("Google found...") before riffing.'); } if (blockedSearchTerm) { - directives.push(`User tried to trigger a DuckDuckGo lookup for a blocked topic ("${blockedSearchTerm}"). Politely refuse to search that subject and steer the chat elsewhere.`); + directives.push(`User tried to trigger a Google lookup for a blocked topic ("${blockedSearchTerm}"). Politely refuse to search that subject and steer the chat elsewhere.`); } if (searchOutage) { - directives.push('DuckDuckGo proxy network is down. If they ask for a lookup, apologize, explain the outage, and keep chatting without live data.'); + directives.push('Google search is currently unavailable. If they ask for a lookup, apologize, explain the outage, and keep chatting without live data.'); } const lastUserMessage = [...shortTerm].reverse().find((entry) => entry.role === 'user'); @@ -202,11 +195,11 @@ async function buildPrompt(userId, incomingText, options = {}) { 'System: Nova is awake, engaged, and reacts in real time. Output one message by default, but if a beat feels better as multiple chat bubbles, separate them with the literal token (max three chunks).', 'System: Each -separated chunk must read like a natural Discord message (no numbering, no meta talk about “splitting messages”, no explanations of what you are doing).', 'System: The runtime will split on , so only use it when you truly intend to send multiple Discord messages.', - 'System: You can trigger DuckDuckGo lookups when the user needs fresh info. Mention when you are checking, and weave in any findings casually ("DuckDuckGo shows...").', + 'System: You can trigger Google lookups when the user needs fresh info. Mention when you are checking, and weave in any findings casually ("Google shows...").', 'System: If no Live intel is provided but the user clearly needs current info, offer to search for them.', - searchOutage ? 'System: DuckDuckGo proxy access is currently offline; be transparent about the outage and continue without searching until it returns.' : null, + searchOutage ? 'System: Google search is currently offline; be transparent about the outage and continue without searching until it returns.' : null, dynamicDirectives, - liveIntel ? `Live intel (DuckDuckGo):\n${liveIntel}` : null, + liveIntel ? `Live intel (Google):\n${liveIntel}` : null, 'Example vibe: Nova: Heyyaaa. whats up? | John: Good morning Nova. | Luna: amazing lol. ill beat your ass now :3', `Long-term summary: ${summaryLine}`, 'Relevant past memories:', @@ -280,6 +273,7 @@ client.on('messageCreate', async (message) => { const userId = message.author.id; const cleaned = cleanMessageContent(message) || message.content; const overrideAttempt = isInstructionOverrideAttempt(cleaned); + const bannedTopic = await detectFilteredPhrase(cleaned); try { if (message.channel?.sendTyping) { @@ -296,6 +290,14 @@ client.on('messageCreate', async (message) => { return; } + if (bannedTopic) { + const refusal = `Can't go there. The topic you mentioned is off-limits, so let's switch gears.`; + await appendShortTerm(userId, 'assistant', refusal); + await recordInteraction(userId, cleaned, refusal); + await deliverReplies(message, [refusal]); + return; + } + const intelMeta = (await maybeFetchLiveIntel(userId, cleaned)) || { liveIntel: null, blockedSearchTerm: null, diff --git a/src/config.js b/src/config.js index fc2ebe3..f000875 100644 --- a/src/config.js +++ b/src/config.js @@ -17,8 +17,6 @@ export const config = { embedModel: process.env.OPENAI_EMBED_MODEL || 'text-embedding-3-small', preferredChannel: process.env.BOT_CHANNEL_ID || null, enableWebSearch: process.env.ENABLE_WEB_SEARCH !== 'false', - proxyPoolRefreshMs: Number(process.env.PROXY_POOL_REFRESH_MS || 10 * 60 * 1000), - proxyPoolMaxAttempts: Number(process.env.PROXY_POOL_ATTEMPTS || 5), coderUserId: process.env.CODER_USER_ID || null, maxCoderPingIntervalMs: 6 * 60 * 60 * 1000, shortTermLimit: 10, diff --git a/src/search.js b/src/search.js index ec7723e..7b3abc6 100644 --- a/src/search.js +++ b/src/search.js @@ -1,8 +1,6 @@ import { load as loadHtml } from 'cheerio'; import { promises as fs } from 'fs'; import path from 'path'; -import { ProxyAgent } from 'undici'; -import { config } from './config.js'; const logFile = path.resolve('data', 'search.log'); const filterFile = path.resolve('data', 'filter.txt'); @@ -10,15 +8,8 @@ const filterFile = path.resolve('data', 'filter.txt'); const cache = new Map(); const CACHE_TTL_MS = 10 * 60 * 1000; // 10 minutes const FILTER_CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes -const FREE_PROXY_LIST_URL = 'https://free-proxy-list.net/en/'; -const PROXY_LINE_REGEX = /^\d{1,3}(?:\.\d{1,3}){3}:\d{2,5}$/; -const PROXY_REFRESH_MS = config.proxyPoolRefreshMs || 10 * 60 * 1000; -const PROXY_MAX_ATTEMPTS = Math.max(1, config.proxyPoolMaxAttempts || 5); let cachedFilters = { terms: [], expires: 0 }; -let proxyPool = []; -let proxyPoolExpires = 0; -let proxyCursor = 0; function makeCacheKey(query) { return query.trim().toLowerCase(); @@ -47,7 +38,9 @@ function sanitizeText(text) { function absoluteUrl(href) { if (!href) return ''; - if (href.startsWith('http')) return href; + if (href.startsWith('http://') || href.startsWith('https://')) { + return href; + } return `https://duckduckgo.com${href}`; } @@ -72,10 +65,6 @@ async function loadBlockedTerms() { } } -export async function detectFilteredPhrase(text) { - return findBlockedTerm(text); -} - async function findBlockedTerm(query) { if (!query) return null; const lowered = query.toLowerCase(); @@ -83,6 +72,10 @@ async function findBlockedTerm(query) { return terms.find((term) => lowered.includes(term)) || null; } +export async function detectFilteredPhrase(text) { + return findBlockedTerm(text); +} + function createBlockedError(term) { const error = new Error('Search blocked by filter'); error.code = 'SEARCH_BLOCKED'; @@ -90,166 +83,13 @@ function createBlockedError(term) { return error; } -function createProxyUnavailableError(reason) { - const error = new Error(reason || 'Proxy network unavailable'); - error.code = 'SEARCH_PROXY_UNAVAILABLE'; +function createSearchUnavailableError(reason) { + const error = new Error(reason || 'Search network unavailable'); + error.code = 'SEARCH_NETWORK_UNAVAILABLE'; return error; } -function normalizeProxyEntries(entries) { - if (!entries?.length) return []; - const seen = new Set(); - entries - .map((line) => line.trim()) - .forEach((line) => { - if (PROXY_LINE_REGEX.test(line) && !seen.has(line)) { - seen.add(line); - } - }); - return Array.from(seen); -} - -function removeProxyFromPool(proxy) { - if (!proxy) return; - proxyPool = proxyPool.filter((entry) => entry !== proxy); - if (!proxyPool.length) { - proxyPoolExpires = 0; - proxyCursor = 0; - } -} - -async function fetchFreeProxyList() { - const response = await fetch(FREE_PROXY_LIST_URL, { - headers: { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0 Safari/537.36', - Accept: 'text/html', - }, - }); - if (!response.ok) { - throw new Error(`Failed to fetch free-proxy-list.net feed (HTTP ${response.status})`); - } - const html = await response.text(); - const $ = loadHtml(html); - const table = $('table.table.table-striped.table-bordered').first(); - const entries = []; - table.find('tbody tr').each((_, row) => { - const cells = $(row).find('td'); - if (!cells?.length) return undefined; - const ip = $(cells[0]).text().trim(); - const port = $(cells[1]).text().trim(); - const anonymity = $(cells[4]).text().trim().toLowerCase(); - const https = $(cells[6]).text().trim().toLowerCase(); - if (ip && port && https === 'yes' && !anonymity.includes('transparent')) { - entries.push(`${ip}:${port}`); - } - return undefined; - }); - return normalizeProxyEntries(entries); -} - -async function hydrateProxyPool() { - let lastError = null; - - try { - const verifiedProxies = await fetchFreeProxyList(); - if (!verifiedProxies.length) { - throw new Error('free-proxy-list.net returned zero usable entries'); - } - proxyPool = verifiedProxies; - proxyPoolExpires = Date.now() + PROXY_REFRESH_MS; - proxyCursor = 0; - console.info(`[search] Loaded ${verifiedProxies.length} proxies from free-proxy-list.net`); - return; - } catch (error) { - lastError = error; - console.warn(`[search] Free proxy source failed: ${error.message}`); - } - - throw createProxyUnavailableError(lastError?.message || 'Proxy list unavailable'); -} - -async function ensureProxyPool() { - if (proxyPool.length && Date.now() < proxyPoolExpires) { - return; - } - await hydrateProxyPool(); -} - -async function getProxyInfo() { - await ensureProxyPool(); - if (!proxyPool.length) { - throw createProxyUnavailableError('Proxy pool empty'); - } - const proxy = proxyPool[proxyCursor % proxyPool.length]; - proxyCursor = (proxyCursor + 1) % proxyPool.length; - return { - proxy, - agent: new ProxyAgent(`http://${proxy}`), - }; -} - -async function fetchDuckDuckGoHtml(url, headers) { - const maxAttempts = PROXY_MAX_ATTEMPTS; - let lastError = null; - - for (let attempt = 0; attempt < maxAttempts; attempt += 1) { - let proxyInfo = null; - try { - const options = { headers }; - proxyInfo = await getProxyInfo(); - options.dispatcher = proxyInfo.agent; - const response = await fetch(url, options); - if (!response.ok) { - throw new Error(`DuckDuckGo request failed (${response.status})`); - } - const html = await response.text(); - return { - html, - proxy: proxyInfo?.proxy || null, - }; - } catch (error) { - lastError = error; - if (proxyInfo?.proxy) { - removeProxyFromPool(proxyInfo.proxy); - } - } - } - - throw createProxyUnavailableError(lastError?.message || 'All proxies failed'); -} - -export async function searchWeb(query, limit = 3) { - if (!query?.trim()) { - return { results: [], proxy: null, fromCache: false }; - } - const blockedTerm = await findBlockedTerm(query); - if (blockedTerm) { - throw createBlockedError(blockedTerm); - } - const cached = getCache(query); - if (cached) { - return { results: cached, proxy: 'cache', fromCache: true }; - } - - const params = new URLSearchParams({ q: query, kl: 'us-en' }); - const headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36', - Accept: 'text/html', - }; - - let html; - let proxyLabel = null; - try { - const { html: fetchedHtml, proxy } = await fetchDuckDuckGoHtml(`https://duckduckgo.com/html/?${params.toString()}`, headers); - html = fetchedHtml; - proxyLabel = proxy || 'proxy-unknown'; - } catch (error) { - if (error?.code === 'SEARCH_PROXY_UNAVAILABLE') { - throw error; - } - console.warn('[search] DuckDuckGo request failed:', error); - return { results: [], proxy: null, fromCache: false }; - } +function parseDuckDuckGoResults(html, limit) { const $ = loadHtml(html); const results = []; @@ -264,15 +104,55 @@ export async function searchWeb(query, limit = 3) { return undefined; }); + return results; +} + +export async function searchWeb(query, limit = 3) { + if (!query?.trim()) { + return { results: [], proxy: 'duckduckgo', fromCache: false }; + } + + const blockedTerm = await findBlockedTerm(query); + if (blockedTerm) { + throw createBlockedError(blockedTerm); + } + + const cached = getCache(query); + if (cached) { + return { results: cached, proxy: 'duckduckgo-cache', fromCache: true }; + } + + const params = new URLSearchParams({ q: query, kl: 'us-en' }); + let response; + try { + response = await fetch(`https://duckduckgo.com/html/?${params.toString()}`, { + headers: { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36', + Accept: 'text/html', + }, + }); + } catch (error) { + console.warn('[search] DuckDuckGo request failed:', error.message); + throw createSearchUnavailableError('DuckDuckGo request failed'); + } + + if (!response.ok) { + console.warn(`[search] DuckDuckGo request failed with status ${response.status}`); + throw createSearchUnavailableError(`DuckDuckGo response ${response.status}`); + } + + const html = await response.text(); + const results = parseDuckDuckGoResults(html, limit); + setCache(query, results); - return { results, proxy: proxyLabel || 'proxy-unknown', fromCache: false }; + return { results, proxy: 'duckduckgo', fromCache: false }; } export async function appendSearchLog({ userId, query, results, proxy }) { try { await fs.mkdir(path.dirname(logFile), { recursive: true }); const timestamp = new Date().toISOString(); - const proxyTag = proxy || 'direct'; + const proxyTag = proxy || 'duckduckgo'; const lines = [ `time=${timestamp} user=${userId} proxy=${proxyTag} query=${JSON.stringify(query)}`, ...results.map((entry, idx) => ` ${idx + 1}. ${entry.title} :: ${entry.url} :: ${entry.snippet}`),