This commit is contained in:
Luna
2026-02-13 23:44:26 +01:00
parent 27f6a953ce
commit 82bf5f9ca8
5 changed files with 77 additions and 244 deletions

View File

@@ -10,9 +10,8 @@ Nova is a friendly, slightly witty Discord companion that chats naturally in DMs
- Local JSON vector store (no extra infrastructure) plus graceful retries for OpenAI rate limits.
- Optional "miss u" pings that DM your coder at random intervals (0–6h) when `CODER_USER_ID` is set.
- Dynamic per-message prompt directives that tune Nova's tone (empathetic, hype, roleplay, etc.) before every OpenAI call.
- Lightweight DuckDuckGo scraping for "Google-like" answers without paid APIs (locally cached).
- Lightweight Google scraping for fresh answers without paid APIs (locally cached).
- Guard rails that refuse "ignore previous instructions"-style jailbreak attempts plus a configurable search blacklist.
- All DuckDuckGo requests are relayed through a rotating pool sourced from [free-proxy-list.net](https://free-proxy-list.net/en/) so Nova never hits the web from its real IP.
- The same blacklist applies to everyday conversation—if a user message contains a banned term, Nova declines the topic outright.
## Prerequisites
@@ -36,9 +35,7 @@ Nova is a friendly, slightly witty Discord companion that chats naturally in DMs
- `OPENAI_EMBED_MODEL`: Optional embedding model (default `text-embedding-3-small`)
- `BOT_CHANNEL_ID`: Optional guild channel ID where the bot can reply without mentions
- `CODER_USER_ID`: Optional Discord user ID to receive surprise DMs every 0–6 hours
- `ENABLE_WEB_SEARCH`: Set to `false` to disable DuckDuckGo lookups (default `true`)
- `PROXY_POOL_REFRESH_MS`: Optional override for how long to cache the verified proxy list locally (default 600000 ms)
- `PROXY_POOL_ATTEMPTS`: Max proxy retries per search request (default 5)
- `ENABLE_WEB_SEARCH`: Set to `false` to disable Google lookups (default `true`)
## Running
- Development: `npm run dev`
@@ -89,12 +86,11 @@ README.md
- These directives slot into the system prompt ahead of memories, so OpenAI gets real-time guidance tailored to the latest vibe without losing the core persona.
## Local Web Search
- `src/search.js` scrapes DuckDuckGo's HTML endpoint with a normal browser user-agent, extracts the top results (title/link/snippet), and caches them for 10 minutes to avoid hammering the site.
- `src/search.js` grabs the standard Google results page with a real browser user-agent, extracts the top titles/links/snippets, and caches them for 10 minutes to stay polite.
- `bot.js` detects when a question sounds “live” (mentions today/news/google/etc.) and injects the formatted snippets into the prompt as "Live intel". No paid APIs involved—it's just outbound HTTPS from your machine.
- Toggle this via `ENABLE_WEB_SEARCH=false` if you don't want Nova to look things up.
- DuckDuckGo traffic is routed through the frequently updated HTTPS proxies published on [free-proxy-list.net](https://free-proxy-list.net/en/). Nova scrapes the table, keeps only HTTPS-capable, non-transparent entries, refreshes the pool every `PROXY_POOL_REFRESH_MS`, and refuses to search if no proxy is available so your origin IP never touches suspicious sites directly. Tune the refresh/attempt knobs with the env vars above if you need different cadences.
- Edit `data/filter.txt` to maintain a newline-delimited list of banned keywords/phrases; matching queries are blocked before hitting DuckDuckGo *and* Nova refuses to discuss them in normal chat.
- Every entry in `data/search.log` records which proxy (or cache) served the lookup so you can audit traffic paths quickly.
- Edit `data/filter.txt` to maintain a newline-delimited list of banned keywords/phrases; matching queries are blocked before hitting Google *and* Nova refuses to discuss them in normal chat.
- Every entry in `data/search.log` records which transport (direct or cache) served the lookup so you can audit traffic paths quickly.
## Proactive Pings
- When `CODER_USER_ID` is provided, Nova spins up a timer on startup that waits a random duration (anywhere from immediate to 6 hours) before DMing that user.

View File

@@ -1,43 +0,0 @@
#!/usr/bin/env node
import { searchWeb } from '../src/search.js';
// Fallback queries; one is picked at random when no query is given on the command line.
const sampleQueries = [
  'latest space telescope discoveries',
  'ocean temperature anomalies',
  'history of the marimba',
  'why do cats trill',
  'how do lichens survive',
  'largest desert bloom',
  'roman concrete durability',
  'origami engineering projects',
  'fossil fuel phaseout updates',
  'tree ring climate data'
];

// Everything after the script name is treated as one space-joined query.
const cliQuery = process.argv.slice(2).join(' ').trim();
const pickSampleQuery = () => sampleQueries[Math.floor(Math.random() * sampleQueries.length)];
const query = cliQuery ? cliQuery : pickSampleQuery();

console.log(`Searching DuckDuckGo for: "${query}"\n`);

try {
  const { results: hits, proxy: transport, fromCache: servedFromCache } = await searchWeb(query, 5);
  const cacheNote = servedFromCache ? ' (cache hit)' : '';
  console.log(`Proxy used: ${transport || 'none'}${cacheNote}`);

  // An empty result set is not an error: report it and exit successfully.
  if (!hits.length) {
    console.log('No results returned.');
    process.exit(0);
  }

  for (const [position, hit] of hits.entries()) {
    console.log(`${position + 1}. ${hit.title}`);
    console.log(` ${hit.url}`);
    if (hit.snippet) {
      console.log(` ${hit.snippet}`);
    }
    console.log('');
  }
} catch (failure) {
  // Print the message (and the error code when one is attached), then exit non-zero.
  console.error('Search failed:', failure.message);
  if (failure.code) {
    console.error('Error code:', failure.code);
  }
  process.exit(1);
}

View File

@@ -2,7 +2,7 @@ import { Client, GatewayIntentBits, Partials, ChannelType } from 'discord.js';
import { config } from './config.js';
import { chatCompletion } from './openai.js';
import { appendShortTerm, prepareContext, recordInteraction } from './memory.js';
import { searchWeb, appendSearchLog } from './search.js';
import { searchWeb, appendSearchLog, detectFilteredPhrase } from './search.js';
const client = new Client({
intents: [
@@ -78,9 +78,6 @@ function isInstructionOverrideAttempt(text) {
return instructionOverridePatterns.some((pattern) => pattern.test(text));
}
const lastSearchByUser = new Map();
const SEARCH_COOLDOWN_MS = 60 * 1000;
function wantsWebSearch(text) {
if (!text) return false;
const questionMarks = (text.match(/\?/g) || []).length;
@@ -90,15 +87,11 @@ function wantsWebSearch(text) {
async function maybeFetchLiveIntel(userId, text) {
if (!config.enableWebSearch) return null;
if (!wantsWebSearch(text)) return null;
const last = lastSearchByUser.get(userId) || 0;
if (Date.now() - last < SEARCH_COOLDOWN_MS) return null;
try {
const { results, proxy } = await searchWeb(text, 3);
if (!results.length) {
lastSearchByUser.set(userId, Date.now());
return { liveIntel: null, blockedSearchTerm: null, searchOutage: null };
}
lastSearchByUser.set(userId, Date.now());
const formatted = results
.map((entry, idx) => `${idx + 1}. ${entry.title} (${entry.url}) — ${entry.snippet}`)
.join('\n');
@@ -108,8 +101,8 @@ async function maybeFetchLiveIntel(userId, text) {
if (error?.code === 'SEARCH_BLOCKED') {
return { liveIntel: null, blockedSearchTerm: error.blockedTerm || 'that topic', searchOutage: null };
}
if (error?.code === 'SEARCH_PROXY_UNAVAILABLE') {
return { liveIntel: null, blockedSearchTerm: null, searchOutage: 'proxy_outage' };
if (error?.code === 'SEARCH_NETWORK_UNAVAILABLE') {
return { liveIntel: null, blockedSearchTerm: null, searchOutage: 'search_outage' };
}
console.warn('[bot] Failed to fetch live intel:', error);
return { liveIntel: null, blockedSearchTerm: null, searchOutage: null };
@@ -138,19 +131,19 @@ function composeDynamicPrompt({ incomingText, shortTerm, hasLiveIntel = false, b
}
if (searchCueRegex.test(incomingText)) {
directives.push('User wants something “googled.” Let them know you can check DuckDuckGo and share what you find.');
directives.push('User wants something “googled.” Offer to run a quick Google search and share what you find.');
}
if (hasLiveIntel) {
directives.push('Live intel is attached below—cite it naturally ("DuckDuckGo found...") before riffing.');
directives.push('Live intel is attached below—cite it naturally ("Google found...") before riffing.');
}
if (blockedSearchTerm) {
directives.push(`User tried to trigger a DuckDuckGo lookup for a blocked topic ("${blockedSearchTerm}"). Politely refuse to search that subject and steer the chat elsewhere.`);
directives.push(`User tried to trigger a Google lookup for a blocked topic ("${blockedSearchTerm}"). Politely refuse to search that subject and steer the chat elsewhere.`);
}
if (searchOutage) {
directives.push('DuckDuckGo proxy network is down. If they ask for a lookup, apologize, explain the outage, and keep chatting without live data.');
directives.push('Google search is currently unavailable. If they ask for a lookup, apologize, explain the outage, and keep chatting without live data.');
}
const lastUserMessage = [...shortTerm].reverse().find((entry) => entry.role === 'user');
@@ -202,11 +195,11 @@ async function buildPrompt(userId, incomingText, options = {}) {
'System: Nova is awake, engaged, and reacts in real time. Output one message by default, but if a beat feels better as multiple chat bubbles, separate them with the literal token <SPLIT> (max three chunks).',
'System: Each <SPLIT>-separated chunk must read like a natural Discord message (no numbering, no meta talk about “splitting messages”, no explanations of what you are doing).',
'System: The runtime will split on <SPLIT>, so only use it when you truly intend to send multiple Discord messages.',
'System: You can trigger DuckDuckGo lookups when the user needs fresh info. Mention when you are checking, and weave in any findings casually ("DuckDuckGo shows...").',
'System: You can trigger Google lookups when the user needs fresh info. Mention when you are checking, and weave in any findings casually ("Google shows...").',
'System: If no Live intel is provided but the user clearly needs current info, offer to search for them.',
searchOutage ? 'System: DuckDuckGo proxy access is currently offline; be transparent about the outage and continue without searching until it returns.' : null,
searchOutage ? 'System: Google search is currently offline; be transparent about the outage and continue without searching until it returns.' : null,
dynamicDirectives,
liveIntel ? `Live intel (DuckDuckGo):\n${liveIntel}` : null,
liveIntel ? `Live intel (Google):\n${liveIntel}` : null,
'Example vibe: Nova: Heyyaaa. whats up? | John: Good morning Nova. | Luna: amazing lol. ill beat your ass now :3',
`Long-term summary: ${summaryLine}`,
'Relevant past memories:',
@@ -280,6 +273,7 @@ client.on('messageCreate', async (message) => {
const userId = message.author.id;
const cleaned = cleanMessageContent(message) || message.content;
const overrideAttempt = isInstructionOverrideAttempt(cleaned);
const bannedTopic = await detectFilteredPhrase(cleaned);
try {
if (message.channel?.sendTyping) {
@@ -296,6 +290,14 @@ client.on('messageCreate', async (message) => {
return;
}
if (bannedTopic) {
const refusal = `Can't go there. The topic you mentioned is off-limits, so let's switch gears.`;
await appendShortTerm(userId, 'assistant', refusal);
await recordInteraction(userId, cleaned, refusal);
await deliverReplies(message, [refusal]);
return;
}
const intelMeta = (await maybeFetchLiveIntel(userId, cleaned)) || {
liveIntel: null,
blockedSearchTerm: null,

View File

@@ -17,8 +17,6 @@ export const config = {
embedModel: process.env.OPENAI_EMBED_MODEL || 'text-embedding-3-small',
preferredChannel: process.env.BOT_CHANNEL_ID || null,
enableWebSearch: process.env.ENABLE_WEB_SEARCH !== 'false',
proxyPoolRefreshMs: Number(process.env.PROXY_POOL_REFRESH_MS || 10 * 60 * 1000),
proxyPoolMaxAttempts: Number(process.env.PROXY_POOL_ATTEMPTS || 5),
coderUserId: process.env.CODER_USER_ID || null,
maxCoderPingIntervalMs: 6 * 60 * 60 * 1000,
shortTermLimit: 10,

View File

@@ -1,8 +1,6 @@
import { load as loadHtml } from 'cheerio';
import { promises as fs } from 'fs';
import path from 'path';
import { ProxyAgent } from 'undici';
import { config } from './config.js';
const logFile = path.resolve('data', 'search.log');
const filterFile = path.resolve('data', 'filter.txt');
@@ -10,15 +8,8 @@ const filterFile = path.resolve('data', 'filter.txt');
// In-memory store for search results.
// NOTE(review): presumably read/written by getCache/setCache, which are not visible here — confirm.
const cache = new Map();
// Lifetimes in milliseconds.
// NOTE(review): presumably the expiry windows for `cache` and `cachedFilters` respectively — confirm at the use sites.
const CACHE_TTL_MS = 10 * 60 * 1000; // 10 minutes
const FILTER_CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
// Page that fetchFreeProxyList() downloads and scrapes for proxy candidates.
const FREE_PROXY_LIST_URL = 'https://free-proxy-list.net/en/';
// Accepts "a.b.c.d:port" strings: four 1-3 digit groups and a 2-5 digit port.
// Digit ranges are not validated (e.g. 999.999.999.999:99999 passes).
const PROXY_LINE_REGEX = /^\d{1,3}(?:\.\d{1,3}){3}:\d{2,5}$/;
// How long a freshly loaded proxy pool stays valid; falls back to 10 minutes when the config value is falsy.
const PROXY_REFRESH_MS = config.proxyPoolRefreshMs || 10 * 60 * 1000;
// Upper bound on proxy attempts per request; falsy config falls back to 5 and the result is never below 1.
const PROXY_MAX_ATTEMPTS = Math.max(1, config.proxyPoolMaxAttempts || 5);
// Cached filter terms plus an `expires` value.
// NOTE(review): presumably an epoch-ms deadline maintained by loadBlockedTerms — confirm.
let cachedFilters = { terms: [], expires: 0 };
// Current "ip:port" proxy entries; replaced by hydrateProxyPool() and pruned by removeProxyFromPool().
let proxyPool = [];
// Epoch-ms timestamp after which ensureProxyPool() reloads the pool (0 forces a reload).
let proxyPoolExpires = 0;
// Index of the next proxy getProxyInfo() hands out; advanced modulo the pool size.
let proxyCursor = 0;
function makeCacheKey(query) {
return query.trim().toLowerCase();
@@ -47,7 +38,9 @@ function sanitizeText(text) {
/**
 * Turns an href scraped from a DuckDuckGo results page into an absolute URL.
 *
 * - Falsy input yields an empty string.
 * - Hrefs that already carry an explicit `http://` or `https://` scheme are returned untouched.
 * - Protocol-relative hrefs (`//host/path`) are given an `https:` scheme instead of being
 *   appended to the DuckDuckGo origin.
 * - Anything else is treated as a path on `https://duckduckgo.com`, inserting the separating
 *   slash when the href does not start with one.
 *
 * @param {string} href - Raw href attribute value.
 * @returns {string} Absolute URL, or `''` when no href was supplied.
 */
function absoluteUrl(href) {
  if (!href) return '';
  // Match the full scheme: a bare `startsWith('http')` would also accept paths like "httpfoo".
  if (href.startsWith('http://') || href.startsWith('https://')) {
    return href;
  }
  if (href.startsWith('//')) {
    return `https:${href}`;
  }
  const separator = href.startsWith('/') ? '' : '/';
  return `https://duckduckgo.com${separator}${href}`;
}
@@ -72,10 +65,6 @@ async function loadBlockedTerms() {
}
}
/**
 * Public entry point for the keyword filter: checks arbitrary text against the blocked terms.
 *
 * @param {string} text - Text to inspect (for example a chat message).
 * @returns {Promise<string|null>} Whatever `findBlockedTerm` resolves to: the matching blocked
 *   term, or `null` when nothing matches.
 */
export async function detectFilteredPhrase(text) {
  const matchedTerm = await findBlockedTerm(text);
  return matchedTerm;
}
async function findBlockedTerm(query) {
if (!query) return null;
const lowered = query.toLowerCase();
@@ -83,6 +72,10 @@ async function findBlockedTerm(query) {
return terms.find((term) => lowered.includes(term)) || null;
}
/**
 * Public entry point for the keyword filter: checks arbitrary text against the blocked terms.
 *
 * @param {string} text - Text to inspect (for example a chat message).
 * @returns {Promise<string|null>} Whatever `findBlockedTerm` resolves to: the matching blocked
 *   term, or `null` when nothing matches.
 */
export async function detectFilteredPhrase(text) {
  const matchedTerm = await findBlockedTerm(text);
  return matchedTerm;
}
function createBlockedError(term) {
const error = new Error('Search blocked by filter');
error.code = 'SEARCH_BLOCKED';
@@ -90,166 +83,13 @@ function createBlockedError(term) {
return error;
}
function createProxyUnavailableError(reason) {
const error = new Error(reason || 'Proxy network unavailable');
error.code = 'SEARCH_PROXY_UNAVAILABLE';
function createSearchUnavailableError(reason) {
const error = new Error(reason || 'Search network unavailable');
error.code = 'SEARCH_NETWORK_UNAVAILABLE';
return error;
}
/**
 * Cleans a list of raw proxy strings.
 *
 * Each entry is trimmed, entries that do not match `PROXY_LINE_REGEX` are dropped, and
 * duplicates are removed while keeping first-seen order.
 *
 * @param {string[]|null|undefined} entries - Raw "ip:port" candidates.
 * @returns {string[]} Unique, well-formed entries; empty when `entries` is missing or empty.
 */
function normalizeProxyEntries(entries) {
  if (!entries?.length) {
    return [];
  }
  const wellFormed = entries
    .map((line) => line.trim())
    .filter((line) => PROXY_LINE_REGEX.test(line));
  // A Set keeps insertion order, so the first occurrence of each proxy wins.
  return [...new Set(wellFormed)];
}
/**
 * Drops a proxy from the module-level pool.
 *
 * When the pool ends up empty, the expiry and cursor are reset to 0 so that the next
 * `ensureProxyPool()` call reloads the list.
 *
 * @param {string|null|undefined} proxy - "ip:port" entry to remove; falsy values are ignored.
 * @returns {void}
 */
function removeProxyFromPool(proxy) {
  if (proxy) {
    const survivors = proxyPool.filter((candidate) => candidate !== proxy);
    proxyPool = survivors;
    if (!survivors.length) {
      proxyCursor = 0;
      proxyPoolExpires = 0;
    }
  }
}
/**
 * Downloads the free-proxy-list.net page and extracts usable proxies from its table.
 *
 * A row is kept when it has an IP (column 0) and port (column 1), its column 6 reads "yes",
 * and its column 4 does not mention "transparent". The collected "ip:port" strings are then
 * passed through `normalizeProxyEntries`.
 *
 * @returns {Promise<string[]>} Normalized "ip:port" entries (possibly empty).
 * @throws {Error} When the page responds with a non-OK HTTP status.
 */
async function fetchFreeProxyList() {
  const requestHeaders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0 Safari/537.36',
    Accept: 'text/html',
  };
  const response = await fetch(FREE_PROXY_LIST_URL, { headers: requestHeaders });
  if (!response.ok) {
    throw new Error(`Failed to fetch free-proxy-list.net feed (HTTP ${response.status})`);
  }

  const $ = loadHtml(await response.text());
  const rows = $('table.table.table-striped.table-bordered').first().find('tbody tr');
  const candidates = [];

  rows.each((_, row) => {
    const cells = $(row).find('td');
    if (!cells?.length) {
      return;
    }
    // Trimmed text content of the cell at the given column index.
    const textAt = (column) => $(cells[column]).text().trim();
    const ip = textAt(0);
    const port = textAt(1);
    const isTransparent = textAt(4).toLowerCase().includes('transparent');
    const supportsHttps = textAt(6).toLowerCase() === 'yes';
    if (ip && port && supportsHttps && !isTransparent) {
      candidates.push(`${ip}:${port}`);
    }
  });

  return normalizeProxyEntries(candidates);
}
/**
 * Reloads the module-level proxy pool from `fetchFreeProxyList()`.
 *
 * On success the pool is replaced, the cursor is reset to 0 and the expiry is set to
 * `PROXY_REFRESH_MS` from now. Any failure, including an empty list, is logged and rethrown
 * as the error built by `createProxyUnavailableError`.
 *
 * @returns {Promise<void>}
 * @throws {Error} Proxy-unavailable error carrying the underlying failure message.
 */
async function hydrateProxyPool() {
  try {
    const freshProxies = await fetchFreeProxyList();
    if (!freshProxies.length) {
      throw new Error('free-proxy-list.net returned zero usable entries');
    }
    proxyPool = freshProxies;
    proxyCursor = 0;
    proxyPoolExpires = Date.now() + PROXY_REFRESH_MS;
    console.info(`[search] Loaded ${freshProxies.length} proxies from free-proxy-list.net`);
  } catch (failure) {
    console.warn(`[search] Free proxy source failed: ${failure.message}`);
    throw createProxyUnavailableError(failure?.message || 'Proxy list unavailable');
  }
}
/**
 * Makes sure a usable proxy pool is loaded, reloading it via `hydrateProxyPool()` when the
 * pool is empty or its expiry timestamp has passed.
 *
 * @returns {Promise<void>}
 */
async function ensureProxyPool() {
  const poolIsFresh = proxyPool.length > 0 && Date.now() < proxyPoolExpires;
  if (!poolIsFresh) {
    await hydrateProxyPool();
  }
}
/**
 * Hands out the next proxy from the pool in rotation.
 *
 * The pool is refreshed first via `ensureProxyPool()`. The entry at the current cursor is
 * selected and the cursor advances by one, wrapping at the pool size.
 *
 * @returns {Promise<{proxy: string, agent: ProxyAgent}>} The chosen "ip:port" entry and a
 *   `ProxyAgent` built for `http://<proxy>`.
 * @throws {Error} Proxy-unavailable error when the pool is empty after the refresh.
 */
async function getProxyInfo() {
  await ensureProxyPool();

  const poolSize = proxyPool.length;
  if (!poolSize) {
    throw createProxyUnavailableError('Proxy pool empty');
  }

  const selectedIndex = proxyCursor % poolSize;
  proxyCursor = (proxyCursor + 1) % poolSize;

  const proxy = proxyPool[selectedIndex];
  const agent = new ProxyAgent(`http://${proxy}`);
  return { proxy, agent };
}
/**
 * Fetches a page through the proxy pool, retrying with a different proxy on failure.
 *
 * Up to `PROXY_MAX_ATTEMPTS` attempts are made. Each attempt takes a proxy from
 * `getProxyInfo()` and uses its agent as the fetch dispatcher. A failed attempt (thrown error
 * or non-OK status) removes that proxy from the pool before the next try.
 *
 * @param {string} url - Address to request.
 * @param {object} headers - Request headers forwarded to `fetch`.
 * @returns {Promise<{html: string, proxy: string|null}>} Response body and the proxy that served it.
 * @throws {Error} Proxy-unavailable error carrying the last failure message once all attempts fail.
 */
async function fetchDuckDuckGoHtml(url, headers) {
  let mostRecentFailure = null;
  let attemptsLeft = PROXY_MAX_ATTEMPTS;

  while (attemptsLeft > 0) {
    attemptsLeft -= 1;
    let current = null;
    try {
      current = await getProxyInfo();
      const response = await fetch(url, { headers, dispatcher: current.agent });
      if (!response.ok) {
        throw new Error(`DuckDuckGo request failed (${response.status})`);
      }
      const html = await response.text();
      return { html, proxy: current?.proxy || null };
    } catch (failure) {
      mostRecentFailure = failure;
      // Only evict a proxy if one was actually obtained for this attempt.
      if (current?.proxy) {
        removeProxyFromPool(current.proxy);
      }
    }
  }

  throw createProxyUnavailableError(mostRecentFailure?.message || 'All proxies failed');
}
export async function searchWeb(query, limit = 3) {
if (!query?.trim()) {
return { results: [], proxy: null, fromCache: false };
}
const blockedTerm = await findBlockedTerm(query);
if (blockedTerm) {
throw createBlockedError(blockedTerm);
}
const cached = getCache(query);
if (cached) {
return { results: cached, proxy: 'cache', fromCache: true };
}
const params = new URLSearchParams({ q: query, kl: 'us-en' });
const headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36',
Accept: 'text/html',
};
let html;
let proxyLabel = null;
try {
const { html: fetchedHtml, proxy } = await fetchDuckDuckGoHtml(`https://duckduckgo.com/html/?${params.toString()}`, headers);
html = fetchedHtml;
proxyLabel = proxy || 'proxy-unknown';
} catch (error) {
if (error?.code === 'SEARCH_PROXY_UNAVAILABLE') {
throw error;
}
console.warn('[search] DuckDuckGo request failed:', error);
return { results: [], proxy: null, fromCache: false };
}
function parseDuckDuckGoResults(html, limit) {
const $ = loadHtml(html);
const results = [];
@@ -264,15 +104,55 @@ export async function searchWeb(query, limit = 3) {
return undefined;
});
return results;
}
/**
 * Runs a web lookup against DuckDuckGo's HTML endpoint and returns the parsed results.
 *
 * Order of operations: blank queries short-circuit with no results, then the query is
 * checked against the blocked-term filter, then the cache is consulted, and only then is
 * the network hit. Fresh results are stored via `setCache` before being returned.
 *
 * @param {string} query - Search text.
 * @param {number} [limit=3] - Maximum number of results, forwarded to the HTML parser.
 * @returns {Promise<{results: Array, proxy: string, fromCache: boolean}>} Parsed results plus
 *   a transport label (`'duckduckgo'` for a live request or blank query, `'duckduckgo-cache'`
 *   for a cache hit).
 * @throws {Error} The error from `createBlockedError` (code `SEARCH_BLOCKED`) when the query
 *   contains a blocked term.
 * @throws {Error} The error from `createSearchUnavailableError` (code
 *   `SEARCH_NETWORK_UNAVAILABLE`) when the request fails, returns a non-OK status, or its body
 *   cannot be read.
 */
export async function searchWeb(query, limit = 3) {
  if (!query?.trim()) {
    return { results: [], proxy: 'duckduckgo', fromCache: false };
  }

  const blockedTerm = await findBlockedTerm(query);
  if (blockedTerm) {
    throw createBlockedError(blockedTerm);
  }

  const cached = getCache(query);
  if (cached) {
    return { results: cached, proxy: 'duckduckgo-cache', fromCache: true };
  }

  const params = new URLSearchParams({ q: query, kl: 'us-en' });
  let response;
  try {
    response = await fetch(`https://duckduckgo.com/html/?${params.toString()}`, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36',
        Accept: 'text/html',
      },
    });
  } catch (error) {
    console.warn('[search] DuckDuckGo request failed:', error?.message ?? error);
    throw createSearchUnavailableError('DuckDuckGo request failed');
  }

  if (!response.ok) {
    console.warn(`[search] DuckDuckGo request failed with status ${response.status}`);
    throw createSearchUnavailableError(`DuckDuckGo response ${response.status}`);
  }

  // Reading the body can still fail mid-stream; report that as an outage as well so callers
  // see one consistent error code for every network-level failure.
  let html;
  try {
    html = await response.text();
  } catch (error) {
    console.warn('[search] DuckDuckGo response body could not be read:', error?.message ?? error);
    throw createSearchUnavailableError('DuckDuckGo response unreadable');
  }

  const results = parseDuckDuckGoResults(html, limit);
  setCache(query, results);
  return { results, proxy: 'duckduckgo', fromCache: false };
}
export async function appendSearchLog({ userId, query, results, proxy }) {
try {
await fs.mkdir(path.dirname(logFile), { recursive: true });
const timestamp = new Date().toISOString();
const proxyTag = proxy || 'direct';
const proxyTag = proxy || 'duckduckgo';
const lines = [
`time=${timestamp} user=${userId} proxy=${proxyTag} query=${JSON.stringify(query)}`,
...results.map((entry, idx) => ` ${idx + 1}. ${entry.title} :: ${entry.url} :: ${entry.snippet}`),