# IndiaSearch MVP
# Single-file prototype with:
# 1) Simple crawler + indexer (SQLite FTS5)
# 2) Flask web app with a search UI
# 3) Language hinting (basic Devanagari detection)
#
# Save this file as `indian_search_mvp.py` and follow the instructions below.

"""
Instructions to run:

1. Create a virtualenv (recommended) and install dependencies:

   python -m venv venv
   source venv/bin/activate   # Linux / macOS
   venv\Scripts\activate      # Windows
   pip install -r requirements.txt

   # requirements.txt contents:
   # Flask
   # requests
   # beautifulsoup4

   Or simply: pip install Flask requests beautifulsoup4

2. Edit the SEED_URLS list near the top of this file to include Indian pages
   you want indexed (.in domains, news, govt pages, etc.).

3. To build the index (first run) and start the web server, run:

   python indian_search_mvp.py --reindex

   This will crawl the seeds, index their content in a local SQLite file
   `search_index.db`, and start the Flask app.

4. Open http://127.0.0.1:5000 in your browser and use the search box.

Notes / Limitations:
- This is a prototype: no distributed crawling, no politeness features, no robots.txt checking.
- Search is backed by the SQLite FTS5 MATCH operator (full-text, not fuzzy, matching); for production use, replace it with Elasticsearch/OpenSearch.
- Language detection is rudimentary (a Devanagari character check). Replace it with proper NLP libraries for production.
"""

# --------------- CODE ---------------

import argparse
import re
import sqlite3
import time
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from flask import Flask, g, redirect, render_template_string, request, url_for

# ----------------- Configuration -----------------
DB_FILE = 'search_index.db'
SEED_URLS = [
    'https://www.india.gov.in/',
    'https://www.mygov.in/',
    'https://www.freepressjournal.in/',
    'https://www.thehindu.com/',
    'https://www.bbc.com/hindi',
    # Add more Indian sites (.in, regional news, govt pages, blogs)
]
USER_AGENT = 'IndiaSearch-MVP/0.1 (+https://example.local)'
CRAWL_DELAY = 1.0  # seconds between requests (be polite)

# ----------------- Utilities ---------------------
DEVANAGARI_RE = re.compile(r'[\u0900-\u097F]')


def detect_language_from_text(text: str) -> str:
    """Very simple heuristic: any Devanagari character -> 'hi', else 'en'."""
    if DEVANAGARI_RE.search(text):
        return 'hi'
    return 'en'


# ----------------- Indexer / DB ------------------
def get_db():
    """Per-request SQLite connection, stored on Flask's `g` object."""
    db = getattr(g, '_database', None)
    if db is None:
        db = g._database = sqlite3.connect(DB_FILE)
        db.row_factory = sqlite3.Row
    return db


def init_db():
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    # FTS5 virtual table for full-text search.
    # url and language are stored but not tokenized (UNINDEXED).
    c.execute("""
        CREATE VIRTUAL TABLE IF NOT EXISTS docs USING fts5(
            url UNINDEXED,
            title,
            content,
            language UNINDEXED
        );
    """)
    # Simple table to store crawl metadata.
    c.execute("""
        CREATE TABLE IF NOT EXISTS meta (
            url TEXT PRIMARY KEY,
            last_crawled INTEGER,
            status TEXT
        );
    """)
    conn.commit()
    conn.close()


def index_document(url: str, title: str, content: str, language: str):
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    # Delete any existing entry for the same url, then insert fresh.
    c.execute('DELETE FROM docs WHERE url = ?', (url,))
    c.execute('INSERT INTO docs (url, title, content, language) VALUES (?, ?, ?, ?)',
              (url, title, content, language))
    c.execute('REPLACE INTO meta (url, last_crawled, status) VALUES (?, ?, ?)',
              (url, int(time.time()), 'ok'))
    conn.commit()
    conn.close()


# ----------------- Crawler -----------------------
def fetch_page(url: str, timeout=10):
    headers = {'User-Agent': USER_AGENT}
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
        return resp.text
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
        return None
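# ----------------- Robots.txt check (sketch) ------------------
# The Notes above list "no robots.txt checking" as a limitation. Below is a
# minimal sketch of such a check using the standard-library robotparser. It is
# NOT wired into fetch_page or crawl_and_index; the helper name and the cache
# dict are illustrative additions, not part of the original design.
from urllib.robotparser import RobotFileParser

_ROBOTS_CACHE = {}  # netloc -> RobotFileParser, or None if robots.txt was unreachable


def allowed_by_robots(url: str) -> bool:
    """Return True if the host's robots.txt permits our user agent to fetch url."""
    parts = urlparse(url)
    if parts.netloc not in _ROBOTS_CACHE:
        rp = RobotFileParser()
        rp.set_url(f'{parts.scheme}://{parts.netloc}/robots.txt')
        try:
            rp.read()
        except Exception:
            rp = None  # network error fetching robots.txt: treat as unrestricted
        _ROBOTS_CACHE[parts.netloc] = rp
    rp = _ROBOTS_CACHE[parts.netloc]
    return True if rp is None else rp.can_fetch(USER_AGENT, url)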
def extract_text_from_html(html: str) -> tuple[str, str]:
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.title.string.strip() if soup.title and soup.title.string else ''
    # Remove scripts, styles, and page chrome before extracting text.
    for s in soup(['script', 'style', 'noscript', 'header', 'footer', 'nav']):
        s.decompose()
    text = soup.get_text(separator=' ', strip=True)
    # Collapse whitespace.
    text = re.sub(r'\s+', ' ', text)
    return title, text


def is_same_domain(seed: str, url: str) -> bool:
    # Helper for future link-following; unused in the seed-only crawl below.
    try:
        return urlparse(seed).netloc == urlparse(url).netloc
    except Exception:
        return False


def crawl_and_index(seeds: list):
    print(f"Starting crawl of {len(seeds)} seeds...")
    for u in seeds:
        print(f"Crawling: {u}")
        html = fetch_page(u)
        if not html:
            continue
        title, text = extract_text_from_html(html)
        language = detect_language_from_text(title + '\n' + text)
        index_document(u, title, text, language)
        print(f"Indexed: {u} (lang={language})")
        time.sleep(CRAWL_DELAY)
    print("Crawl complete.")


# ----------------- Flask App ---------------------
app = Flask(__name__)

# Minimal inline template for the search UI: a query box, a language filter,
# and a result list (title link, snippet, language tag).
SEARCH_TEMPLATE = """
<!doctype html>
<html>
<head><meta charset="utf-8"><title>IndiaSearch - MVP</title></head>
<body>
  <h1>IndiaSearch - MVP</h1>
  <form action="{{ url_for('search') }}" method="get">
    <input type="text" name="q" value="{{ q }}" placeholder="Search...">
    <select name="lang">
      <option value="" {% if not lang %}selected{% endif %}>All languages</option>
      <option value="en" {% if lang == 'en' %}selected{% endif %}>English</option>
      <option value="hi" {% if lang == 'hi' %}selected{% endif %}>Hindi</option>
    </select>
    <button type="submit">Search</button>
  </form>
  {% if results is not none %}
    <p>{{ results|length }} result(s)</p>
    <ul>
    {% for r in results %}
      <li>
        <a href="{{ r.url }}">{{ r.title or r.url }}</a>
        <small>[{{ r.language }}]</small>
        <p>{{ r.snippet }}</p>
      </li>
    {% endfor %}
    </ul>
  {% endif %}
</body>
</html>
"""


@app.teardown_appcontext
def close_connection(exception):
    db = getattr(g, '_database', None)
    if db is not None:
        db.close()


def search_index(query: str, lang: str = None, limit=20):
    # Uses the per-request connection from get_db(); Flask's teardown closes it.
    c = get_db().cursor()
    # FTS5 MATCH searches the indexed columns (title, content). Quote the query
    # as a single phrase so user input cannot break FTS5 syntax; embedded
    # double quotes are escaped by doubling them.
    q_escaped = '"' + query.replace('"', '""') + '"'
    sql = ("SELECT url, title, snippet(docs, 2, '', '', '...', 40) AS snippet, language "
           "FROM docs WHERE docs MATCH ?")
    params = [q_escaped]
    if lang:
        # Language filtering uses the stored (UNINDEXED) language column.
        sql += " AND language = ?"
        params.append(lang)
    sql += " LIMIT ?"
    params.append(limit)
    try:
        c.execute(sql, params)
        return [{'url': r['url'], 'title': r['title'],
                 'snippet': r['snippet'], 'language': r['language']}
                for r in c.fetchall()]
    except Exception as e:
        print('Search error:', e)
        return []


@app.route('/')
def home():
    return redirect(url_for('search'))


@app.route('/search')
def search():
    q = request.args.get('q', '')
    lang = request.args.get('lang', '') or None
    results = None
    if q:
        results = search_index(q, lang=lang)
    return render_template_string(SEARCH_TEMPLATE, q=q, lang=lang, results=results)


# ----------------- Command-line runner ----------------
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--reindex', action='store_true',
                        help='Crawl seeds and (re)build the index before starting server')
    parser.add_argument('--host', default='127.0.0.1')
    parser.add_argument('--port', default=5000, type=int)
    args = parser.parse_args()

    init_db()
    if args.reindex:
        crawl_and_index(SEED_URLS)

    print(f"Starting Flask app on {args.host}:{args.port}...")
    # Flask dev server -- for prototype only.
    app.run(host=args.host, port=args.port, debug=True)
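# ----------------- Appendix: multi-script detection (sketch) ----------------
# The Notes call the Devanagari check rudimentary. A cheap, dependency-free
# upgrade is to test several Indic Unicode script blocks before falling back
# to 'en'. This is kept as a commented sketch (code placed after the
# `if __name__ == '__main__'` block above never runs while the server is up).
# The ranges are standard Unicode blocks; mapping Devanagari to 'hi' is a
# heuristic, since the script also covers Marathi, Nepali, and others.
#
#   INDIC_SCRIPT_RES = {
#       'hi': re.compile(r'[\u0900-\u097F]'),  # Devanagari
#       'bn': re.compile(r'[\u0980-\u09FF]'),  # Bengali
#       'ta': re.compile(r'[\u0B80-\u0BFF]'),  # Tamil
#       'te': re.compile(r'[\u0C00-\u0C7F]'),  # Telugu
#   }
#
#   def detect_language_from_text(text: str) -> str:
#       for code, pattern in INDIC_SCRIPT_RES.items():
#           if pattern.search(text):
#               return code
#       return 'en'

# End of file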