Scrape one or more URLs
// Via ReaderClient (recommended)
const reader = new ReaderClient();
const result = await reader.scrape(options);

// Standalone function
import { scrape } from "@vakra-dev/reader";
const result = await scrape(options);
scrape(options: ScrapeOptions): Promise<ScrapeResult>
import { ReaderClient } from "@vakra-dev/reader";

// Basic usage: scrape a single URL, requesting markdown and HTML output.
const reader = new ReaderClient();
try {
  const result = await reader.scrape({
    urls: ["https://example.com"],
    formats: ["markdown", "html"],
  });

  // Only one URL was requested, so the example reads the first entry of `data`.
  console.log(result.data[0].markdown);
  console.log(result.data[0].metadata.website.title);
} finally {
  // Close the client even when scrape() rejects, so it is never left open.
  await reader.close();
}
Promise<ScrapeResult>
/** Value that the promise returned by `scrape()` resolves to. */
interface ScrapeResult {
  /** Scraped pages; the usage examples read individual entries by index (`data[0]`). */
  data: WebsiteScrapeResult[];
  /** Batch-level information; the examples read `successfulUrls`, `failedUrls` and `errors` from it. */
  batchMetadata: BatchMetadata;
}
// Pass several URLs in one call; the `formats` list is given once for the whole request.
const result = await reader.scrape({
  urls: ["https://example.com", "https://example.org"],
  formats: ["markdown", "html"],
});
// Batch scrape with an explicit concurrency setting and a progress callback.
const result = await reader.scrape({
  urls: manyUrls,
  batchConcurrency: 5,
  // The callback receives an object exposing `completed` and `total`.
  onProgress: (progress) => {
    console.log(`${progress.completed}/${progress.total}`);
  },
});
// Content-filtering options. Each option sits on its own line so that the
// trailing `// Default` note cannot comment out the options that follow it.
const result = await reader.scrape({
  urls: ["https://example.com"],
  onlyMainContent: true, // Default
  includeTags: [".article-content"],
  excludeTags: [".comments", ".sidebar"],
});
// Two separate timeouts: one applied per page, one for the batch as a whole.
// Each option sits on its own line so the trailing notes stay attached to the
// right option instead of commenting out the rest of the call.
const result = await reader.scrape({
  urls: manyUrls,
  timeoutMs: 60000, // Per page
  batchTimeoutMs: 300000, // Total batch
});
// Proxy settings are supplied as a nested `proxy` object.
const result = await reader.scrape({
  urls: ["https://example.com"],
  proxy: {
    host: "proxy.example.com",
    port: 8080,
    username: "user",
    password: "pass",
  },
});
import { ReaderClient, TimeoutError, NetworkError } from "@vakra-dev/reader";

// Error handling: tell the library's error classes apart with `instanceof`.
const reader = new ReaderClient();
try {
  const result = await reader.scrape({ urls: ["https://example.com"] });
  // ... use `result` here ...
} catch (error) {
  if (error instanceof TimeoutError) {
    console.log("Request timed out");
  } else if (error instanceof NetworkError) {
    console.log("Network error:", error.message);
  } else {
    // Not a failure this example knows how to handle: rethrow instead of
    // silently swallowing it.
    throw error;
  }
} finally {
  // Close the client whether the scrape succeeded or failed.
  await reader.close();
}
// A batch may contain URLs that fail; the outcome is reported on
// `batchMetadata` of the returned result.
const result = await reader.scrape({
  urls: ["https://valid.com", "https://invalid-url.example"],
});

const { successfulUrls, failedUrls, errors } = result.batchMetadata;
console.log("Successful:", successfulUrls);
console.log("Failed:", failedUrls);

// `errors` may be absent, so check it before iterating.
if (errors) {
  errors.forEach((failure) => {
    console.log(`${failure.url}: ${failure.error}`);
  });
}