Skip to main content

Type Definition

/**
 * Options accepted by `reader.scrape()`.
 *
 * Only `urls` is required; every other field is optional and uses the
 * default noted on it when omitted.
 */
interface ScrapeOptions {
  // Required
  /** Array of URLs to scrape. */
  urls: string[];

  // Output
  /**
   * Output formats to include.
   * @defaultValue `["markdown"]`
   */
  formats?: Array<"markdown" | "html">;

  // Content extraction
  /**
   * Extract only main content, removing nav/header/footer.
   * @defaultValue `true`
   */
  onlyMainContent?: boolean;
  /**
   * CSS selectors for elements to keep.
   * @defaultValue `[]`
   */
  includeTags?: string[];
  /**
   * CSS selectors for elements to remove.
   * @defaultValue `[]`
   */
  excludeTags?: string[];
  /**
   * Remove ad and tracking elements.
   * @defaultValue `true`
   */
  removeAds?: boolean;
  /**
   * Remove base64-encoded images.
   * @defaultValue `true`
   */
  removeBase64Images?: boolean;

  // Request configuration
  /**
   * Custom user agent string.
   * @defaultValue `undefined`
   */
  userAgent?: string;
  /**
   * Custom headers for requests.
   * @defaultValue `undefined`
   */
  headers?: Record<string, string>;
  /**
   * Request timeout per page, in milliseconds.
   * @defaultValue `30000`
   */
  timeoutMs?: number;
  /**
   * CSS selector to wait for before extraction.
   * @defaultValue `undefined`
   */
  waitForSelector?: string;
  /**
   * Skip TLS/SSL certificate verification.
   * @defaultValue `true`
   */
  skipTLSVerification?: boolean;

  // URL filtering
  /**
   * URL patterns to include (regex).
   * @defaultValue `[]`
   */
  includePatterns?: string[];
  /**
   * URL patterns to exclude (regex).
   * @defaultValue `[]`
   */
  excludePatterns?: string[];

  // Batch processing
  /**
   * Number of URLs to process in parallel.
   * @defaultValue `1`
   */
  batchConcurrency?: number;
  /**
   * Timeout for the entire batch, in milliseconds.
   * @defaultValue `300000`
   */
  batchTimeoutMs?: number;
  /**
   * Max retry attempts for failed URLs.
   * @defaultValue `2`
   */
  maxRetries?: number;
  /**
   * Progress callback; receives a `Progress` value.
   * @defaultValue `undefined`
   */
  onProgress?: (progress: Progress) => void;

  // Proxy
  /**
   * Proxy configuration.
   * @defaultValue `undefined`
   */
  proxy?: ProxyConfig;

  // Debugging
  /**
   * Enable verbose logging.
   * @defaultValue `false`
   */
  verbose?: boolean;
  /**
   * Show browser window.
   * @defaultValue `false`
   */
  showChrome?: boolean;
}

Options Reference

Required Options

| Option | Type | Description |
| --- | --- | --- |
| `urls` | `string[]` | Array of URLs to scrape |

Output Options

| Option | Type | Default | Description |
| --- | --- | --- | --- |
| `formats` | `Array<"markdown" \| "html">` | `["markdown"]` | Output formats to include |

Content Extraction Options

| Option | Type | Default | Description |
| --- | --- | --- | --- |
| `onlyMainContent` | `boolean` | `true` | Extract only main content, removing nav/header/footer |
| `includeTags` | `string[]` | `[]` | CSS selectors for elements to keep |
| `excludeTags` | `string[]` | `[]` | CSS selectors for elements to remove |
| `removeAds` | `boolean` | `true` | Remove ad and tracking elements |
| `removeBase64Images` | `boolean` | `true` | Remove base64-encoded images |

Request Configuration Options

| Option | Type | Default | Description |
| --- | --- | --- | --- |
| `userAgent` | `string` | `undefined` | Custom user agent string |
| `headers` | `Record<string, string>` | `undefined` | Custom headers for requests |
| `timeoutMs` | `number` | `30000` | Request timeout per page (ms) |
| `waitForSelector` | `string` | `undefined` | CSS selector to wait for before extraction |
| `skipTLSVerification` | `boolean` | `true` | Skip TLS/SSL certificate verification |

URL Filtering Options

| Option | Type | Default | Description |
| --- | --- | --- | --- |
| `includePatterns` | `string[]` | `[]` | URL patterns to include (regex) |
| `excludePatterns` | `string[]` | `[]` | URL patterns to exclude (regex) |

Batch Processing Options

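| Option | Type | Default | Description |
| --- | --- | --- | --- |
| `batchConcurrency` | `number` | `1` | URLs to process in parallel |
| `batchTimeoutMs` | `number` | `300000` | Timeout for entire batch (ms) |
| `maxRetries` | `number` | `2` | Max retry attempts for failed URLs |
| `onProgress` | `(progress: Progress) => void` | `undefined` | Progress callback |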

Proxy Options

| Option | Type | Default | Description |
| --- | --- | --- | --- |
| `proxy` | `ProxyConfig` | `undefined` | Proxy configuration |

Debugging Options

| Option | Type | Default | Description |
| --- | --- | --- | --- |
| `verbose` | `boolean` | `false` | Enable verbose logging |
| `showChrome` | `boolean` | `false` | Show browser window |

ProxyConfig

/**
 * Proxy settings supplied through `ScrapeOptions.proxy`.
 *
 * Every field is optional.
 * NOTE(review): presumably `url` is an alternative to giving
 * `host`/`port`/`username`/`password` separately — confirm which takes
 * precedence when both are set.
 */
interface ProxyConfig {
  /** Full proxy URL. */
  url?: string;
  /** Proxy type: `"datacenter"` or `"residential"`. */
  type?: "datacenter" | "residential";
  /** Proxy host. */
  host?: string;
  /** Proxy port. */
  port?: number;
  /** Username for the proxy. */
  username?: string;
  /** Password for the proxy. */
  password?: string;
  /** Country code (e.g., "us"). */
  country?: string;
}

Progress Callback

/**
 * Value passed to the `onProgress` callback of `ScrapeOptions`.
 */
interface Progress {
  /** Number of URLs completed. */
  completed: number;
  /** Total number of URLs. */
  total: number;
  /** Current URL being processed. */
  currentUrl: string;
}

Examples

Basic

// Minimal call: `urls` is the only required option; all others are omitted.
await reader.scrape({
  urls: ["https://example.com"],
});

Full Options

// Scrape two pages while overriding a selection of the optional settings.
await reader.scrape({
  // Pages to fetch
  urls: ["https://example.com", "https://example.org"],

  // Output and content extraction
  formats: ["markdown", "html"],
  onlyMainContent: true,
  excludeTags: [".comments", ".sidebar"],

  // Per-page timeout and batch behaviour
  timeoutMs: 60000,
  batchConcurrency: 3,
  maxRetries: 3,

  // Progress reporting and logging
  onProgress: ({ completed, total }) => {
    console.log(`${completed}/${total}`);
  },
  verbose: true,
});