Web scraping · February 21, 2025

Web Scraping Profesional con Puppeteer: Guía Completa 2025

Aprende a crear scrapers robustos y eficientes con Puppeteer. Técnicas avanzadas, ejemplos prácticos y mejores prácticas.

anthonvg

@anthonvg_

Web Scraping Profesional con Puppeteer

Puppeteer se ha convertido en una de las herramientas más potentes para web scraping y automatización web. En esta guía, aprenderás a crear scrapers eficientes y robustos que pueden manejar sitios web modernos y dinámicos.

Puppeteer es una biblioteca de Node.js que proporciona una API de alto nivel para controlar Chrome/Chromium. Es mantenida por el equipo de Chrome DevTools.

Configuración Inicial

Primero, configuremos nuestro proyecto:

# Create the project directory and move into it.
mkdir puppeteer-scraper
cd puppeteer-scraper
# Initialise the npm project accepting all defaults (-y), then install Puppeteer.
npm init -y
npm install puppeteer

Estructura Base del Scraper

Este es nuestro scraper base optimizado:

const puppeteer = require("puppeteer");
 
/**
 * Owns one Puppeteer browser and one page configured for scraping.
 *
 * Lifecycle: construct, `await initialize()`, use `scraper.page`, then
 * `await close()`. `close()` is safe to call at any point, including before
 * `initialize()` or after a failed `initialize()`, so callers can always put
 * it in a `finally` block.
 */
class WebScraper {
  constructor() {
    this.browser = null;
    this.page = null;
  }

  /**
   * Launches the browser, opens a page and installs a request filter that
   * aborts image, stylesheet and font requests.
   *
   * @returns {Promise<void>}
   * @throws Re-throws any launch/setup error; if the browser was already
   *   launched when the error happened it is closed first.
   */
  async initialize() {
    this.browser = await puppeteer.launch({
      // NOTE(review): recent Puppeteer releases document `headless: true` for
      // this mode — confirm against the installed version.
      headless: "new",
      args: ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage", "--disable-accelerated-2d-canvas", "--disable-gpu"],
      defaultViewport: { width: 1920, height: 1080 },
    });

    try {
      this.page = await this.browser.newPage();

      // Performance optimisation: skip resources that are not needed to read
      // the DOM.
      await this.page.setRequestInterception(true);
      this.page.on("request", (request) => {
        if (["image", "stylesheet", "font"].includes(request.resourceType())) {
          request.abort();
        } else {
          request.continue();
        }
      });
    } catch (error) {
      // The browser is already running; do not leave it orphaned. A failure
      // while closing is ignored on purpose so the setup error is the one
      // the caller sees.
      await this.close().catch(() => {});
      throw error;
    }
  }

  /**
   * Closes the browser if one is open and resets the instance state.
   * Calling it without an open browser is a no-op.
   *
   * @returns {Promise<void>}
   */
  async close() {
    const browser = this.browser;
    // Reset first so a second close() (or one racing with a failure) cannot
    // try to close the same browser twice.
    this.browser = null;
    this.page = null;

    if (browser) {
      await browser.close();
    }
  }
}

Técnicas Avanzadas de Scraping

1. Manejo de Paginación

/**
 * Scrapes up to `maxPages` pages of a listing that is paginated through a
 * `page` query parameter.
 *
 * @param {string} url - Absolute URL of the listing. Existing query
 *   parameters are preserved; any existing `page` parameter is overwritten.
 * @param {number} [maxPages=5] - Upper bound on the number of pages visited.
 * @returns {Promise<Array<{title: (string|undefined), price: (string|undefined), description: (string|undefined)}>>}
 *   One entry per `.item` element found, in page order.
 * @throws {TypeError} If `url` is not a valid absolute URL.
 * @throws Any navigation or selector error; the browser is closed first.
 */
async function scrapePaginatedContent(url, maxPages = 5) {
  const scraper = new WebScraper();
  const results = [];

  try {
    await scraper.initialize();

    for (let page = 1; page <= maxPages; page++) {
      // Build the URL through the URL API so an existing query string or
      // fragment in `url` does not produce a malformed address.
      const pageUrl = new URL(url);
      pageUrl.searchParams.set("page", String(page));
      await scraper.page.goto(pageUrl.href);

      // Wait for the content to load.
      await scraper.page.waitForSelector(".item-container");

      const pageData = await scraper.page.evaluate(() => {
        const items = document.querySelectorAll(".item");
        return Array.from(items).map((item) => ({
          title: item.querySelector(".title")?.textContent,
          price: item.querySelector(".price")?.textContent,
          description: item.querySelector(".description")?.textContent,
        }));
      });

      results.push(...pageData);

      // Stop early when the page offers no "next" control.
      const hasNextPage = await scraper.page.$(".next-button");
      if (!hasNextPage) break;
    }

    return results;
  } finally {
    // Always release the browser, including when a page fails to load.
    // Guarded so a failed initialize() surfaces its own error.
    if (scraper.browser) {
      await scraper.close();
    }
  }
}

2. Manejo de Autenticación

/**
 * Logs in through a username/password form and then loads a page that
 * requires the authenticated session.
 *
 * @param {string} url - Page to load once the login has completed.
 * @param {{loginUrl: string, username: string, password: string}} credentials
 *   Login page address and the values typed into `#username` / `#password`.
 * @returns {Promise<string>} The HTML of `url` as rendered after login.
 *   Replace the final step with page-specific extraction as needed.
 * @throws Any navigation or selector error; the browser is closed first.
 */
async function scrapeWithAuth(url, credentials) {
  const scraper = new WebScraper();

  try {
    await scraper.initialize();

    // Go to the login page.
    await scraper.page.goto(credentials.loginUrl);

    // Fill in the login form.
    await scraper.page.type("#username", credentials.username);
    await scraper.page.type("#password", credentials.password);

    // Start waiting for the navigation before clicking, otherwise a fast
    // redirect could finish before the wait is registered.
    await Promise.all([scraper.page.waitForNavigation(), scraper.page.click("#login-button")]);

    // The page now carries the authenticated session: load the target.
    await scraper.page.goto(url);
    return await scraper.page.content();
  } finally {
    // Guarded so a failed initialize() surfaces its own error.
    if (scraper.browser) {
      await scraper.close();
    }
  }
}

3. Manejo de Contenido Dinámico

/**
 * Collects the text of every `.item` element on an infinite-scroll page by
 * scrolling to the bottom until the document stops growing.
 *
 * @param {string} url - Absolute URL of the page to scrape.
 * @param {object} [options]
 * @param {number} [options.scrollDelayMs=1500] - Time to wait after each
 *   scroll for new content to be appended.
 * @param {number} [options.maxScrolls=Infinity] - Upper bound on the number
 *   of scrolls, for pages that never stop growing.
 * @returns {Promise<string[]>} Distinct item texts in first-seen order.
 * @throws Any navigation or evaluation error; the browser is closed first.
 */
async function scrapeInfiniteScroll(url, { scrollDelayMs = 1500, maxScrolls = Infinity } = {}) {
  const scraper = new WebScraper();
  // A Set de-duplicates items that are still in the DOM on later passes.
  const items = new Set();

  try {
    await scraper.initialize();
    await scraper.page.goto(url);

    for (let scrolls = 0; ; scrolls++) {
      // Read the items currently rendered.
      const visibleItems = await scraper.page.evaluate(() =>
        Array.from(document.querySelectorAll(".item"), (el) => el.textContent),
      );
      for (const text of visibleItems) {
        items.add(text);
      }

      if (scrolls >= maxScrolls) break;

      // Scroll to the bottom and give the page time to load more content.
      const previousHeight = await scraper.page.evaluate(() => document.body.scrollHeight);
      await scraper.page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      // Plain timer instead of page.waitForTimeout, which is not available
      // in current Puppeteer releases.
      await new Promise((resolve) => setTimeout(resolve, scrollDelayMs));

      // An unchanged height means nothing new was appended: we are done.
      const currentHeight = await scraper.page.evaluate(() => document.body.scrollHeight);
      if (currentHeight === previousHeight) break;
    }

    return Array.from(items);
  } finally {
    // Guarded so a failed initialize() surfaces its own error.
    if (scraper.browser) {
      await scraper.close();
    }
  }
}

Mejores Prácticas y Optimizaciones

1. Control de Velocidad y Delays

/**
 * Fixed-window rate limiter: at most `maxRequests` tokens can be taken per
 * `timeWindow` milliseconds.
 */
class RateLimiter {
  /**
   * @param {number} maxRequests - Tokens available per window.
   * @param {number} timeWindow - Window length in milliseconds.
   */
  constructor(maxRequests, timeWindow) {
    this.maxRequests = maxRequests;
    this.timeWindow = timeWindow;
    this.tokens = maxRequests;
    this.lastRefill = Date.now();
  }

  /**
   * Resolves once a token has been taken, sleeping until the next refill
   * when none is available.
   *
   * @returns {Promise<boolean>} Always resolves to `true`.
   */
  async waitForToken() {
    // A non-positive window cannot throttle anything; treat it as
    // "no limit" instead of dividing by zero below.
    if (!(this.timeWindow > 0)) {
      return true;
    }

    for (;;) {
      const now = Date.now();
      const windowsElapsed = Math.floor((now - this.lastRefill) / this.timeWindow);

      if (windowsElapsed > 0) {
        this.tokens = Math.min(this.tokens + windowsElapsed * this.maxRequests, this.maxRequests);
        // Advance only by the whole windows consumed. Resetting to `now` on
        // every call would throw away partial progress, so a caller polling
        // faster than one window would never see a refill.
        this.lastRefill += windowsElapsed * this.timeWindow;
      }

      if (this.tokens > 0) {
        this.tokens--;
        return true;
      }

      // Sleep only for what is left of the current window.
      const remainingMs = this.timeWindow - (now - this.lastRefill);
      await new Promise((resolve) => setTimeout(resolve, remainingMs));
    }
  }
}

2. Manejo de Errores y Reintentos

/**
 * Runs `fn` and retries it on failure with a linearly growing pause
 * (`delay`, `2 * delay`, ...) between attempts.
 *
 * @param {() => Promise<*>} fn - Operation to attempt.
 * @param {number} [maxRetries=3] - Total number of attempts. Values below 1
 *   (or non-numeric values) are treated as 1 so `fn` always runs at least
 *   once; fractional values are rounded up.
 * @param {number} [delay=1000] - Base pause in milliseconds.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 * @throws The error of the last attempt when every attempt fails.
 */
async function withRetry(fn, maxRetries = 3, delay = 1000) {
  // Normalise the attempt count so the "last attempt" check below is always
  // reachable; otherwise 0 would skip fn entirely and 2.5 would swallow the
  // final error.
  const totalAttempts = Math.max(1, Math.ceil(Number(maxRetries)) || 1);

  for (let attempt = 1; ; attempt++) {
    try {
      return await fn();
    } catch (error) {
      if (attempt >= totalAttempts) throw error;
      console.log(`Intento ${attempt} fallido, reintentando...`);
      await new Promise((resolve) => setTimeout(resolve, delay * attempt));
    }
  }
}

Ejemplo Completo: Scraper de Productos

/**
 * Scrapes every product card of a listing paginated through a `page` query
 * parameter, following pages while an enabled `.next-page` control exists.
 *
 * @param {string} url - Absolute URL of the listing. Existing query
 *   parameters are preserved; any existing `page` parameter is overwritten.
 * @param {number} [maxPages=Infinity] - Safety bound on pages visited, for
 *   sites whose "next" control never becomes disabled.
 * @returns {Promise<Array<{name: (string|undefined), price: (string|undefined), description: (string|undefined), rating: (string|undefined)}>>}
 *   One entry per `.product-card`, in page order.
 * @throws Any error that survives the retries; it is logged, the browser is
 *   closed, and the error is re-thrown.
 */
async function scrapeProducts(url, maxPages = Infinity) {
  const scraper = new WebScraper();
  const rateLimiter = new RateLimiter(10, 1000); // 10 requests per second

  try {
    await scraper.initialize();

    const products = [];
    let hasNextPage = true;

    for (let pageNum = 1; hasNextPage && pageNum <= maxPages; pageNum++) {
      await rateLimiter.waitForToken();

      // Build the URL through the URL API so an existing query string or
      // fragment in `url` does not produce a malformed address.
      const pageUrl = new URL(url);
      pageUrl.searchParams.set("page", String(pageNum));

      // The retried callback only reads; results are committed after it
      // succeeds. Pushing inside it would duplicate a page's products when a
      // later step of the same attempt fails and the attempt is repeated.
      const pageResult = await withRetry(async () => {
        await scraper.page.goto(pageUrl.href);

        const items = await scraper.page.evaluate(() => {
          return Array.from(document.querySelectorAll(".product-card")).map((product) => ({
            name: product.querySelector(".product-name")?.textContent,
            price: product.querySelector(".product-price")?.textContent,
            description: product.querySelector(".product-description")?.textContent,
            rating: product.querySelector(".product-rating")?.textContent,
          }));
        });

        const nextControl = await scraper.page.$(".next-page:not(.disabled)");
        return { items, hasNext: nextControl !== null };
      });

      products.push(...pageResult.items);
      hasNextPage = pageResult.hasNext;
    }

    return products;
  } catch (error) {
    console.error("Error durante el scraping:", error);
    throw error;
  } finally {
    // Guarded so a failed initialize() surfaces its own error instead of a
    // TypeError from closing a browser that was never created.
    if (scraper.browser) {
      await scraper.close();
    }
  }
}

Conclusiones

El web scraping con Puppeteer ofrece grandes posibilidades para la automatización y extracción de datos. Las claves para un scraper exitoso son:

Manejo robusto de errores
Control de velocidad de requests
Optimización de recursos
Código mantenible y reutilizable

Recuerda siempre revisar los términos de servicio de los sitios web que planeas scrapear y mantener un comportamiento ético en tus actividades de scraping.

Recursos Adicionales

¿Te resultó útil esta guía? Comparte tus experiencias y dudas en los comentarios. ¡Feliz scraping!