PageExtractorChild.sys.mjs

firefox-main/toolkit/components/pageextractor/PageExtractorChild.sys.mjs (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Core :: Machine Learning: On Device

Revision control

Copy as Markdown

Other Tools

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// @ts-check

/**

 * @import { GetTextOptions, GetDOMOptions, CanvasSnapshot, ExtractionResult } from './PageExtractor.d.ts'

 * @import { PageExtractorParent } from './PageExtractorParent.sys.mjs'

*/

/**
 * We wait for the page to be ready before extracting content headlessly. It's hard
 * to know when a page is "ready", however the strategy here is to wait for
 * DOMContentLoaded, and then a requestIdleCallback. This way the page has time
 * to do an initial amount of work. However, if we wait too long, it will be felt by
 * the user as lag. To mitigate this, wait at most 2 seconds for the idle callback:
 * this value is passed as the requestIdleCallback `timeout`, which is an upper
 * bound on the wait rather than a minimum.
 */
const MAX_REQUEST_IDLE_CALLBACK_DELAY_MS = 2000;

import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";

// Lazily-resolved dependencies, accessed as `lazy.<name>` throughout this file.
const lazy = XPCOMUtils.declareLazy({
  // Logger for this actor; messages are prefixed with "PageExtractorChild" and
  // the maximum log level is read from the "browser.ml.logLevel" pref.
  console: () =>
    console.createInstance({
      prefix: "PageExtractorChild",
      maxLogLevelPref: "browser.ml.logLevel",
    }),
  // The entries below map a name to the module URI it comes from.
  // NOTE(review): presumably each one is imported on first access — confirm
  // against XPCOMUtils.declareLazy.
  ReaderMode: "moz-src:///toolkit/components/reader/ReaderMode.sys.mjs",
  extractTextFromDOM:
    "moz-src:///toolkit/components/pageextractor/DOMExtractor.sys.mjs",
  isProbablyReaderable: "resource://gre/modules/Readerable.sys.mjs",
});

/**
 * Shared "nothing extracted" result. It is returned when no document is
 * available, when the page is not considered readerable, or when Reader Mode
 * parsing produces no article.
 *
 * @type {ExtractionResult}
 */
const EMPTY_EXTRACTION_RESULT = { text: "", links: [], canvasSnapshots: [] };

/**

 * Extract a variety of content from pages for use in a smart window.

*/

export class PageExtractorChild extends JSWindowActorChild {

/**

   * Route the messages coming from the parent process.

   * @param {object} message

   * @param {string} message.name

   * @param {any} message.data

   * @returns {Promise<unknown>}

*/

  async receiveMessage({ name, data }) {

    switch (name) {

      case "PageExtractorParent:GetReaderModeContent":

        if (this.isAboutReader()) {

          const text = this.getAboutReaderContent();

          return { text: text ?? "", links: [], canvasSnapshots: [] };

        return this.getReaderModeContent(data);

      case "PageExtractorParent:GetText":

        if (this.isAboutReader()) {

          const text = this.getAboutReaderContent();

          return {

            text: text ?? "",

            links: [],

            canvasSnapshots: [],

};

        return this.getText(data);

      case "PageExtractorParent:WaitForPageReady":

        return this.waitForPageReady();

    return Promise.reject(new Error("Unknown message: " + name));

/**

   * This function resolves once the page is ready after a requestIdleCallback.

   * @returns {Promise<void>}

*/

  async waitForPageReady() {

    return new Promise(resolve => {

      const waitForIdle = () => {

        this.document.ownerGlobal.requestIdleCallback(() => resolve(), {

          timeout: MAX_REQUEST_IDLE_CALLBACK_DELAY_MS,

});

};

      if (this.document.readyState == "loading") {

        this.document.addEventListener("DOMContentLoaded", waitForIdle);

      } else {

        lazy.console.log("The page is already interactive");

        waitForIdle();

});

/**

   * @see PageExtractorParent#getReaderModeContent for docs

   * @param {boolean} force

   * @returns {Promise<ExtractionResult>}

*/

  async getReaderModeContent(force) {

    const window = this.browsingContext?.window;

    const document = window?.document;

    if (!force && (!document || !lazy.isProbablyReaderable(document))) {

      return EMPTY_EXTRACTION_RESULT;

    if (!document) {

      return EMPTY_EXTRACTION_RESULT;

    const article = await lazy.ReaderMode.parseDocument(document);

    if (!article) {

      return EMPTY_EXTRACTION_RESULT;

    let text = (article?.textContent || "")

      .trim()

      // Replace duplicate whitespace with either a single newline or space

      .replace(/(\s*\n\s*)|\s{2,}/g, (_, newline) => (newline ? "\n" : " "));

    if (article.title) {

      text = article.title + "\n\n" + text;

    lazy.console.log("GetReaderModeContent", { force });

    lazy.console.debug(text);

    return { text, links: [], canvasSnapshots: [] };

/**

   * @see PageExtractorParent#getText for docs

   * @param {GetTextOptions} options

   * @returns {Promise<ExtractionResult>}

*/

  async getText(options = {}) {

    const window = this.browsingContext?.window;

    const document = window?.document;

    if (!document) {

      return EMPTY_EXTRACTION_RESULT;

    const { text, links, canvases } = lazy.extractTextFromDOM(

      document,

      options

);

    let canvasSnapshots = [];

    if (options.includeCanvasSnapshots && canvases.length) {

      canvasSnapshots = await this.#captureCanvases(canvases, options);

    lazy.console.log("GetText", options);

    lazy.console.debug({ text, links, canvasSnapshots });

    return { text, links, canvasSnapshots };

/**

   * Special case extracting text from Reader Mode. The original article content is not

   * retained once reader mode is activated. It is rendered out to the page. Rather

   * than cache an additional copy of the article, just extract the text from the

   * actual reader mode DOM.

   * @returns {string | null}

*/

  getAboutReaderContent() {

    lazy.console.log("Using special text extraction strategy for about:reader");

    const document = this.manager.contentWindow.document;

    if (!document) {

      return null;

    /** @type {HTMLElement?} */

    const titleEl = document.querySelector(".reader-title");

    /** @type {HTMLElement?} */

    const contentEl = document.querySelector(".moz-reader-content");

    const title = titleEl?.innerText;

    const content = contentEl?.innerText;

    if (!title && !content) {

      return null;

    if (title) {

      return `${title}\n\n${content}`.trim();

    return content.trim();

/**

   * Checks if about:reader is loaded, which requires special handling.

   * @returns {boolean}

*/

  isAboutReader() {

    // Accessing the documentURIObject in this way does not materialize the

    // `window.location.href` and should be a cheaper check here.

    let url = this.manager.contentWindow.document.documentURIObject;

    return url.schemeIs("about") && url.pathQueryRef.startsWith("reader?");

/**

   * Capture canvas elements as WebP blobs. WebP is chosen for its superior

   * compression-to-quality ratio compared to PNG/JPEG, reducing the data sent

   * to language models while preserving visual fidelity.

   * @param {HTMLCanvasElement[]} canvases

   * @param {GetTextOptions} options

   * @returns {Promise<CanvasSnapshot[]>}

*/

  async #captureCanvases(canvases, options) {

    const maxDimension = options.maxCanvasDimension ?? 1024;

    const quality = options.canvasQuality ?? 0.8;

    const window = this.browsingContext?.window;

    if (!window) {

      return [];

    const results = await Promise.all(

      canvases.map(c => this.#captureCanvas(c, maxDimension, quality, window))

);

    return results.filter(Boolean);

/**

   * Capture a canvas element as a WebP blob. Uses OffscreenCanvas to avoid

   * blocking the main thread during scaling and blob conversion. ImageBitmap

   * is used as the source to efficiently transfer pixel data from the

   * original canvas.

   * @param {HTMLCanvasElement} canvas

   * @param {number} maxDimension

   * @param {number} quality

   * @param {Window} window

   * @returns {Promise<CanvasSnapshot | null>}

*/

  async #captureCanvas(canvas, maxDimension, quality, window) {

    const { width: originalWidth, height: originalHeight } = canvas;

    try {

      const bitmap = await window.createImageBitmap(canvas);

      const scale = Math.min(

1,

        maxDimension / Math.max(originalWidth, originalHeight)

);

      const targetWidth = Math.floor(originalWidth * scale);

      const targetHeight = Math.floor(originalHeight * scale);

      const offscreen = new window.OffscreenCanvas(targetWidth, targetHeight);

      // Alpha is enabled to preserve transparency in canvases that use it.

      // willReadFrequently is false because we only draw and convert to blob,

      // never reading pixels back, so hardware acceleration is preferred.

      const ctx = offscreen.getContext("2d", {

        alpha: true,

        willReadFrequently: false,

});

      ctx.drawImage(bitmap, 0, 0, targetWidth, targetHeight);

      bitmap.close();

      let blob;

      try {

        blob = await offscreen.convertToBlob({

          type: "image/webp",

          quality,

});

      } catch (securityError) {

        // Tainted canvas fall back to original canvas toBlob which works

        blob = await new Promise((resolve, reject) => {

          canvas.toBlob(

            b => (b ? resolve(b) : reject(new Error("toBlob failed"))),

            "image/webp",

            quality

);

});

        return {

          blob,

          width: originalWidth,

          height: originalHeight,

};

      return {

        blob,

        width: targetWidth,

        height: targetHeight,

};

    } catch (error) {

      lazy.console.debug?.("Canvas capture failed:", error);

      return null;