Preparers parse HTML operation examples

These examples use preparers with the ParseHtml operation in AI Accelerator.

Primitive

-- With no options supplied, parse_html uses its default method, which
-- structurally parses the HTML into plaintext
SELECT *
FROM aidb.parse_html(
    html => '<html><body><h1>Hello World Heading</h1><p>Hello World paragraph</p></body></html>'
);
Output
      parse_html
-----------------------
 Hello World Heading  +
                      +
 Hello World paragraph+

(1 row)
-- Convert a small "Hello, world!" page to plaintext.
-- The method is spelled out here even though StructuredPlaintext is the default.
SELECT *
FROM aidb.parse_html(
    html => '<h1>Hello, world!</h1>
        <p>This is my first web page.</p>
        <p>
            It contains some <strong>bold text</strong>, some <em>italic test</em>, and a <a href="https://google.com" target="_blank">link</a>.
        </p>

        <img src="postgres_logo.png" alt="Postgres Logo Image">

        <ol>
            <li>List item</li>
            <li>List item</li>
            <li>List item</li>
        </ol>',
    options => '{"method": "StructuredPlaintext"}'
);
Output
                        parse_html
-----------------------------------------------------------
 Hello, world!                                            +
                                                          +
 This is my first web page.                               +
                                                          +
 It contains some bold text, some italic test, and a link.+
                                                          +
 Postgres Logo Image                                      +
 List item                                                +
 List item                                                +
 List item                                                +

(1 row)
-- Convert the same "Hello, world!" page to Markdown-like text, which keeps
-- some of the markup's structure (headings, emphasis, links, images, lists)
SELECT *
FROM aidb.parse_html(
    html => '<h1>Hello, world!</h1>
        <p>This is my first web page.</p>
        <p>
            It contains some <strong>bold text</strong>, some <em>italic test</em>, and a <a href="https://google.com" target="_blank">link</a>.
        </p>

        <img src="postgres_logo.png" alt="Postgres Logo Image">

        <ol>
            <li>List item</li>
            <li>List item</li>
            <li>List item</li>
        </ol>',
    options => '{"method": "StructuredMarkdown"}'
);
Output
                                      parse_html
---------------------------------------------------------------------------------------
 # Hello, world!                                                                      +
                                                                                      +
 This is my first web page.                                                           +
                                                                                      +
 It contains some **bold text**, some *italic test*, and a [link](https://google.com).+
                                                                                      +
 ![Postgres Logo Image](postgres_logo.png)                                            +
 1. List item                                                                         +
 2. List item                                                                         +
 3. List item                                                                         +

(1 row)

Preparer with table data source

-- Source table for the preparer: an integer key plus the raw HTML to parse
CREATE TABLE source_table__2772 (
    id INTEGER GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
    content TEXT NOT NULL
);
-- Seed two HTML documents. The target columns are listed explicitly so the
-- statement does not depend on the table's column order.
INSERT INTO source_table__2772 (id, content)
VALUES (1, '<html><body><h1>Hello World Heading</h1><p>Hello World paragraph</p></body></html>'),
       (2, '<p>This is some <strong>bold text</strong>, some <em>italic test</em>, and a <a href="https://google.com" target="_blank">link</a>.');

-- Register a preparer that applies the ParseHtml operation to
-- source_table__2772.content and targets destination_table__2772.parsed_html,
-- with id as the key column on both sides
SELECT aidb.create_table_preparer(
    name                    => 'preparer__2772',
    operation               => 'ParseHtml',
    options                 => CAST('{"method": "StructuredPlaintext"}' AS JSONB),  -- Configuration for the ParseHtml operation
    source_table            => 'source_table__2772',
    source_key_column       => 'id',
    source_data_column      => 'content',
    destination_table       => 'destination_table__2772',
    destination_key_column  => 'id',
    destination_data_column => 'parsed_html'
);

-- Run the preparer registered as 'preparer__2772'
SELECT aidb.bulk_data_preparation(
    'preparer__2772'
);

-- Show the parsed rows. Columns are named explicitly and the result is
-- ordered by key, because row order is not guaranteed without ORDER BY.
SELECT
    id,
    parsed_html
FROM destination_table__2772
ORDER BY id;
Output
 id |                      parsed_html
----+-------------------------------------------------------
 1  | Hello World Heading                                  +
    |                                                      +
    | Hello World paragraph                                +
    |
 2  | This is some bold text, some italic test, and a link.+
    |
(2 rows)

Could this page be better? Report a problem or suggest an addition!