Web
Web Page Links
`examples/web/page_links.py`:

```python
# /// script
# dependencies = [
#     "graphinate[server]",
#     "beautifulsoup4",
#     "loguru",
#     "lxml",
#     "requests",
# ]
# ///
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from loguru import logger

import graphinate

DEFAULT_MAX_DEPTH = 0


def page_links_graph_model(max_depth: int = DEFAULT_MAX_DEPTH):
    """
    Create a graph model based on page links.

    Args:
        max_depth (int, optional): The maximum depth to crawl for page links. Defaults to DEFAULT_MAX_DEPTH.

    Returns:
        GraphModel: A graph model representing the page links.
    """

    def _links(url: str, depth=0, **kwargs):
        response = requests.get(url, timeout=10)
        logger.debug(f'Analyzing Page: {url}')
        soup = BeautifulSoup(response.text, 'lxml')
        logger.debug(f'Done Analyzing Page: {url}')
        for link in soup.find_all('a', href=True):
            child_url = link.get('href')
            if child_url.startswith('javascript:'):  # Skip JavaScript links
                continue
            if child_url.startswith('//'):  # Handle protocol-relative URLs
                child_url = f"https:{child_url}"
            if not urlparse(child_url).netloc:  # Skip relative URLs
                # child_url = urljoin(url, child_url)
                continue
            if not child_url.startswith('http'):  # Skip non-HTTP URLs
                continue
            yield {'source': url, 'target': child_url}
            if depth < max_depth:
                yield from _links(child_url, depth=depth + 1, **kwargs)

    graph_model = graphinate.model(name='Web')

    @graph_model.edge()
    def link(url, **kwargs):
        yield from _links(url, **kwargs)

    return graph_model


if __name__ == '__main__':
    model = page_links_graph_model(1)

    params = {
        # 'url': 'https://github.com/erivlis/graphinate'
        'url': 'https://erivlis.github.io/graphinate/'
    }

    builder = graphinate.builders.GraphQLBuilder(model, graph_type=graphinate.GraphType.DiGraph)
    schema = builder.build(default_node_attributes={'type': 'url'}, **params)
    graphinate.graphql.server(schema)
```
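To see what the `link` edge generator feeds the model, here is a minimal standalone sketch (hypothetical, simplified from `_links`: no recursion, no logging, and no protocol-relative rewrite) that prints the first few edges extracted from the same page:

```python
# Hypothetical sketch: preview the edges _links would yield at depth 0.
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

url = 'https://erivlis.github.io/graphinate/'  # same page the example crawls
soup = BeautifulSoup(requests.get(url, timeout=10).text, 'lxml')

edges = [
    {'source': url, 'target': a['href']}
    for a in soup.find_all('a', href=True)
    if urlparse(a['href']).netloc      # absolute URLs only
    and a['href'].startswith('http')   # skip mailto:, javascript:, etc.
]
print(edges[:5])
```

Each yielded mapping carries `source` and `target` keys, which is the shape the `@graph_model.edge()`-decorated generator hands to graphinate when building the directed link graph.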
`examples/web/requirements.txt`:

```text
beautifulsoup4
loguru
lxml
requests
```
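The dependencies can be installed with `pip install -r examples/web/requirements.txt` before running `python examples/web/page_links.py`; alternatively, since the script carries inline `# /// script` metadata, a tool that understands it (e.g. `uv run examples/web/page_links.py`) can resolve them on the fly. Either way, the script builds a GraphQL schema for the directed link graph and serves it via `graphinate.graphql.server`.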
HTML DOM