from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from loguru import logger

import graphinate

# Depth 0 means: only the links found on the starting page, no recursion.
DEFAULT_MAX_DEPTH = 0


def page_links_graph_model(max_depth: int = DEFAULT_MAX_DEPTH):
    """
    Create a graph model based on page links.

    Every anchor (`<a href=...>`) that points to an absolute HTTP(S) URL
    becomes an edge from the page it was found on to the URL it targets.

    Args:
        max_depth (int, optional): The maximum depth to crawl for page links.
            Defaults to DEFAULT_MAX_DEPTH.

    Returns:
        GraphModel: A graph model representing the page links.
    """

    def _links(url: str, depth=0, **kwargs):
        """Yield `{'source', 'target'}` edges for `url`, recursing while `depth < max_depth`."""
        reqs = requests.get(url)
        # loguru only interpolates `{url}` when the value is passed as a
        # format argument; without it the literal text "{url}" is logged.
        logger.debug('Analyzing Page: {url}', url=url)
        soup = BeautifulSoup(reqs.text, 'lxml')
        logger.debug('Done Analyzing Page: {url}', url=url)

        for link in soup.find_all('a', href=True):
            child_url = link.get('href')

            if child_url.startswith('javascript:'):  # Skip JavaScript links
                continue

            if child_url.startswith('//'):  # Handle protocol-relative URLs
                child_url = f"https:{child_url}"

            # Relative links (no network location) are skipped rather than
            # resolved against `url`.
            if not urlparse(child_url).netloc:
                continue

            if not child_url.startswith('http'):  # Skip non-HTTP URLs
                continue

            yield {'source': url, 'target': child_url}
            if depth < max_depth:
                yield from _links(child_url, depth=depth + 1, **kwargs)

    graph_model = graphinate.model(name='Web')

    @graph_model.edge()
    def link(url, **kwargs):
        """Edge generator: one edge per qualifying link reachable from `url`."""
        yield from _links(url, **kwargs)

    return graph_model


if __name__ == '__main__':
    model = page_links_graph_model(1)

    params = {
        # Alternative starting point: 'https://github.com/erivlis/graphinate'
        'url': 'https://erivlis.github.io/graphinate/'
    }

    builder = graphinate.builders.GraphQLBuilder(model, graph_type=graphinate.GraphType.DiGraph)
    schema = builder.build(default_node_attributes={'type': 'url'}, **params)
    graphinate.graphql.server(schema)
import base64

import requests
from bs4 import BeautifulSoup, Tag

import graphinate


def load_html_from_url(url="https://www.google.com"):
    """Fetch `url` with an HTTP GET and return the response body as text."""
    return requests.get(url).text


def load_html(file_path):
    """Return the entire text content of the file at `file_path`."""
    with open(file_path) as handle:
        content = handle.read()
    return content


def html_dom_graph_model(html_content):
    """
    Build a graph model of the DOM parsed from `html_content`.

    Nodes are the named elements found among the document's descendants;
    a `contains` edge links each such element to each of its named children.

    Args:
        html_content: HTML markup to parse with the 'html.parser' backend.

    Returns:
        The graphinate model named "HTML DOM Graph".
    """
    graph_model = graphinate.model(name="HTML DOM Graph")
    soup = BeautifulSoup(html_content, 'html.parser')

    def _named(nodes):
        """Keep only the nodes whose `name` is set (text nodes have none)."""
        return (node for node in nodes if node.name is not None)

    def node_type(tag: Tag):
        """Node type: the tag name with surrounding square brackets removed."""
        name = tag.name
        return name.strip('[]')

    def node_key(tag: Tag):
        """Node key: source position for tags, base64 of the text otherwise."""
        if isinstance(tag, Tag):
            position = (tag.sourceline, tag.sourcepos)
            return str(position)
        encoded = base64.b64encode(tag.encode())
        return encoded.decode()

    def node_label(tag: Tag):
        """Node label: the element rendered back to a string."""
        return str(tag)

    @graph_model.node(node_type, key=node_key, label=node_label)
    def html_node():
        yield from _named(soup.descendants)

    @graph_model.edge()
    def contains():
        for parent in _named(soup.descendants):
            yield from (
                {'source': node_key(parent), 'target': node_key(child)}
                for child in _named(parent.children)
            )

    return graph_model


if __name__ == '__main__':
    html_content = load_html_from_url()
    dom_model = html_dom_graph_model(html_content)
    builder = graphinate.builders.GraphQLBuilder(dom_model)
    schema = builder.build()
    graphinate.graphql.server(schema)