Hier werden die Unterschiede zwischen zwei Versionen angezeigt.
Beide Seiten der vorigen Revision · Vorhergehende Überarbeitung | |||
elasticsearch [2025/06/30 20:06] jango [Query Data] |
elasticsearch [2025/06/30 20:38] (aktuell) jango [Python] |
||
---|---|---|---|
Zeile 229: | Zeile 229: | ||
=====Python===== | =====Python===== | ||
+ | |||
+ | <code python> | ||
+ | import logging | ||
+ | from urllib.parse import urljoin | ||
+ | import requests | ||
+ | from bs4 import BeautifulSoup | ||
+ | |||
+ | logging.basicConfig( | ||
+ | format=' | ||
+ | level=logging.INFO) | ||
+ | |||
+ | class Crawler: | ||
+ | |||
+ | def __init__(self, | ||
+ | self.visited_urls = [] | ||
+ | self.urls_to_visit = urls | ||
+ | |||
+ | def download_url(self, url): |
+ | return requests.get(url).text | ||
+ | |||
+ | def get_linked_urls(self, url, html): |
+ | soup = BeautifulSoup(html, | ||
+ | for link in soup.find_all(' | ||
+ | path = link.get(' | ||
+ | if path and path.startswith('/'): |
+ | path = urljoin(url, path) |
+ | yield path | ||
+ | |||
+ | def add_url_to_visit(self, url): |
+ | if url not in self.visited_urls and url not in self.urls_to_visit: | ||
+ | self.urls_to_visit.append(url) | ||
+ | |||
+ | def crawl(self, url): | ||
+ | html = self.download_url(url) | ||
+ | for url in self.get_linked_urls(url, html): |
+ | self.add_url_to_visit(url) | ||
+ | |||
+ | def run(self): | ||
+ | while self.urls_to_visit: | ||
+ | url = self.urls_to_visit.pop(0) | ||
+ | logging.info(f' | ||
+ | try: | ||
+ | self.crawl(url) | ||
+ | except Exception: | ||
+ | logging.exception(f' | ||
+ | finally: | ||
+ | self.visited_urls.append(url) | ||
+ | |||
+ | if __name__ == '__main__': |
+ | Crawler(urls=[' | ||
+ | </code> |
+ | |||
====Create Index==== | ====Create Index==== |