=====Python=====
+ | |||
+ | <code python> | ||
+ | import logging | ||
+ | from urllib.parse import urljoin | ||
+ | import requests | ||
+ | from bs4 import BeautifulSoup | ||
+ | |||
+ | logging.basicConfig( | ||
+ | format=' | ||
+ | level=logging.INFO) | ||
+ | |||
+ | class Crawler: | ||
+ | |||
+ | def __init__(self, | ||
+ | self.visited_urls = [] | ||
+ | self.urls_to_visit = urls | ||
+ | |||
+ | def download_url(self, | ||
+ | return requests.get(url).text | ||
+ | |||
+ | def get_linked_urls(self, | ||
+ | soup = BeautifulSoup(html, | ||
+ | for link in soup.find_all(' | ||
+ | path = link.get(' | ||
+ | if path and path.startswith('/' | ||
+ | path = urljoin(url, | ||
+ | yield path | ||
+ | |||
+ | def add_url_to_visit(self, | ||
+ | if url not in self.visited_urls and url not in self.urls_to_visit: | ||
+ | self.urls_to_visit.append(url) | ||
+ | |||
+ | def crawl(self, url): | ||
+ | html = self.download_url(url) | ||
+ | for url in self.get_linked_urls(url, | ||
+ | self.add_url_to_visit(url) | ||
+ | |||
+ | def run(self): | ||
+ | while self.urls_to_visit: | ||
+ | url = self.urls_to_visit.pop(0) | ||
+ | logging.info(f' | ||
+ | try: | ||
+ | self.crawl(url) | ||
+ | except Exception: | ||
+ | logging.exception(f' | ||
+ | finally: | ||
+ | self.visited_urls.append(url) | ||
+ | |||
+ | if __name__ == ' | ||
+ | Crawler(urls=[' | ||
+ | </ | ||
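To feed the crawled pages into Elasticsearch, the ''Crawler'' class above can be extended so that every downloaded page is stored as a document. This is a minimal sketch, not the original code: the ''IndexingCrawler'' name, the ''webpages'' index, the ''url''/''html'' field names and the connection details are assumptions.

<code python>
from elasticsearch import Elasticsearch

class IndexingCrawler(Crawler):
    """Sketch: stores every downloaded page as an Elasticsearch document."""

    def __init__(self, es, index_name, urls=[]):
        super().__init__(urls=urls)
        self.es = es
        self.index_name = index_name  # index name is an assumption

    def crawl(self, url):
        html = self.download_url(url)
        # store the raw page together with its URL
        self.es.index(index=self.index_name, document={"url": url, "html": html})
        for linked_url in self.get_linked_urls(url, html):
            self.add_url_to_visit(linked_url)

es = Elasticsearch(['https://localhost:9200'],
                   basic_auth=("elastic", "password"),  # placeholders
                   verify_certs=False)
IndexingCrawler(es, "webpages", urls=['https://example.com/']).run()
</code>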
+ | |||
+ | |||
+ | ====Create Index==== | ||
<code python>
import os
from elasticsearch import Elasticsearch
import urllib3

# suppress the warnings that a self-signed certificate triggers
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],  # host is a placeholder
    basic_auth=("elastic", os.environ.get("ELASTIC_PASSWORD", "")),  # credentials are placeholders
    verify_certs=False
)

index_name = "test-index"  # index name is a placeholder

if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body={
        "mappings": {
            "properties": {
                "title": {"type": "text"},
                "content": {"type": "text"}
            }
        }
    })
    print(f"Index '{index_name}' created.")
else:
    print(f"Index '{index_name}' already exists.")
</code>
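A quick way to verify the result is to read the mapping back; a minimal sketch, reusing the ''es'' client and ''index_name'' from the block above:

<code python>
# print the mapping Elasticsearch actually stored for the index
mapping = es.indices.get_mapping(index=index_name)
print(mapping[index_name]["mappings"])
</code>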
+ | |||
+ | ====Insert Data==== | ||
<code python>
import os
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],  # host is a placeholder
    basic_auth=("elastic", os.environ.get("ELASTIC_PASSWORD", "")),  # credentials are placeholders
    verify_certs=False
)

index_name = "test-index"  # index name is a placeholder

# test entries as documents
documents = [
    {"title": "First document", "content": "This is a test entry."},
    {"title": "Second document", "content": "This is another test entry."}
]

# insert the documents into the index one by one
for doc in documents:
    res = es.index(index=index_name, document=doc)
    print(f"Indexed document with id {res['_id']}")

print("Done.")
</code>
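The block above already imports ''streaming_bulk'' but indexes each document with a single request. For larger batches the bulk helper is the more efficient route; a minimal sketch, assuming the ''es'' client, ''index_name'' and ''documents'' from above:

<code python>
from elasticsearch.helpers import streaming_bulk

# wrap each document in a bulk action that names its target index
actions = ({"_index": index_name, "_source": doc} for doc in documents)

# streaming_bulk yields one (ok, info) tuple per indexed document
for ok, info in streaming_bulk(es, actions):
    if not ok:
        print("Indexing failed:", info)
</code>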
+ | |||
+ | ====Query Data==== | ||
<code python>
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],  # host is a placeholder
    basic_auth=("elastic", "password"),  # credentials are placeholders
    verify_certs=False
)

data = {
    "query": {
        "match": {
            "content": "test"  # search term is a placeholder
        }
    },
    "size": 100
}

# open a scroll context so that all hits can be paged through
response = es.search(index="test-index", body=data, scroll="1m")

if 'hits' not in response or not response['hits']['hits']:
    print("No results found.")
    exit(1)

scroll_id = response['_scroll_id']
res_c = 0

for hit in response['hits']['hits']:
    res_c += 1
    print("Result:", hit['_source'])

while True:
    # fetch the next page of results
    response = es.scroll(body={"scroll_id": scroll_id, "scroll": "1m"})
    print("Fetched next scroll page.")
    if 'hits' not in response or not response['hits']['hits']:
        break

    for hit in response['hits']['hits']:
        res_c += 1
        print("Result:", hit['_source'])

# release the scroll context on the server
es.clear_scroll(body={"scroll_id": scroll_id})

print("Total results:", res_c)
</code>
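Instead of driving the scroll API by hand as above, the client also ships a ''scan'' helper that handles scrolling and ''clear_scroll'' internally; a minimal sketch, reusing the ''es'' client and the placeholder index name:

<code python>
from elasticsearch.helpers import scan

# scan() pages through all matching documents via the scroll API
for hit in scan(es, index="test-index", query={"query": {"match_all": {}}}):
    print(hit['_source'])
</code>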
+ | |||
+ | ====Delete Index==== | ||
+ | |||
<code python>
import os
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],  # host is a placeholder
    basic_auth=("elastic", os.environ.get("ELASTIC_PASSWORD", "")),  # credentials are placeholders
    verify_certs=False
)

index_name = input('Name of the index to delete: ')

# check whether the index exists and delete it
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
    print(f"Index '{index_name}' deleted.")
else:
    print(f"Index '{index_name}' does not exist.")
</code>
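To see which indices exist before deleting one, the cat API can be queried first; a minimal sketch with the same ''es'' client:

<code python>
# list all indices with health, size and document count
print(es.cat.indices())
</code>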
====Scroll Query====