=====Python=====
+ | |||
+ | <code python> | ||
+ | import logging | ||
+ | from urllib.parse import urljoin | ||
+ | import requests | ||
+ | from bs4 import BeautifulSoup | ||
+ | |||
+ | logging.basicConfig( | ||
+ | format=' | ||
+ | level=logging.INFO) | ||
+ | |||
+ | class Crawler: | ||
+ | |||
+ | def __init__(self, | ||
+ | self.visited_urls = [] | ||
+ | self.urls_to_visit = urls | ||
+ | |||
+ | def download_url(self, | ||
+ | return requests.get(url).text | ||
+ | |||
+ | def get_linked_urls(self, | ||
+ | soup = BeautifulSoup(html, | ||
+ | for link in soup.find_all(' | ||
+ | path = link.get(' | ||
+ | if path and path.startswith('/' | ||
+ | path = urljoin(url, | ||
+ | yield path | ||
+ | |||
+ | def add_url_to_visit(self, | ||
+ | if url not in self.visited_urls and url not in self.urls_to_visit: | ||
+ | self.urls_to_visit.append(url) | ||
+ | |||
+ | def crawl(self, url): | ||
+ | html = self.download_url(url) | ||
+ | for url in self.get_linked_urls(url, | ||
+ | self.add_url_to_visit(url) | ||
+ | |||
+ | def run(self): | ||
+ | while self.urls_to_visit: | ||
+ | url = self.urls_to_visit.pop(0) | ||
+ | logging.info(f' | ||
+ | try: | ||
+ | self.crawl(url) | ||
+ | except Exception: | ||
+ | logging.exception(f' | ||
+ | finally: | ||
+ | self.visited_urls.append(url) | ||
+ | |||
+ | if __name__ == ' | ||
+ | Crawler(urls=[' | ||
+ | </ | ||
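To feed the crawled pages into Elasticsearch, the ''Crawler'' class above can be extended so that every downloaded page is stored as a document. This is a minimal sketch, not the original code: the ''IndexingCrawler'' name, the ''webpages'' index, the ''url''/''html'' field names and the connection details are assumptions.

<code python>
from elasticsearch import Elasticsearch

class IndexingCrawler(Crawler):
    """Sketch: stores every downloaded page as an Elasticsearch document."""

    def __init__(self, es, index_name, urls=[]):
        super().__init__(urls=urls)
        self.es = es
        self.index_name = index_name  # index name is an assumption

    def crawl(self, url):
        html = self.download_url(url)
        # store the raw page together with its URL
        self.es.index(index=self.index_name, document={"url": url, "html": html})
        for linked_url in self.get_linked_urls(url, html):
            self.add_url_to_visit(linked_url)

es = Elasticsearch(['https://localhost:9200'],
                   basic_auth=("elastic", "password"),  # placeholders
                   verify_certs=False)
IndexingCrawler(es, "webpages", urls=['https://example.com/']).run()
</code>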
+ | |||
+ | |||
+ | ====Create Index==== | ||
<code python>
import os
from elasticsearch import Elasticsearch
import urllib3

# suppress the warnings that a self-signed certificate triggers
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],  # host is a placeholder
    basic_auth=("elastic", os.environ.get("ELASTIC_PASSWORD", "")),  # credentials are placeholders
    verify_certs=False
)

index_name = "test-index"  # index name is a placeholder

if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body={
        "mappings": {
            "properties": {
                "title": {"type": "text"},
                "content": {"type": "text"}
            }
        }
    })
    print(f"Index '{index_name}' created.")
else:
    print(f"Index '{index_name}' already exists.")
</code>
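A quick way to verify the result is to read the mapping back; a minimal sketch, reusing the ''es'' client and ''index_name'' from the block above:

<code python>
# print the mapping Elasticsearch actually stored for the index
mapping = es.indices.get_mapping(index=index_name)
print(mapping[index_name]["mappings"])
</code>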
+ | |||
+ | ====Insert Data==== | ||
<code python>
import os
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],  # host is a placeholder
    basic_auth=("elastic", os.environ.get("ELASTIC_PASSWORD", "")),  # credentials are placeholders
    verify_certs=False
)

index_name = "test-index"  # index name is a placeholder

# test entries as documents
documents = [
    {"title": "First document", "content": "This is a test entry."},
    {"title": "Second document", "content": "This is another test entry."}
]

# insert the documents into the index one by one
for doc in documents:
    res = es.index(index=index_name, document=doc)
    print(f"Indexed document with id {res['_id']}")

print("Done.")
</code>
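The block above already imports ''streaming_bulk'' but indexes each document with a single request. For larger batches the bulk helper is the more efficient route; a minimal sketch, assuming the ''es'' client, ''index_name'' and ''documents'' from above:

<code python>
from elasticsearch.helpers import streaming_bulk

# wrap each document in a bulk action that names its target index
actions = ({"_index": index_name, "_source": doc} for doc in documents)

# streaming_bulk yields one (ok, info) tuple per indexed document
for ok, info in streaming_bulk(es, actions):
    if not ok:
        print("Indexing failed:", info)
</code>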
+ | |||
+ | ====Query Data==== | ||
<code python>
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],  # host is a placeholder
    basic_auth=("elastic", "password"),  # credentials are placeholders
    verify_certs=False
)

data = {
    "query": {
        "match": {
            "content": "test"  # search term is a placeholder
        }
    },
    "size": 100
}

# open a scroll context so that all hits can be paged through
response = es.search(index="test-index", body=data, scroll="1m")

if 'hits' not in response or not response['hits']['hits']:
    print("No results found.")
    exit(1)

scroll_id = response['_scroll_id']
res_c = 0

for hit in response['hits']['hits']:
    res_c += 1
    print("Result:", hit['_source'])

while True:
    # fetch the next page of results
    response = es.scroll(body={"scroll_id": scroll_id, "scroll": "1m"})
    print("Fetched next scroll page.")
    if 'hits' not in response or not response['hits']['hits']:
        break

    for hit in response['hits']['hits']:
        res_c += 1
        print("Result:", hit['_source'])

# release the scroll context on the server
es.clear_scroll(body={"scroll_id": scroll_id})

print("Total results:", res_c)
</code>
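Instead of driving the scroll API by hand as above, the client also ships a ''scan'' helper that handles scrolling and ''clear_scroll'' internally; a minimal sketch, reusing the ''es'' client and the placeholder index name:

<code python>
from elasticsearch.helpers import scan

# scan() pages through all matching documents via the scroll API
for hit in scan(es, index="test-index", query={"query": {"match_all": {}}}):
    print(hit['_source'])
</code>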
+ | |||
+ | ====Delete Index==== | ||
+ | |||
<code python>
import os
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],  # host is a placeholder
    basic_auth=("elastic", os.environ.get("ELASTIC_PASSWORD", "")),  # credentials are placeholders
    verify_certs=False
)

index_name = input('Name of the index to delete: ')

# check whether the index exists and delete it
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
    print(f"Index '{index_name}' deleted.")
else:
    print(f"Index '{index_name}' does not exist.")
</code>
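To see which indices exist before deleting one, the cat API can be queried first; a minimal sketch with the same ''es'' client:

<code python>
# list all indices with health, size and document count
print(es.cat.indices())
</code>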
====Scroll Query====