=====Python=====

<code python>
import logging
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)

class Crawler:

    def __init__(self, urls=None):
        # URLs already processed and the queue of URLs still to fetch
        self.visited_urls = []
        self.urls_to_visit = urls or []

    def download_url(self, url):
        return requests.get(url).text

    def get_linked_urls(self, url, html):
        # Extract all links from the page, resolving relative paths
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if path and path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def add_url_to_visit(self, url):
        if url not in self.visited_urls and url not in self.urls_to_visit:
            self.urls_to_visit.append(url)

    def crawl(self, url):
        html = self.download_url(url)
        for linked_url in self.get_linked_urls(url, html):
            self.add_url_to_visit(linked_url)

    def run(self):
        # Work through the queue until no unvisited URLs remain
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            logging.info(f'Crawling: {url}')
            try:
                self.crawl(url)
            except Exception:
                logging.exception(f'Failed to crawl: {url}')
            finally:
                self.visited_urls.append(url)

if __name__ == '__main__':
    Crawler(urls=['https://example.com/']).run()  # placeholder start URL
</code>
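The crawler above only collects links. To feed crawled pages into Elasticsearch, the crawl step can be extended to index each fetched page. A minimal sketch, assuming the Crawler class above and the connection settings used in the sections below; the EsCrawler subclass and the title extraction are illustrative, not part of the original page:

<code python>
from elasticsearch import Elasticsearch
from bs4 import BeautifulSoup

# Assumed connection settings, matching the sections below
es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "..."),
    verify_certs=False
)

class EsCrawler(Crawler):
    # Hypothetical subclass: stores every fetched page as a url/title document
    def crawl(self, url):
        html = self.download_url(url)
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.title.string if soup.title else url
        es.index(index="test", document={"url": url, "title": title})
        for linked_url in self.get_linked_urls(url, html):
            self.add_url_to_visit(linked_url)
</code>
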
====Create Index====
<code python>
import os
from elasticsearch import Elasticsearch

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "..."),
    verify_certs=False
)

index_name = "test"

# Create the index with an explicit mapping if it does not exist yet
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body={
        "mappings": {
            "properties": {
                "url": {"type": "text"},
                "title": {"type": "text"}
            }
        }
    })
</code>
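To verify that the mapping was applied as intended, it can simply be read back. A quick sketch, reusing the es client and index_name from above:

<code python>
# Read back and print the mapping of the new index
mapping = es.indices.get_mapping(index=index_name)
print(mapping[index_name]["mappings"]["properties"])
</code>
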
====Insert Data====
<code python>
import os
from elasticsearch import Elasticsearch

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "..."),
    verify_certs=False
)

index_name = "test"

# Test entries as documents
documents = [
    {"url": "heute.at", "title": "Das war heute"},
    {"url": "gestern.at", "title": "Das war gestern"},
]

# Insert the documents into the index
for doc in documents:
    es.index(index=index_name, document=doc)
    print(f"Document added: {doc}")

print("Done.")
</code>
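Freshly indexed documents only become searchable after the next index refresh. A small sketch to force one and check the document count, assuming the es client and index_name from above:

<code python>
# Make the new documents visible to search and verify the count
es.indices.refresh(index=index_name)
print("Documents in index:", es.count(index=index_name)["count"])
</code>
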
====Search Data====
<code python>
from elasticsearch import Elasticsearch

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "..."),
    verify_certs=False
)

query = {
    "query": {
        "match": {
            # Example query against the sample documents from above
            "title": "heute"
        }
    },
    "size": 100
}

# The initial search opens a scroll context that stays alive for two minutes
response = es.search(index="test", body=query, scroll="2m")
scroll_id = response['_scroll_id']

res_c = 0
if 'hits' in response:
    for hit in response['hits']['hits']:
        res_c += 1
        print("Hit:", hit['_source'])

# Fetch the remaining batches until the scroll is exhausted
while True:
    response = es.scroll(scroll_id=scroll_id, scroll="2m")
    scroll_id = response['_scroll_id']
    if 'hits' not in response or not response['hits']['hits']:
        break
    for hit in response['hits']['hits']:
        res_c += 1
        print("Hit:", hit['_source'])

print("Total hits:", res_c)
es.clear_scroll(body={"scroll_id": scroll_id})
</code>
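The manual scroll loop can also be replaced by the scan() helper from the official Python client, which handles the scroll_id bookkeeping and the final clear_scroll internally. A minimal sketch under the same connection assumptions as above:

<code python>
from elasticsearch.helpers import scan

# scan() wraps the scroll API and yields every matching document
for hit in scan(es, index="test", query={"query": {"match": {"title": "heute"}}}):
    print(hit["_source"])
</code>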