<code>
sudo curl -X GET "https://localhost:9200/_cat/indices" --insecure -u elastic:your_secure_password
</code>
=====Security=====

[[https://www.elastic.co/docs/reference/elasticsearch/configuration-reference/security-settings|Elasticsearch Security Settings]]
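The Python examples on this page connect with ''verify_certs=False'' for brevity. A minimal sketch of a verified connection instead, assuming the CA certificate that Elasticsearch generates during setup; the path and the credentials are assumptions to adapt:

<code python>
from elasticsearch import Elasticsearch

# Verify the server certificate instead of disabling TLS checks.
# The CA path below is an assumption; adjust it to your installation.
es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "lunikoff"),
    ca_certs="/etc/elasticsearch/certs/http_ca.crt"
)

print(es.info())
</code>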
=====Service=====

=====Python=====

<code python>
import logging
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)

class Crawler:

    def __init__(self, urls=None):
        self.visited_urls = []
        self.urls_to_visit = list(urls) if urls else []

    def download_url(self, url):
        return requests.get(url).text

    def get_linked_urls(self, url, html):
        # Collect all anchor targets, resolving relative paths against the page URL.
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if path and path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def add_url_to_visit(self, url):
        if url and url not in self.visited_urls and url not in self.urls_to_visit:
            self.urls_to_visit.append(url)

    def crawl(self, url):
        html = self.download_url(url)
        for linked_url in self.get_linked_urls(url, html):
            self.add_url_to_visit(linked_url)

    def run(self):
        # Breadth-first crawl: dequeue the oldest URL, crawl it, mark it visited.
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            logging.info(f'Crawling: {url}')
            try:
                self.crawl(url)
            except Exception:
                logging.exception(f'Failed to crawl: {url}')
            finally:
                self.visited_urls.append(url)

if __name__ == '__main__':
    Crawler(urls=['https://www.imdb.com/']).run()
</code>

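The crawler above only collects links. To fill the ''test'' index used in the sections below, the crawl step could also index each page's URL and title. A sketch, not part of the original page: it assumes the ''Crawler'' class above is in scope, and the ''IndexingCrawler'' name is made up here:

<code python>
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "lunikoff"),
    verify_certs=False
)

class IndexingCrawler(Crawler):
    # Hypothetical extension: store url/title of every crawled page.
    def crawl(self, url):
        html = self.download_url(url)
        title = BeautifulSoup(html, 'html.parser').title
        es.index(index="test", body={
            "url": url,
            "title": title.get_text(strip=True) if title else ""
        })
        for linked_url in self.get_linked_urls(url, html):
            self.add_url_to_visit(linked_url)
</code>
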
====Create Index====
<code python>
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "lunikoff"),
    verify_certs=False
)
  
index_name = "test"
  
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body={
        "mappings": {
            "properties": {
                "url": {"type": "text"},
                "title": {"type": "text"}
            }
        }
    })
</code>
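
To check what was actually created, the mapping can be read back; a short usage sketch, appended to the script above:

<code python>
# Read back the mapping of the index that was just created.
print(es.indices.get_mapping(index=index_name))
</code>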
  
====Insert Data====
<code python>
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "lunikoff"),
    verify_certs=False
)
  
index_name = "test"

# Test entries as documents
documents = [
    {"url": "heute.at", "title": "Das war heute"},
    {"url": "gestern.at", "title": "Das war gestern"}
]
  
# Insert the documents into the index
for doc in documents:
    es.index(index=index_name, body=doc)
    print(f"Document added: {doc}")

print("Everything was indexed.")
</code>
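
Note that newly indexed documents only become searchable after the next index refresh (roughly once per second by default). When querying right after inserting, force a refresh; for larger batches, the bulk helper avoids one HTTP request per document. A sketch, reusing ''es'', ''index_name'' and ''documents'' from above:

<code python>
from elasticsearch.helpers import bulk

# Bulk-insert instead of one request per document.
bulk(es, ({"_index": index_name, "_source": doc} for doc in documents))

# Force a refresh so the documents are searchable immediately.
es.indices.refresh(index=index_name)
</code>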
  
====Query Data====
<code python>
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "lunikoff"),
    verify_certs=False
)

data = {
     "query": {     "query": {
         "wildcard": {         "wildcard": {
- "FolderPath": "\\\\\\\\FILESERVER\\\gbf*Walkner*" +                "url": "heute*" 
- }+        }
     },     },
-    "size": 100  +    "size": 100
 } }
  
response = es.search(index="test", body=data, scroll="1m")
  
if 'hits' not in response or len(response['hits']['hits']) == 0:
    print("No results found.")
    exit(1)

scroll_id = response['_scroll_id']
res_c = 0

for hit in response['hits']['hits']:
    res_c += 1
    print("%(url)s; %(title)s" % hit["_source"])
  
while True:
    response = es.scroll(scroll_id=scroll_id, scroll='1m')
     print("Got a response")     print("Got a response")
     if 'hits' not in response or len(response['hits']['hits']) == 0:     if 'hits' not in response or len(response['hits']['hits']) == 0:
-        break  +        break
  
    for hit in response['hits']['hits']:
        res_c += 1
        print("%(url)s; %(title)s" % hit["_source"])
  
es.clear_scroll(body={"scroll_id": scroll_id})

 print("We had", res_c, "results") print("We had", res_c, "results")
 </code> </code>
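
Wildcard queries are relatively expensive. For an analyzed ''text'' field such as ''title'', a plain ''match'' query is usually the better fit; a sketch of an alternative query body:

<code python>
# Full-text match on the analyzed title field instead of a wildcard.
data = {
    "query": {
        "match": {"title": "heute"}
    },
    "size": 100
}
</code>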

====Delete Index====

<code python>
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "secure"),
    verify_certs=False
)

index_name = input('Index name: ')

# Check whether the index exists and delete it
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
    print(f"Index '{index_name}' was deleted.")
else:
    print(f"Index '{index_name}' does not exist.")
</code>
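
The exists check can also be skipped by telling the delete call to tolerate a missing index (a sketch using the ''ignore_unavailable'' flag of the delete index API):

<code python>
# Delete without a prior exists() check; a missing index is not an error.
es.indices.delete(index=index_name, ignore_unavailable=True)
</code>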

====Scroll Query====
<code python>