  
<code python>
import logging
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)

class Crawler:

    def __init__(self, urls=None):
        # No mutable default argument; copy the start list
        self.visited_urls = []
        self.urls_to_visit = list(urls or [])

    def download_url(self, url):
        return requests.get(url).text

    def get_linked_urls(self, url, html):
        # Yield every link on the page; relative paths are resolved
        # against the page URL
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if path and path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def add_url_to_visit(self, url):
        if url not in self.visited_urls and url not in self.urls_to_visit:
            self.urls_to_visit.append(url)

    def crawl(self, url):
        html = self.download_url(url)
        for url in self.get_linked_urls(url, html):
            self.add_url_to_visit(url)

    def run(self):
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            logging.info(f'Crawling: {url}')
            try:
                self.crawl(url)
            except Exception:
                logging.exception(f'Failed to crawl: {url}')
            finally:
                self.visited_urls.append(url)

if __name__ == '__main__':
    Crawler(urls=['https://www.imdb.com/']).run()
</code>
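
A possible way to tie the crawler to the Elasticsearch snippets below (a minimal sketch: ''IndexingCrawler'' is a made-up name, and it assumes the ''Crawler'' class above plus an ''es'' client and a "test" index as created in the following sections):

<code python>
# Sketch: index each crawled page's URL and <title> while crawling.
# Assumes the Crawler class above and an es client as in the snippets
# below; IndexingCrawler is a hypothetical name.
class IndexingCrawler(Crawler):

    def crawl(self, url):
        html = self.download_url(url)
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.title.string if soup.title else ''
        es.index(index='test', body={'url': url, 'title': title})
        for linked_url in self.get_linked_urls(url, html):
            self.add_url_to_visit(linked_url)
</code>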

====Create Index====
<code python>
import os
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "lunikoff"),
    verify_certs=False
)
  
index_name = "test"

if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body={
        "mappings": {
            "properties": {
                "url": {"type": "text"},
                "title": {"type": "text"}
            }
        }
    })
</code>
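
To check what was created, the mapping can be read back (a short sketch; assumes the ''es'' client and the "test" index from above):

<code python>
# Read the mapping back and show the configured field types
mapping = es.indices.get_mapping(index="test")
print(mapping["test"]["mappings"]["properties"])
</code>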
  
====Insert Data====
<code python>
import os
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "lunikoff"),
    verify_certs=False
)

index_name = "test"

# Test entries as documents
documents = [
    {"url": "heute.at", "title": "Das war heute"},
    {"url": "gestern.at", "title": "Das war gestern"},
]

# Insert the documents into the index
for doc in documents:
    es.index(index=index_name, body=doc)
    print(f"Document added: {doc}")

print("All documents have been indexed.")
</code>
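
For larger batches, the ''bulk'' helper from ''elasticsearch.helpers'' avoids one HTTP round trip per document (a sketch under the same connection assumptions as above; the two documents are made-up examples):

<code python>
from elasticsearch.helpers import bulk

# Each action carries its target index; remaining keys become _source
actions = [
    {"_index": "test", "url": "morgen.at", "title": "Das kommt morgen"},
    {"_index": "test", "url": "example.org", "title": "Example"},
]

success, errors = bulk(es, actions)
print(f"{success} documents indexed, errors: {errors}")

# Make the new documents searchable right away
es.indices.refresh(index="test")
</code>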
  
====Query Data====
<code python>
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "lunikoff"),
    verify_certs=False
)

data = {
    "query": {
        "wildcard": {
            "url": "heute*"
        }
    },
    "size": 100
}

response = es.search(index="test", body=data, scroll="1m")

if 'hits' not in response or len(response['hits']['hits']) == 0:
    print("No results.")
    exit(0)

scroll_id = response['_scroll_id']
res_c = 0

for hit in response['hits']['hits']:
    res_c += 1
    print("%(url)s; %(title)s" % hit["_source"])

while True:
    response = es.scroll(scroll_id=scroll_id, scroll="1m")
    print("Got a response")
    if 'hits' not in response or len(response['hits']['hits']) == 0:
        break

    for hit in response['hits']['hits']:
        res_c += 1
        print("%(url)s; %(title)s" % hit["_source"])

es.clear_scroll(body={"scroll_id": scroll_id})

print("We had", res_c, "results")
</code>
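
The manual scroll loop above can also be replaced by the ''scan'' helper, which drives the scroll API and clears the scroll context on its own (a sketch, same client assumptions as above):

<code python>
from elasticsearch.helpers import scan

query = {"query": {"wildcard": {"url": "heute*"}}}

# scan() fetches all pages via scrolling and yields one hit at a time
for hit in scan(es, index="test", query=query):
    print("%(url)s; %(title)s" % hit["_source"])
</code>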

====Delete Index====

<code python>
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "secure"),
    verify_certs=False
)

index_name = input('Index name: ')

# Check whether the index exists and delete it
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
    print(f"Index '{index_name}' has been deleted.")
else:
    print(f"Index '{index_name}' does not exist.")
</code>
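Since the script asks for the index name interactively, it can help to print the existing indices first (a sketch; uses the ''cat'' API of the same client):

<code python>
# List all index names before choosing one to delete
for entry in es.cat.indices(format="json"):
    print(entry["index"])
</code>
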
====Scroll Query====
<code python>