elasticsearch

Differences

This shows the differences between two versions of the page.

Previous revision: elasticsearch [2025/06/30 20:04], jango [Create Index]
Current revision: elasticsearch [2025/06/30 20:38], jango [Python]
Line 229: Line 229:
  
 =====Python=====
+
+<code python>
+import logging
+from urllib.parse import urljoin
+import requests
+from bs4 import BeautifulSoup
+
+logging.basicConfig(
+    format='%(asctime)s %(levelname)s:%(message)s',
+    level=logging.INFO)
+
+class Crawler:
+
+    def __init__(self, urls=None):
+        # Avoid a shared mutable default; start from the given seed URLs.
+        self.visited_urls = []
+        self.urls_to_visit = list(urls) if urls else []
+
+    def download_url(self, url):
+        return requests.get(url).text
+
+    def get_linked_urls(self, url, html):
+        # Yield every link target on the page, resolving relative paths against the page URL.
+        soup = BeautifulSoup(html, 'html.parser')
+        for link in soup.find_all('a'):
+            path = link.get('href')
+            if not path:
+                continue
+            if path.startswith('/'):
+                path = urljoin(url, path)
+            yield path
+
+    def add_url_to_visit(self, url):
+        # Queue a URL only once and only if it has not been crawled yet.
+        if url not in self.visited_urls and url not in self.urls_to_visit:
+            self.urls_to_visit.append(url)
+
+    def crawl(self, url):
+        html = self.download_url(url)
+        for linked_url in self.get_linked_urls(url, html):
+            self.add_url_to_visit(linked_url)
+
+    def run(self):
+        # Work through the queue; failures are logged and the URL is still marked as visited.
+        while self.urls_to_visit:
+            url = self.urls_to_visit.pop(0)
+            logging.info(f'Crawling: {url}')
+            try:
+                self.crawl(url)
+            except Exception:
+                logging.exception(f'Failed to crawl: {url}')
+            finally:
+                self.visited_urls.append(url)
+
+if __name__ == '__main__':
+    Crawler(urls=['https://www.imdb.com/']).run()
+</code>
+
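Note: the crawler added above only collects links, it does not write anything to Elasticsearch yet. As a rough sketch of how it could feed the index used further down on this page: the {"url", "title"} document shape and the connection settings are taken from the Insert Data section, while the IndexingCrawler subclass and its title extraction are illustrative assumptions, not part of this revision.

<code python>
# Illustrative sketch (assumes the Crawler class from the block above).
# Each crawled page is stored as a {"url", "title"} document, the shape used below.
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "lunikoff"),
    verify_certs=False
)

class IndexingCrawler(Crawler):
    def crawl(self, url):
        html = self.download_url(url)
        soup = BeautifulSoup(html, 'html.parser')
        # Fall back to the URL if the page has no <title>.
        title = soup.title.string.strip() if soup.title and soup.title.string else url
        es.index(index="test", body={"url": url, "title": title})
        for linked_url in self.get_linked_urls(url, html):
            self.add_url_to_visit(linked_url)

if __name__ == '__main__':
    IndexingCrawler(urls=['https://www.imdb.com/']).run()
</code>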
  
 ====Create Index====
Line 262: Line 314:
 ====Insert Data====
 <code python>
-import csv 
-import glob 
 import os
 from elasticsearch import Elasticsearch
Line 273: Line 323:
 es = Elasticsearch(
     ['https://localhost:9200'],
-    basic_auth=("elastic", "secure"),
+    basic_auth=("elastic", "lunikoff"),
     verify_certs=False
 )
  
-index_name = "fs_audit_2025-04-12"
+index_name = "test"
-csv_folder = "C:\\Scripts\\Berechtigungsaudit\\shares\\2025"
-csv_files = glob.glob(os.path.join(csv_folder, "*.csv"))
-
-if not csv_files:
-    print("No CSV files found!")
-    exit(1)
-
-for csv_file in csv_files:
-    print(f"Processing file: {csv_file}")
-    with open(csv_file, mode="r", encoding="utf-8") as file:
-        reader = csv.DictReader(file, delimiter=";")
  
-        def generate_docs():
-            for row in reader:
-                yield {
-                    "_index": index_name,
-                    "_source": {
-                        "FolderPath": row["FolderPath"],
-                        "IdentityReference": row["IdentityReference"],
-                        "FileSystemRights": row["FileSystemRights"],
-                        "IsInherited": row["IsInherited"]
-                    }
-                }
+# Test entries as documents
+documents = [
+    {"url": "heute.at", "title": "Das war heute"},
+    {"url": "gestern.at", "title": "Das war gestern"}
+]
  
-        for ok, response in streaming_bulk(es, generate_docs(), chunk_size=500):
-            if not ok:
-                print("Error while indexing:", response)
+# Insert the documents into the index
+for doc in documents:
+    es.index(index=index_name, body=doc)
+    print(f"Document added: {doc}")
 
-print("All CSV files have been indexed.")
+print("Everything has been indexed.")
 </code>
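Note: the previous revision pushed its rows in batches with streaming_bulk from elasticsearch.helpers; for anything beyond a handful of documents that is still the more efficient route than calling es.index per document. A minimal sketch of bulk-inserting the same documents list, assuming the es client, index_name and documents from the block above:

<code python>
# Sketch: bulk-index the sample documents instead of one es.index call per document.
from elasticsearch.helpers import streaming_bulk

def generate_docs():
    # Wrap each document in a bulk action that targets the index above.
    for doc in documents:
        yield {"_index": index_name, "_source": doc}

for ok, response in streaming_bulk(es, generate_docs(), chunk_size=500):
    if not ok:
        print("Error while indexing:", response)
</code>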
  
Line 318: Line 352:
 es = Elasticsearch(
     ['https://localhost:9200'],
-    basic_auth=("elastic", "secure"),
+    basic_auth=("elastic", "lunikoff"),
     verify_certs=False
 )
Line 325: Line 359:
     "query": {
         "wildcard": {
-                "FolderPath": "\\\\\\\\FILESERVER\\\gbf*Walkner*"
-        }
+                "url": "heute*"
+        }
     },
     "size": 100
 }
 
-response = es.search(index="fs_audit", body=data, scroll="1m")
+response = es.search(index="test", body=data, scroll="1m")
  
 if 'hits' not in response or len(response['hits']['hits']) == 0:
Line 342: Line 376:
 for hit in response['hits']['hits']:
     res_c += 1
-    print("%(FolderPath)s; %(IdentityReference)s; %(FileSystemRights)s; %(IsInherited)s" % hit["_source"])
+    print("%(url)s; %(title)s" % hit["_source"])
  
 while True:
Line 348: Line 382:
     print("Got a response")
     if 'hits' not in response or len(response['hits']['hits']) == 0:
         break
  
     for hit in response['hits']['hits']:
         res_c += 1
-        print("%(FolderPath)s; %(IdentityReference)s; %(FileSystemRights)s; %(IsInherited)s" % hit["_source"])
+        print("%(url)s; %(title)s" % hit["_source"])
  
 es.clear_scroll(body={"scroll_id": scroll_id})
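Note: the hunks above skip the lines that open and advance the scroll, so the initial scroll_id extraction and the es.scroll() calls never appear in this comparison. For orientation only, a minimal scroll-pagination loop with this client typically looks like the sketch below; the index name, query body and field names are the ones used above, everything else is an assumption rather than the page's exact omitted code.

<code python>
# Orientation sketch of the scroll pattern, not the exact lines omitted from this diff:
# the first search opens a scroll context, es.scroll() fetches the following pages,
# and clear_scroll releases the context once no more hits come back.
response = es.search(index="test", body=data, scroll="1m")
scroll_id = response['_scroll_id']   # assumed: scroll id taken from the first response
hits = response['hits']['hits']

while hits:
    for hit in hits:
        print("%(url)s; %(title)s" % hit["_source"])
    response = es.scroll(scroll_id=scroll_id, scroll="1m")   # fetch the next batch
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

es.clear_scroll(body={"scroll_id": scroll_id})
</code>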