=====Python=====

<code python>
import logging
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)

class Crawler:

    def __init__(self, urls=None):
        # URLs already processed and the queue of URLs still to fetch
        self.visited_urls = []
        self.urls_to_visit = urls or []

    def download_url(self, url):
        return requests.get(url).text

    def get_linked_urls(self, url, html):
        # Extract all links from the page, resolving relative paths
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if path and path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def add_url_to_visit(self, url):
        if url not in self.visited_urls and url not in self.urls_to_visit:
            self.urls_to_visit.append(url)

    def crawl(self, url):
        html = self.download_url(url)
        for linked_url in self.get_linked_urls(url, html):
            self.add_url_to_visit(linked_url)

    def run(self):
        # Work through the queue until no unvisited URLs remain
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            logging.info(f'Crawling: {url}')
            try:
                self.crawl(url)
            except Exception:
                logging.exception(f'Failed to crawl: {url}')
            finally:
                self.visited_urls.append(url)

if __name__ == '__main__':
    Crawler(urls=['https://example.com/']).run()  # placeholder start URL
</code>
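The crawler above only collects links. To feed crawled pages into Elasticsearch, the crawl step can be extended to index each fetched page. A minimal sketch, assuming the Crawler class above and the connection settings used in the sections below; the EsCrawler subclass and the title extraction are illustrative, not part of the original page:

<code python>
from elasticsearch import Elasticsearch
from bs4 import BeautifulSoup

# Assumed connection settings, matching the sections below
es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "..."),
    verify_certs=False
)

class EsCrawler(Crawler):
    # Hypothetical subclass: stores every fetched page as a url/title document
    def crawl(self, url):
        html = self.download_url(url)
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.title.string if soup.title else url
        es.index(index="test", document={"url": url, "title": title})
        for linked_url in self.get_linked_urls(url, html):
            self.add_url_to_visit(linked_url)
</code>
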
====Create Index====
<code python>
import os
from elasticsearch import Elasticsearch

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "..."),
    verify_certs=False
)

index_name = "test"

# Create the index with an explicit mapping if it does not exist yet
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body={
        "mappings": {
            "properties": {
                "url": {"type": "text"},
                "title": {"type": "text"}
            }
        }
    })
</code>
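To verify that the mapping was applied as intended, it can simply be read back. A quick sketch, reusing the es client and index_name from above:

<code python>
# Read back and print the mapping of the new index
mapping = es.indices.get_mapping(index=index_name)
print(mapping[index_name]["mappings"]["properties"])
</code>
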
====Insert Data====
<code python>
import os
from elasticsearch import Elasticsearch

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "..."),
    verify_certs=False
)

index_name = "test"

# Test entries as documents
documents = [
    {"url": "heute.at", "title": "Das war heute"},
    {"url": "gestern.at", "title": "Das war gestern"},
]

# Insert the documents into the index
for doc in documents:
    es.index(index=index_name, document=doc)
    print(f"Document added: {doc}")

print("Done.")
</code>
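Freshly indexed documents only become searchable after the next index refresh. A small sketch to force one and check the document count, assuming the es client and index_name from above:

<code python>
# Make the new documents visible to search and verify the count
es.indices.refresh(index=index_name)
print("Documents in index:", es.count(index=index_name)["count"])
</code>
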
====Search Data====
<code python>
from elasticsearch import Elasticsearch

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "..."),
    verify_certs=False
)

query = {
    "query": {
        "match": {
            # Example query against the sample documents from above
            "title": "heute"
        }
    },
    "size": 100
}

# The initial search opens a scroll context that stays alive for two minutes
response = es.search(index="test", body=query, scroll="2m")
scroll_id = response['_scroll_id']

res_c = 0
if 'hits' in response:
    for hit in response['hits']['hits']:
        res_c += 1
        print("Hit:", hit['_source'])

# Fetch the remaining batches until the scroll is exhausted
while True:
    response = es.scroll(scroll_id=scroll_id, scroll="2m")
    scroll_id = response['_scroll_id']
    if 'hits' not in response or not response['hits']['hits']:
        break
    for hit in response['hits']['hits']:
        res_c += 1
        print("Hit:", hit['_source'])

print("Total hits:", res_c)
es.clear_scroll(body={"scroll_id": scroll_id})
</code>
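The manual scroll loop can also be replaced by the scan() helper from the official Python client, which handles the scroll_id bookkeeping and the final clear_scroll internally. A minimal sketch under the same connection assumptions as above:

<code python>
from elasticsearch.helpers import scan

# scan() wraps the scroll API and yields every matching document
for hit in scan(es, index="test", query={"query": {"match": {"title": "heute"}}}):
    print(hit["_source"])
</code>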