=====Python=====

<code python>
import logging
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)

class Crawler:

    def __init__(self, urls=None):
        # Avoid a mutable default argument; copy the seed list instead.
        self.visited_urls = []
        self.urls_to_visit = list(urls or [])

    def download_url(self, url):
        return requests.get(url).text

    def get_linked_urls(self, url, html):
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if not path:
                # Skip anchors without an href attribute.
                continue
            if path.startswith('/'):
                # Turn site-relative paths into absolute URLs.
                path = urljoin(url, path)
            yield path

    def add_url_to_visit(self, url):
        if url not in self.visited_urls and url not in self.urls_to_visit:
            self.urls_to_visit.append(url)

    def crawl(self, url):
        html = self.download_url(url)
        for linked_url in self.get_linked_urls(url, html):
            self.add_url_to_visit(linked_url)

    def run(self):
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            logging.info(f'Crawling: {url}')
            try:
                self.crawl(url)
            except Exception:
                logging.exception(f'Failed to crawl: {url}')
            finally:
                self.visited_urls.append(url)

if __name__ == '__main__':
    Crawler(urls=['https://www.imdb.com/']).run()
</code>

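The crawler above only collects links. As a minimal sketch of how it could feed the index built in the following sections, the subclass below stores the url and page title of every visited page. The index name ''test'', the ''url''/''title'' fields and the connection details are assumptions taken from the examples further down; the ''Crawler'' class from the block above is assumed to be in scope.

<code python>
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Assumed connection details, matching the examples below.
es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "secure"),
    verify_certs=False
)

class IndexingCrawler(Crawler):
    """Sketch: extends the Crawler above to index every visited page."""

    def crawl(self, url):
        html = self.download_url(url)
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.title.string if soup.title else url
        # Store url and <title> in the "test" index used below.
        es.index(index="test", document={"url": url, "title": title})
        for linked_url in self.get_linked_urls(url, html):
            self.add_url_to_visit(linked_url)

if __name__ == '__main__':
    IndexingCrawler(urls=['https://www.imdb.com/']).run()
</code>
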
====Create Index====
<code python>
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "secure"),
    verify_certs=False
)

index_name = "test"

# Create the index with an explicit mapping unless it already exists.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, mappings={
        "properties": {
            "url": {"type": "text"},
            "title": {"type": "text"}
        }
    })
    print(f"Index '{index_name}' was created.")
else:
    print(f"Index '{index_name}' already exists.")
</code>

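To check that the mapping was actually applied, the index can be inspected; a short sketch using the same connection:

<code python>
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "secure"),
    verify_certs=False
)

# Print the field mapping of the "test" index.
mapping = es.indices.get_mapping(index="test")
print(mapping["test"]["mappings"]["properties"])
</code>
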
====Insert Data====
<code python>
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "secure"),
    verify_certs=False
)

index_name = "test"

# Test entries as documents
documents = [
    {"url": "heute.at", "title": "Das war heute"},
    {"url": "gestern.at", "title": "Das war gestern"},
]

# Insert the documents into the index one by one.
# Note: new documents only become searchable after the index refresh
# (once per second by default).
for doc in documents:
    es.index(index=index_name, document=doc)
    print(f"Document added: {doc}")

print("Everything was indexed.")
</code>

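Indexing document by document costs one HTTP request each. For larger amounts of data, the ''streaming_bulk'' helper sends the documents in batches; a sketch with the same test entries:

<code python>
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "secure"),
    verify_certs=False
)

documents = [
    {"url": "heute.at", "title": "Das war heute"},
    {"url": "gestern.at", "title": "Das war gestern"},
]

def generate_actions():
    # One bulk action per document, all targeting the "test" index.
    for doc in documents:
        yield {"_index": "test", "_source": doc}

# streaming_bulk yields one (ok, result) tuple per document.
for ok, result in streaming_bulk(es, generate_actions()):
    if not ok:
        print("Failed:", result)

print("Bulk indexing finished.")
</code>
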
====Query Data====
<code python>
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "secure"),
    verify_certs=False
)

# Wildcard query; scroll="1m" keeps the search context alive for one minute.
response = es.search(
    index="test",
    query={"wildcard": {"url": "heute*"}},
    size=100,
    scroll="1m"
)

if not response['hits']['hits']:
    print("No results found.")
    raise SystemExit(1)

scroll_id = response['_scroll_id']
res_c = 0

for hit in response['hits']['hits']:
    res_c += 1
    print("%(url)s; %(title)s" % hit["_source"])

# Fetch further pages until the scroll returns no more hits.
while True:
    response = es.scroll(scroll_id=scroll_id, scroll="1m")
    if not response['hits']['hits']:
        break

    # The scroll_id can change between requests, so always take the latest.
    scroll_id = response['_scroll_id']
    for hit in response['hits']['hits']:
        res_c += 1
        print("%(url)s; %(title)s" % hit["_source"])

# Release the scroll context on the server.
es.clear_scroll(scroll_id=scroll_id)

print("We had", res_c, "results")
</code>

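The scroll bookkeeping (tracking the ''scroll_id'', clearing the context) can also be left to the ''scan'' helper, which wraps the scroll API; a sketch for the same wildcard query:

<code python>
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "secure"),
    verify_certs=False
)

# scan() manages the scroll_id and clears the context when done.
res_c = 0
for hit in scan(es, index="test",
                query={"query": {"wildcard": {"url": "heute*"}}}):
    res_c += 1
    print("%(url)s; %(title)s" % hit["_source"])

print("We had", res_c, "results")
</code>
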
====Delete Index====

<code python>
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=("elastic", "secure"),
    verify_certs=False
)

index_name = input('Index name: ')

# Check whether the index exists and delete it.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
    print(f"Index '{index_name}' was deleted.")
else:
    print(f"Index '{index_name}' does not exist.")
</code>
  
====Scroll Query====