Siehe auch [[ElasticSearch]] (z.B. Skripte, API, etc. sind identisch). [[Wazuh]] verwendet [[OpenSearch]]. =====Installation===== Run a local cluster: docker run --rm -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:2.6.0 Create a Python script: from opensearchpy import OpenSearch client = OpenSearch( hosts = [{"host": "localhost", "port": 9200}], http_auth = ("admin", "admin"), use_ssl = True, verify_certs = False, ssl_assert_hostname = False, ssl_show_warn = False, ) client.info() Get some sample data, e.g. [[https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots|wikipedia-movie-plots]]. Read the data into a pandas DataFrame: import pandas as pd df = ( pd.read_csv("wiki_movie_plots_deduped.csv") .dropna() .sample(5000, random_state=42) .reset_index(drop=True) ) Create an index: body = { "mappings":{ "properties": { "title": {"type": "text", "analyzer": "english"}, "ethnicity": {"type": "text", "analyzer": "standard"}, "director": {"type": "text", "analyzer": "standard"}, "cast": {"type": "text", "analyzer": "standard"}, "genre": {"type": "text", "analyzer": "standard"}, "plot": {"type": "text", "analyzer": "english"}, "year": {"type": "integer"}, "wiki_page": {"type": "keyword"} } } } response = client.indices.create("movies", body=body) Push the data into the index: for i, row in df.iterrows(): body = { "title": row["Title"], "ethnicity": row["Origin/Ethnicity"], "director": row["Director"], "cast": row["Cast"], "genre": row["Genre"], "plot": row["Plot"], "year": row["Release Year"], "wiki_page": row["Wiki Page"] } client.index(index="movies", id=i, body=body) Insert more data in bulk: from opensearchpy.helpers import bulk bulk_data = [] for i,row in df.iterrows(): bulk_data.append( { "_index": "movies", "_id": i, "_source": { "title": row["Title"], "ethnicity": row["Origin/Ethnicity"], "director": row["Director"], "cast": row["Cast"], "genre": row["Genre"], "plot": row["Plot"], "year": row["Release Year"], "wiki_page": 
row["Wiki Page"], } } ) bulk(client, bulk_data) Count the inserted data client.indices.refresh(index="movies") client.cat.count(index="movies", format="json") Search the data resp = client.search( index="movies", body={ "query": { "bool": { "must": { "match_phrase": { "cast": "jack nicholson", } }, "filter": {"bool": {"must_not": {"match_phrase": {"director": "tim burton"}}}}, }, }, } ) resp Remove documents client.delete(index="movies", id="2500") Delete the index client.indices.delete(index='movies')