Dies ist eine alte Version des Dokuments!
Siehe auch ElasticSearch
Run a local cluster
# Start a single-node OpenSearch 2.6.0 container, publishing ports 9200 and 9600 on the host.
docker run \
    --rm \
    --publish 9200:9200 \
    --publish 9600:9600 \
    --env "discovery.type=single-node" \
    opensearchproject/opensearch:2.6.0
Create a python script
from opensearchpy import OpenSearch

# Client for the local cluster on port 9200, authenticating as admin/admin.
# TLS is enabled, but certificate verification, hostname assertion and the
# related warning are all switched off.
client = OpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    http_auth=("admin", "admin"),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# Request basic cluster information (the result is shown when run interactively).
client.info()
Get some sample data, e.g. the wikipedia-movie-plots dataset. Read the data into a pandas DataFrame.
import pandas as pd

# Load the CSV and drop every row that has a missing value.
df = pd.read_csv("wiki_movie_plots_deduped.csv").dropna()

# Keep a reproducible random sample of 5000 rows, renumbered from 0.
df = df.sample(5000, random_state=42).reset_index(drop=True)
Create an index
# Field mappings for the "movies" index: title and plot use the "english"
# analyzer, the other text fields use "standard"; year is an integer and
# wiki_page is a keyword.
properties = {
    "title": {"type": "text", "analyzer": "english"},
    "ethnicity": {"type": "text", "analyzer": "standard"},
    "director": {"type": "text", "analyzer": "standard"},
    "cast": {"type": "text", "analyzer": "standard"},
    "genre": {"type": "text", "analyzer": "standard"},
    "plot": {"type": "text", "analyzer": "english"},
    "year": {"type": "integer"},
    "wiki_page": {"type": "keyword"},
}
body = {"mappings": {"properties": properties}}

# Create the index with the mappings above.
response = client.indices.create(index="movies", body=body)
Push the data into the index
# Document field name -> DataFrame column it is read from.
field_columns = {
    "title": "Title",
    "ethnicity": "Origin/Ethnicity",
    "director": "Director",
    "cast": "Cast",
    "genre": "Genre",
    "plot": "Plot",
    "year": "Release Year",
    "wiki_page": "Wiki Page",
}

# Index the rows one request at a time, using the DataFrame index as document id.
for i, row in df.iterrows():
    body = {field: row[column] for field, column in field_columns.items()}
    client.index(index="movies", id=i, body=body)
Push the data in bulk
from opensearchpy.helpers import bulk

# One bulk action per DataFrame row, targeting the "movies" index and using
# the DataFrame index as the document id.
bulk_data = [
    {
        "_index": "movies",
        "_id": doc_id,
        "_source": {
            "title": record["Title"],
            "ethnicity": record["Origin/Ethnicity"],
            "director": record["Director"],
            "cast": record["Cast"],
            "genre": record["Genre"],
            "plot": record["Plot"],
            "year": record["Release Year"],
            "wiki_page": record["Wiki Page"],
        },
    }
    for doc_id, record in df.iterrows()
]

# Send all actions through the bulk helper.
bulk(client, bulk_data)
Count the inserted data
# Refresh the index first, then ask for the document count as JSON.
client.indices.refresh(index="movies")
client.cat.count(index="movies", format="json")
Search the data
# Required clause: the cast must contain the phrase "jack nicholson".
cast_clause = {
    "match_phrase": {
        "cast": "jack nicholson",
    }
}

# Filter clause: exclude documents whose director matches the phrase "tim burton".
director_filter = {
    "bool": {"must_not": {"match_phrase": {"director": "tim burton"}}}
}

search_body = {
    "query": {
        "bool": {
            "must": cast_clause,
            "filter": director_filter,
        },
    },
}

resp = client.search(index="movies", body=search_body)
resp
Remove documents
# Delete the single document with id "2500" from the "movies" index.
client.delete(
    id="2500",
    index="movies",
)
Delete the index
# Delete the whole "movies" index.
client.indices.delete(index="movies")