Dies ist eine alte Version des Dokuments!
Siehe auch Elasticsearch (z. B. Skripte, API etc. sind identisch). Wazuh verwendet OpenSearch.
Run a local cluster
# Start a throwaway single-node OpenSearch 2.6.0 container.
# Port 9200 = REST API, port 9600 = performance analyzer; --rm removes the
# container (and its data) once it stops.
docker run --rm -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:2.6.0
Create a python script
from opensearchpy import OpenSearch

# Connect to the local single-node demo cluster started above.
# TLS verification is switched off because the demo container ships a
# self-signed certificate; the default admin/admin credentials are used.
# Do NOT use these settings against anything but a throwaway local node.
client = OpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    http_auth=("admin", "admin"),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# Quick smoke test: ask the node for its version/cluster info.
client.info()
Get some sample data, e.g. the wikipedia-movie-plots dataset. Read the data into a pandas DataFrame.
import pandas as pd

# Load the movie-plots CSV, drop rows with missing values, and keep a
# reproducible random sample of 5000 movies (fixed seed) with a clean
# 0..4999 index so the row index can later double as document id.
raw = pd.read_csv("wiki_movie_plots_deduped.csv")
df = raw.dropna().sample(5000, random_state=42).reset_index(drop=True)
Create an index
# Index mapping for the "movies" index: the English analyzer (stemming,
# stop words) for free-text fields worth searching linguistically,
# the standard analyzer for names, and keyword for the exact wiki URL.
english_text = {"type": "text", "analyzer": "english"}
standard_text = {"type": "text", "analyzer": "standard"}

body = {
    "mappings": {
        "properties": {
            "title": english_text,
            "ethnicity": standard_text,
            "director": standard_text,
            "cast": standard_text,
            "genre": standard_text,
            "plot": english_text,
            "year": {"type": "integer"},
            "wiki_page": {"type": "keyword"},
        }
    }
}

response = client.indices.create("movies", body=body)
Push the data into the index
# Maps index field name -> DataFrame column name so the loop body stays flat.
FIELD_COLUMNS = {
    "title": "Title",
    "ethnicity": "Origin/Ethnicity",
    "director": "Director",
    "cast": "Cast",
    "genre": "Genre",
    "plot": "Plot",
    "year": "Release Year",
    "wiki_page": "Wiki Page",
}

# Index one document per row; the DataFrame row index doubles as the doc id.
# (One HTTP request per document — fine for a demo, see the bulk variant below.)
for doc_id, row in df.iterrows():
    document = {field: row[column] for field, column in FIELD_COLUMNS.items()}
    client.index(index="movies", id=doc_id, body=document)
More data in a bulk
from opensearchpy.helpers import bulk


def _movie_action(doc_id, row):
    # One bulk action per DataFrame row, addressed by the row index.
    return {
        "_index": "movies",
        "_id": doc_id,
        "_source": {
            "title": row["Title"],
            "ethnicity": row["Origin/Ethnicity"],
            "director": row["Director"],
            "cast": row["Cast"],
            "genre": row["Genre"],
            "plot": row["Plot"],
            "year": row["Release Year"],
            "wiki_page": row["Wiki Page"],
        },
    }


# Send all documents in batched bulk requests instead of one request per row.
bulk(client, [_movie_action(i, row) for i, row in df.iterrows()])
Count the inserted data
# Force a refresh so all recently indexed documents become visible to search.
client.indices.refresh(index="movies")
# Report the document count of the index as JSON.
client.cat.count(index="movies", format="json")
Search the data
# Find movies with Jack Nicholson in the cast, excluding Tim Burton films.
# The exclusion sits in the filter context, so it does not affect scoring.
cast_match = {"match_phrase": {"cast": "jack nicholson"}}
not_burton = {"bool": {"must_not": {"match_phrase": {"director": "tim burton"}}}}

query = {
    "query": {
        "bool": {
            "must": cast_match,
            "filter": not_burton,
        },
    },
}

resp = client.search(index="movies", body=query)
resp
Remove documents
# Delete the single document with id 2500 from the "movies" index.
client.delete(index="movies", id="2500")
Delete the index
# Drop the whole index: all documents, settings, and the mapping.
client.indices.delete(index='movies')
Run as root
#!/usr/bin/env bash
# setup.sh — provisions a single-host Austrian web-search stack:
# OpenSearch (via docker-compose) + Apache Nutch (local Hadoop mode),
# plus small wrapper commands under /usr/local/bin.
# Must run as root (installs packages, writes /etc, manages docker).
set -Eeuo pipefail
trap 'echo; echo "[FEHLER] setup.sh abgebrochen in Zeile $LINENO"; exit 1' ERR

if [[ "${EUID}" -ne 0 ]]; then
  echo "Bitte mit sudo/root ausführen."
  exit 1
fi

APP_ROOT="/opt/at-search"
NUTCH_VERSION="1.22"
OPENSEARCH_VERSION="1.3.20"
# Hand ownership back to the invoking user, not root, where possible.
REAL_USER="${SUDO_USER:-root}"
REAL_GROUP="$(id -gn "$REAL_USER")"

echo "[1/12] Installiere Pakete"
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y \
  curl \
  wget \
  jq \
  tar \
  gzip \
  unzip \
  ca-certificates \
  gnupg \
  lsb-release \
  software-properties-common \
  docker.io \
  docker-compose \
  openjdk-11-jdk-headless
systemctl enable docker
systemctl restart docker

echo "[2/12] Setze vm.max_map_count"
# OpenSearch requires vm.max_map_count >= 262144; persist it and apply now.
if grep -q '^vm.max_map_count=' /etc/sysctl.conf; then
  sed -i 's/^vm.max_map_count=.*/vm.max_map_count=262144/' /etc/sysctl.conf
else
  echo 'vm.max_map_count=262144' >> /etc/sysctl.conf
fi
sysctl -w vm.max_map_count=262144 >/dev/null

echo "[3/12] Setze JAVA_HOME"
# Resolve the real JDK directory behind the /usr/bin/java symlink chain.
JAVA_HOME_PATH="$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")"
cat > /etc/profile.d/at-search-java.sh <<EOF
export JAVA_HOME=${JAVA_HOME_PATH}
EOF
chmod 0644 /etc/profile.d/at-search-java.sh
export JAVA_HOME="${JAVA_HOME_PATH}"

echo "[4/12] Lege Verzeichnisse an"
mkdir -p \
  "${APP_ROOT}/"{opensearch,seeds,crawl,logs,bin,downloads,hadoop-tmp,hadoop-staging} \
  "${APP_ROOT}/opensearch/data"

echo "[5/12] Schreibe OpenSearch-Compose"
# Security plugin disabled: the node only listens on 127.0.0.1.
cat > "${APP_ROOT}/opensearch/docker-compose.yml" <<EOF
version: "3.8"
services:
  opensearch:
    image: opensearchproject/opensearch:${OPENSEARCH_VERSION}
    container_name: at-opensearch
    restart: unless-stopped
    environment:
      - cluster.name=at-search-cluster
      - node.name=at-opensearch
      - discovery.type=single-node
      - bootstrap.memory_lock=true
      - OPENSEARCH_JAVA_OPTS=-Xms1g -Xmx1g
      - DISABLE_INSTALL_DEMO_CONFIG=true
      - DISABLE_SECURITY_PLUGIN=true
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536
        hard: 65536
    volumes:
      - ${APP_ROOT}/opensearch/data:/usr/share/opensearch/data
    ports:
      - "127.0.0.1:9200:9200"
      - "127.0.0.1:9600:9600"
EOF

echo "[6/12] Setze Rechte"
chown -R "${REAL_USER}:${REAL_GROUP}" "${APP_ROOT}"
# The OpenSearch container runs as uid/gid 1000 and must own its data dir.
chown -R 1000:1000 "${APP_ROOT}/opensearch/data"
chmod 775 "${APP_ROOT}/opensearch/data"
chmod 775 "${APP_ROOT}/hadoop-tmp" "${APP_ROOT}/hadoop-staging"

echo "[7/12] Starte OpenSearch"
cd "${APP_ROOT}/opensearch"
docker-compose up -d
# Wait up to ~4 minutes (120 x 2s) for the REST endpoint to answer.
ok=0
for i in $(seq 1 120); do
  if curl -fsS "http://127.0.0.1:9200" >/dev/null 2>&1; then
    ok=1
    break
  fi
  sleep 2
done
if [[ "${ok}" != "1" ]]; then
  echo "[FEHLER] OpenSearch kam nicht hoch:"
  docker-compose logs --tail=200 opensearch || true
  exit 1
fi

echo "[8/12] Lege Index at_web neu an"
# Recreate the index from scratch; the DELETE is best-effort.
curl -fsS -X DELETE "http://127.0.0.1:9200/at_web" >/dev/null 2>&1 || true
curl -fsS -X PUT "http://127.0.0.1:9200/at_web" \
  -H 'Content-Type: application/json' \
  -d '{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0
  },
  "mappings": {
    "properties": {
      "url": {"type": "keyword"},
      "host": {"type": "keyword"},
      "site": {"type": "keyword"},
      "title": {"type": "text"},
      "content": {"type": "text"},
      "digest": {"type": "keyword"},
      "tstamp": {"type": "date", "format": "strict_date_optional_time||epoch_millis"}
    }
  }
}' >/dev/null

echo "[9/12] Lade Apache Nutch"
cd "${APP_ROOT}/downloads"
# Cache the tarball; only download when it is not present yet.
if [[ ! -f "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" ]]; then
  wget -O "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" \
    "https://dlcdn.apache.org/nutch/${NUTCH_VERSION}/apache-nutch-${NUTCH_VERSION}-bin.tar.gz"
fi
rm -rf "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}"
tar -xzf "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" -C "${APP_ROOT}"
# Stable path "nutch" -> versioned install dir.
ln -sfn "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}" "${APP_ROOT}/nutch"
mkdir -p "${APP_ROOT}/nutch/logs" "${APP_ROOT}/crawl"

echo "[10/12] Konfiguriere Nutch + Hadoop local paths"
cat > "${APP_ROOT}/nutch/conf/nutch-site.xml" <<'EOF'
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
  <property>
    <name>http.agent.name</name>
    <value>ATSearchBot</value>
  </property>
  <property>
    <name>http.robots.agents</name>
    <value>ATSearchBot,*</value>
  </property>
  <property>
    <name>plugin.includes</name>
    <value>protocol-httpclient|urlfilter-regex|parse-(html|tika)|index-(basic|anchor|more|metadata)|indexer-opensearch-1x|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
  </property>
  <property>
    <name>fetcher.threads.fetch</name>
    <value>10</value>
  </property>
  <property>
    <name>http.content.limit</name>
    <value>1048576</value>
  </property>
  <property>
    <name>db.ignore.external.links</name>
    <value>false</value>
  </property>
  <property>
    <name>db.ignore.internal.links</name>
    <value>false</value>
  </property>
  <property>
    <name>db.ignore.external.links.mode</name>
    <value>byHost</value>
  </property>
  <property>
    <name>generate.max.per.host</name>
    <value>25</value>
  </property>
  <property>
    <name>generate.max.per.domain</name>
    <value>100</value>
  </property>
  <property>
    <name>generate.count.mode</name>
    <value>byDomain</value>
  </property>
  <property>
    <name>fetcher.parse</name>
    <value>true</value>
  </property>
  <property>
    <name>parser.character.encoding.default</name>
    <value>UTF-8</value>
  </property>
</configuration>
EOF
# Run Hadoop in local (non-HDFS) mode with tmp/staging under APP_ROOT.
cat > "${APP_ROOT}/nutch/conf/core-site.xml" <<EOF
<?xml version="1.0"?>
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>file:///</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>${APP_ROOT}/hadoop-tmp</value>
  </property>
</configuration>
EOF
cat > "${APP_ROOT}/nutch/conf/mapred-site.xml" <<EOF
<?xml version="1.0"?>
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>local</value>
  </property>
  <property>
    <name>mapreduce.jobtracker.staging.root.dir</name>
    <value>${APP_ROOT}/hadoop-staging</value>
  </property>
  <property>
    <name>mapreduce.job.reduces</name>
    <value>1</value>
  </property>
</configuration>
EOF
# Crawl scope: *.at domains only; skip binary assets and tracking URLs.
cat > "${APP_ROOT}/nutch/conf/regex-urlfilter.txt" <<'EOF'
+^https?://([a-z0-9-]+\.)*(at|co\.at|gv\.at|ac\.at)(/.*)?$
-^https?://.+\.(7z|avi|bin|css|doc|docx|gif|gz|ico|iso|jar|jpeg|jpg|js|mov|mp3|mp4|pdf|png|rar|svg|tar|tgz|webp|woff|woff2|xls|xlsx|zip)(\?.*)?$
-^https?://.*(\?|&)(utm_|fbclid=|gclid=|reply=|share=|sort=|filter=|session=).*$
-.
EOF
# BUGFIX: the original heredoc emitted the XML declaration twice, which is
# not well-formed XML and breaks parsing of the index-writers configuration.
cat > "${APP_ROOT}/nutch/conf/index-writers.xml" <<'EOF'
<?xml version="1.0" encoding="UTF-8"?>
<writers xmlns="http://lucene.apache.org/nutch"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://lucene.apache.org/nutch index-writers.xsd">
  <writer id="opensearch" class="org.apache.nutch.indexwriter.opensearch1x.OpenSearch1xIndexWriter">
    <parameters>
      <param name="host" value="127.0.0.1"/>
      <param name="port" value="9200"/>
      <param name="scheme" value="http"/>
      <param name="index" value="at_web"/>
      <param name="username" value=""/>
      <param name="password" value=""/>
      <param name="trust.store.path" value=""/>
      <param name="trust.store.password" value=""/>
      <param name="trust.store.type" value="JKS"/>
      <param name="key.store.path" value=""/>
      <param name="key.store.password" value=""/>
      <param name="key.store.type" value="JKS"/>
      <param name="max.bulk.docs" value="100"/>
      <param name="max.bulk.size" value="1048576"/>
      <param name="exponential.backoff.millis" value="100"/>
      <param name="exponential.backoff.retries" value="5"/>
      <param name="bulk.close.timeout" value="600"/>
    </parameters>
    <mapping>
      <copy>
        <field source="title" dest="title,search"/>
      </copy>
      <rename />
      <remove>
        <field source="segment"/>
      </remove>
    </mapping>
  </writer>
</writers>
EOF
cat > "${APP_ROOT}/seeds/seed.txt" <<'EOF'
https://www.oesterreich.gv.at/
https://www.parlament.gv.at/
https://www.orf.at/
https://www.derstandard.at/
https://www.wien.gv.at/
https://www.arbeiterkammer.at/
https://www.wko.at/
https://www.ams.at/
https://www.univie.ac.at/
https://www.tuwien.at/
EOF

echo "[11/12] Schreibe Wrapper"
cat > /usr/local/bin/at-search-crawl <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}"
export NUTCH_HOME="/opt/at-search/nutch"
export NUTCH_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp"
mkdir -p /opt/at-search/crawl /opt/at-search/nutch/logs /opt/at-search/hadoop-tmp /opt/at-search/hadoop-staging
# Optional first argument: number of crawl rounds (default 2).
# The original wrapper hardcoded 2 and silently ignored its argument.
exec "${NUTCH_HOME}/bin/crawl" -i -s /opt/at-search/seeds /opt/at-search/crawl "${1:-2}"
EOF
cat > /usr/local/bin/at-search-reindex-last <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}"
export NUTCH_HOME="/opt/at-search/nutch"
export NUTCH_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp"
# Pick the newest crawl segment (names sort chronologically) and re-index it.
SEG="$(ls -1 /opt/at-search/crawl/segments | sort | tail -n1)"
exec "${NUTCH_HOME}/bin/nutch" index /opt/at-search/crawl/crawldb -linkdb /opt/at-search/crawl/linkdb "/opt/at-search/crawl/segments/${SEG}" -deleteGone
EOF
cat > /usr/local/bin/at-search-query <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
if [[ $# -lt 1 ]]; then
  echo 'Nutzung: at-search-query "suchbegriff"'
  exit 1
fi
QUERY="$*"
# Build the search body with jq so the query string is safely JSON-escaped.
BODY="$(jq -n --arg q "$QUERY" '{
  size: 10,
  query: { multi_match: { query: $q, fields: ["title^3","content","site^2","host"] } },
  _source: ["title","url","host","site"]
}')"
curl -fsS "http://127.0.0.1:9200/at_web/_search" \
  -H 'Content-Type: application/json' \
  -d "${BODY}" | jq .
EOF
cat > /usr/local/bin/at-search-status <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
echo "=== OpenSearch ==="
curl -s http://127.0.0.1:9200 | jq .
echo
echo "=== Count ==="
curl -s http://127.0.0.1:9200/at_web/_count | jq .
echo
echo "=== Segments ==="
ls -1 /opt/at-search/crawl/segments 2>/dev/null || true
EOF
chmod +x /usr/local/bin/at-search-crawl
chmod +x /usr/local/bin/at-search-reindex-last
chmod +x /usr/local/bin/at-search-query
chmod +x /usr/local/bin/at-search-status

echo "[12/12] Setze finale Rechte"
chown -R "${REAL_USER}:${REAL_GROUP}" "${APP_ROOT}"
chown -R 1000:1000 "${APP_ROOT}/opensearch/data"
chmod +x "${APP_ROOT}/nutch/bin/nutch" "${APP_ROOT}/nutch/bin/crawl"

cat <<EOF

==== FERTIG ====
Jetzt:
  at-search-status
  at-search-crawl
  at-search-status
  at-search-query "wien"

EOF
# Smoke test: is the OpenSearch REST endpoint answering?
curl http://127.0.0.1:9200
# Current number of documents in the crawl index.
curl http://127.0.0.1:9200/at_web/_count | jq
# Overview: cluster info, document count, crawl segments.
at-search-status
# Start a crawl; the argument is meant as the number of rounds
# (NOTE(review): the stock wrapper hardcodes 2 rounds and ignores it).
at-search-crawl 2
at-search-status
# Full-text search over title/content/site/host.
at-search-query "wien"
# Push the newest crawl segment into the index again.
at-search-reindex-last
# Temporarily crawl only university sites: back up the current seed list,
cp /opt/at-search/seeds/seed.txt /opt/at-search/seeds/seed.txt.bak
# replace it with a university-only seed list,
cat > /opt/at-search/seeds/seed.txt <<'EOF'
https://www.univie.ac.at/
https://studieren.univie.ac.at/
https://www.tuwien.at/
https://www.tuwien.at/studium/
EOF
# crawl (argument meant as rounds; NOTE(review): the stock wrapper ignores it),
at-search-crawl 3
# and restore the original seed list afterwards.
mv /opt/at-search/seeds/seed.txt.bak /opt/at-search/seeds/seed.txt