Benutzer-Werkzeuge

Webseiten-Werkzeuge


opensearch

Unterschiede

Hier werden die Unterschiede zwischen zwei Versionen angezeigt.

Link zu dieser Vergleichsansicht

Beide Seiten der vorigen Revision Vorhergehende Überarbeitung
opensearch [2026/04/12 14:16]
jango [Production]
opensearch [2026/04/12 15:27] (aktuell)
jango [v2]
Zeile 568: Zeile 568:
  
 ===v2=== ===v2===
 +
 +<code bash>
#!/usr/bin/env bash
set -Eeuo pipefail

# -E propagates the ERR trap into functions/subshells; report the failing line.
trap 'echo; echo "[FEHLER] setup.sh abgebrochen in Zeile $LINENO"; exit 1' ERR

# Root is required: apt, systemctl, sysctl and /usr/local/bin writes below.
if (( EUID != 0 )); then
  echo "Bitte mit sudo/root ausführen."
  exit 1
fi

APP_ROOT="/opt/at-search"
NUTCH_VERSION="1.22"
OPENSEARCH_VERSION="1.3.20"
# When run via sudo, hand ownership back to the invoking user later on.
REAL_USER="${SUDO_USER:-root}"
REAL_GROUP="$(id -gn "$REAL_USER" 2>/dev/null || echo root)"
 +
echo "[1/14] Installiere Pakete"
export DEBIAN_FRONTEND=noninteractive
apt-get update

# Everything needed for Docker/OpenSearch, Nutch (JDK 11) and the Python venv.
packages=(
  curl
  wget
  jq
  tar
  gzip
  unzip
  ca-certificates
  gnupg
  lsb-release
  software-properties-common
  docker.io
  docker-compose
  openjdk-11-jdk-headless
  python3
  python3-venv
  python3-pip
)
apt-get install -y "${packages[@]}"

systemctl enable docker
systemctl restart docker
 +
echo "[2/14] Setze vm.max_map_count"
# OpenSearch needs vm.max_map_count >= 262144; persist it, then apply at once.
if ! grep -q '^vm.max_map_count=' /etc/sysctl.conf; then
  echo 'vm.max_map_count=262144' >> /etc/sysctl.conf
else
  sed -i 's/^vm.max_map_count=.*/vm.max_map_count=262144/' /etc/sysctl.conf
fi
sysctl -w vm.max_map_count=262144 >/dev/null
 +
echo "[3/14] Setze JAVA_HOME"
# Resolve the JDK root behind the /usr/bin/java alternatives symlink chain.
java_real="$(readlink -f /usr/bin/java)"
java_bindir="$(dirname "${java_real}")"
JAVA_HOME_PATH="$(dirname "${java_bindir}")"
# Persist for login shells and export for the remainder of this script.
printf 'export JAVA_HOME=%s\n' "${JAVA_HOME_PATH}" > /etc/profile.d/at-search-java.sh
chmod 0644 /etc/profile.d/at-search-java.sh
export JAVA_HOME="${JAVA_HOME_PATH}"
 +
echo "[4/14] Lege Verzeichnisse an"
# Full application tree, including the OpenSearch data dir.
for subdir in opensearch opensearch/data seeds crawl logs bin downloads \
              hadoop-tmp hadoop-staging enrich; do
  mkdir -p "${APP_ROOT}/${subdir}"
done

echo "[5/14] Python-Venv für Enrichment"
python3 -m venv "${APP_ROOT}/venv"
"${APP_ROOT}/venv/bin/pip" install --upgrade pip
"${APP_ROOT}/venv/bin/pip" install requests
 +
echo "[6/14] Schreibe OpenSearch-Compose"
# Unquoted heredoc delimiter on purpose: ${OPENSEARCH_VERSION} and ${APP_ROOT}
# are expanded now, when the compose file is written.
cat > "${APP_ROOT}/opensearch/docker-compose.yml" <<EOF
version: "3.8"

services:
  opensearch:
    image: opensearchproject/opensearch:${OPENSEARCH_VERSION}
    container_name: at-opensearch
    restart: unless-stopped
    environment:
      - cluster.name=at-search-cluster
      - node.name=at-opensearch
      - discovery.type=single-node
      - bootstrap.memory_lock=true
      - OPENSEARCH_JAVA_OPTS=-Xms1g -Xmx1g
      - DISABLE_INSTALL_DEMO_CONFIG=true
      - DISABLE_SECURITY_PLUGIN=true
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536
        hard: 65536
    volumes:
      - ${APP_ROOT}/opensearch/data:/usr/share/opensearch/data
    ports:
      - "9200:9200"
      - "9600:9600"
EOF

echo "[7/14] Setze Rechte"
# App tree belongs to the invoking user; the data dir must be writable by
# UID 1000 (the opensearch user inside the official container image).
chown -R "${REAL_USER}:${REAL_GROUP}" "${APP_ROOT}"
chown -R 1000:1000 "${APP_ROOT}/opensearch/data"
chmod 775 "${APP_ROOT}/opensearch/data"
chmod 775 "${APP_ROOT}/hadoop-tmp" "${APP_ROOT}/hadoop-staging"
 +
echo "[8/14] Starte OpenSearch"
cd "${APP_ROOT}/opensearch"
docker-compose up -d

# Poll the REST endpoint for up to 120 * 2s = 4 minutes.
# C-style loop instead of $(seq ...): no extra process, no unused loop variable.
ok=0
for (( attempt = 1; attempt <= 120; attempt++ )); do
  if curl -fsS "http://127.0.0.1:9200" >/dev/null 2>&1; then
    ok=1
    break
  fi
  sleep 2
done

if [[ "${ok}" != "1" ]]; then
  # Diagnostics belong on stderr; dump container logs for debugging.
  echo "[FEHLER] OpenSearch kam nicht hoch:" >&2
  docker-compose logs --tail=200 opensearch || true
  exit 1
fi
 +
echo "[9/14] Lege Index at_web neu an"
# Destructive by design: drop any existing index (404 is fine), then recreate
# with explicit mappings so the enrichment fields get their exact types.
# Single shard / zero replicas suits a single-node setup.
curl -fsS -X DELETE "http://127.0.0.1:9200/at_web" >/dev/null 2>&1 || true
curl -fsS -X PUT "http://127.0.0.1:9200/at_web" \
  -H 'Content-Type: application/json' \
  -d '{
    "settings": {
      "number_of_shards": 1,
      "number_of_replicas": 0
    },
    "mappings": {
      "properties": {
        "url": {"type": "keyword"},
        "host": {"type": "keyword"},
        "site": {"type": "keyword"},
        "title": {"type": "text"},
        "content": {"type": "text"},
        "digest": {"type": "keyword"},
        "tstamp": {"type": "date", "format": "strict_date_optional_time||epoch_millis"},
        "url_depth": {"type": "integer"},
        "is_root": {"type": "boolean"},
        "content_length": {"type": "integer"},
        "domain_class": {"type": "keyword"},
        "host_rank": {"type": "float"}
      }
    }
  }' >/dev/null
 +
echo "[10/14] Lade Apache Nutch"
cd "${APP_ROOT}/downloads"
tarball="apache-nutch-${NUTCH_VERSION}-bin.tar.gz"
# Download once; re-runs reuse the cached archive.
if [[ ! -f "${tarball}" ]]; then
  wget -O "${tarball}" "https://dlcdn.apache.org/nutch/${NUTCH_VERSION}/${tarball}"
fi

# Unpack fresh and point the stable "nutch" symlink at this version.
rm -rf "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}"
tar -xzf "${tarball}" -C "${APP_ROOT}"
ln -sfn "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}" "${APP_ROOT}/nutch"

mkdir -p "${APP_ROOT}/nutch/logs" "${APP_ROOT}/crawl"
 +
echo "[11/14] Konfiguriere Nutch + Hadoop"
# Quoted 'EOF': nutch-site.xml is written verbatim (no shell expansion).
cat > "${APP_ROOT}/nutch/conf/nutch-site.xml" <<'EOF'
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
  <property>
    <name>http.agent.name</name>
    <value>ATSearchBot</value>
  </property>

  <property>
    <name>http.robots.agents</name>
    <value>ATSearchBot,*</value>
  </property>

  <property>
    <name>plugin.includes</name>
    <value>protocol-httpclient|urlfilter-regex|parse-(html|tika)|index-(basic|anchor|more|metadata)|indexer-opensearch-1x|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
  </property>

  <property>
    <name>fetcher.threads.fetch</name>
    <value>10</value>
  </property>

  <property>
    <name>http.content.limit</name>
    <value>1048576</value>
  </property>

  <property>
    <name>db.ignore.external.links</name>
    <value>false</value>
  </property>

  <property>
    <name>db.ignore.internal.links</name>
    <value>false</value>
  </property>

  <property>
    <name>db.ignore.external.links.mode</name>
    <value>byHost</value>
  </property>

  <property>
    <name>generate.max.per.host</name>
    <value>200</value>
  </property>

  <property>
    <name>generate.max.per.domain</name>
    <value>1000</value>
  </property>

  <property>
    <name>generate.count.mode</name>
    <value>byDomain</value>
  </property>

  <property>
    <name>fetcher.parse</name>
    <value>true</value>
  </property>

  <property>
    <name>parser.character.encoding.default</name>
    <value>UTF-8</value>
  </property>
</configuration>
EOF

# Unquoted delimiter: ${APP_ROOT} is expanded into the Hadoop configs below.
# Local-mode Hadoop: local filesystem, local MR framework, one reducer.
cat > "${APP_ROOT}/nutch/conf/core-site.xml" <<EOF
<?xml version="1.0"?>
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>file:///</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>${APP_ROOT}/hadoop-tmp</value>
  </property>
</configuration>
EOF

cat > "${APP_ROOT}/nutch/conf/mapred-site.xml" <<EOF
<?xml version="1.0"?>
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>local</value>
  </property>
  <property>
    <name>mapreduce.jobtracker.staging.root.dir</name>
    <value>${APP_ROOT}/hadoop-staging</value>
  </property>
  <property>
    <name>mapreduce.job.reduces</name>
    <value>1</value>
  </property>
</configuration>
EOF

# URL filter, evaluated top-down, first match wins:
# allow .at-TLD hosts, drop binary/static assets, tracking params,
# listing/archive paths and pagination; final "-." rejects everything else.
cat > "${APP_ROOT}/nutch/conf/regex-urlfilter.txt" <<'EOF'
+^https?://([a-z0-9-]+\.)*(at|co\.at|gv\.at|ac\.at)(/.*)?$
-^https?://.+\.(7z|avi|bin|css|doc|docx|gif|gz|ico|iso|jar|jpeg|jpg|js|mov|mp3|mp4|pdf|png|rar|svg|tar|tgz|webp|woff|woff2|xls|xlsx|zip)(\?.*)?$
-^https?://.*(\?|&)(utm_|fbclid=|gclid=|reply=|share=|sort=|filter=|session=).*$
-^https?://.*/(tag|tags|category|categories|thema|themen|archiv|archive|suche|search|index|author|feed|rss)(/.*)?$
-^https?://.*/(page|seite)/[0-9]+/?$
-.
EOF
 +
# Indexer config for the indexer-opensearch-1x plugin: plain HTTP to the local
# single-node instance (security plugin disabled), index "at_web".
# Quoted 'EOF': written verbatim.
cat > "${APP_ROOT}/nutch/conf/index-writers.xml" <<'EOF'
<?xml version="1.0" encoding="UTF-8"?>
<writers xmlns="http://lucene.apache.org/nutch"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://lucene.apache.org/nutch index-writers.xsd">

  <writer id="opensearch"
          class="org.apache.nutch.indexwriter.opensearch1x.OpenSearch1xIndexWriter">
    <parameters>
      <param name="host" value="127.0.0.1"/>
      <param name="port" value="9200"/>
      <param name="scheme" value="http"/>
      <param name="index" value="at_web"/>
      <param name="username" value=""/>
      <param name="password" value=""/>
      <param name="trust.store.path" value=""/>
      <param name="trust.store.password" value=""/>
      <param name="trust.store.type" value="JKS"/>
      <param name="key.store.path" value=""/>
      <param name="key.store.password" value=""/>
      <param name="key.store.type" value="JKS"/>
      <param name="max.bulk.docs" value="100"/>
      <param name="max.bulk.size" value="1048576"/>
      <param name="exponential.backoff.millis" value="100"/>
      <param name="exponential.backoff.retries" value="5"/>
      <param name="bulk.close.timeout" value="600"/>
    </parameters>

    <mapping>
      <copy>
        <field source="title" dest="title,search"/>
      </copy>
      <rename />
      <remove>
        <field source="segment"/>
      </remove>
    </mapping>
  </writer>

</writers>
EOF

# Initial crawl seeds: major Austrian news, government and university sites.
cat > "${APP_ROOT}/seeds/seed.txt" <<'EOF'
https://www.derstandard.at/
https://orf.at/
https://www.parlament.gv.at/
https://www.tuwien.at/
https://www.univie.ac.at/
https://www.wko.at/
https://www.arbeiterkammer.at/
https://www.ams.at/
https://www.wien.gv.at/
https://www.oesterreich.gv.at/
EOF
 +
echo "[12/14] Schreibe Enrichment-Script"
# Quoted 'EOF': the Python source is written verbatim, no shell expansion.
cat > "${APP_ROOT}/enrich/enrich_index.py" <<'EOF'
#!/usr/bin/env python3
"""Post-crawl enrichment: pages through the at_web index and adds the
ranking fields (url_depth, is_root, content_length, domain_class, host_rank)
to every document via the _bulk API."""
import json
from urllib.parse import urlparse
import requests

OS_URL = "http://127.0.0.1:9200"
INDEX = "at_web"
BATCH = 200

def domain_class(host: str) -> str:
    """Coarse domain category used for query-time boosting."""
    host = (host or "").lower()
    if host.endswith(".gv.at") or host == "www.parlament.gv.at":
        return "gov"
    if host.endswith(".ac.at"):
        return "edu"
    if "orf.at" in host or "derstandard.at" in host:
        return "news"
    if "wko.at" in host:
        return "commerce"
    return "general"

def host_rank(host: str) -> float:
    """Hand-tuned static rank multiplier per host."""
    host = (host or "").lower()
    if host.endswith(".gv.at") or host == "www.parlament.gv.at":
        return 3.0
    if host.endswith(".ac.at"):
        return 2.5
    if host == "www.orf.at":
        return 2.0
    if host == "www.derstandard.at":
        return 1.8
    if host == "www.wko.at":
        return 1.4
    return 1.0

def compute_fields(src: dict) -> dict:
    """Derive the enrichment fields from a doc's url/host/content source."""
    url = src.get("url", "") or ""
    content = src.get("content", "") or ""
    parsed = urlparse(url)
    path = parsed.path or "/"
    stripped = path.strip("/")
    # Depth = number of non-empty path segments; root pages have depth 0.
    depth = 0 if stripped == "" else len([p for p in stripped.split("/") if p])
    is_root = stripped == ""
    return {
        "url_depth": depth,
        "is_root": is_root,
        "content_length": len(content),
        "domain_class": domain_class(src.get("host", "")),
        "host_rank": host_rank(src.get("host", "")),
    }

def main():
    session = requests.Session()
    search_after = None
    total = 0

    while True:
        # search_after pagination over the whole index.
        # NOTE(review): sorting on _id relies on fielddata and is deprecated
        # in ES-7.x-based engines; consider a dedicated keyword tiebreaker.
        body = {
            "size": BATCH,
            "sort": [{"_id": "asc"}],
            "_source": ["url", "host", "content"],
            "query": {"match_all": {}}
        }
        if search_after is not None:
            body["search_after"] = search_after

        r = session.post(f"{OS_URL}/{INDEX}/_search", json=body, timeout=30)
        r.raise_for_status()
        data = r.json()
        hits = data.get("hits", {}).get("hits", [])
        if not hits:
            break

        lines = []
        for hit in hits:
            doc_id = hit["_id"]
            src = hit.get("_source", {})
            fields = compute_fields(src)
            meta = {"update": {"_index": INDEX, "_id": doc_id}}
            doc = {"doc": fields}
            lines.append(json.dumps(meta))
            lines.append(json.dumps(doc))
            total += 1

        payload = "\n".join(lines) + "\n"
        br = session.post(
            f"{OS_URL}/_bulk",
            data=payload,
            headers={"Content-Type": "application/x-ndjson"},
            timeout=60,
        )
        br.raise_for_status()
        # BUGFIX: _bulk answers HTTP 200 even when individual updates fail,
        # so the per-item results must be inspected explicitly.
        bulk_result = br.json()
        if bulk_result.get("errors"):
            failed = [
                item["update"]["error"]
                for item in bulk_result.get("items", [])
                if item.get("update", {}).get("error")
            ]
            raise RuntimeError(
                f"Bulk update failed for {len(failed)} documents, "
                f"first errors: {failed[:3]}"
            )

        search_after = hits[-1]["sort"]

    print(f"Enriched documents: {total}")

if __name__ == "__main__":
    main()
EOF
chmod +x "${APP_ROOT}/enrich/enrich_index.py"
 +
echo "[13/14] Schreibe Wrapper"
# Quoted 'EOF': the wrapper is written verbatim; all expansion happens when
# the wrapper itself runs. Runs N crawl rounds (default 5), then enriches.
cat > /usr/local/bin/at-search-crawl <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}"
export NUTCH_HOME="/opt/at-search/nutch"
export NUTCH_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp"

ROUNDS="${1:-5}"

mkdir -p /opt/at-search/crawl /opt/at-search/nutch/logs /opt/at-search/hadoop-tmp /opt/at-search/hadoop-staging

"${NUTCH_HOME}/bin/crawl" -i -s /opt/at-search/seeds /opt/at-search/crawl "${ROUNDS}"
/opt/at-search/venv/bin/python /opt/at-search/enrich/enrich_index.py
EOF
 +
# Re-index only the newest crawl segment, then re-run enrichment.
# Quoted 'EOF': written verbatim.
cat > /usr/local/bin/at-search-reindex-last <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}"
export NUTCH_HOME="/opt/at-search/nutch"
export NUTCH_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp"

# BUGFIX: pick the newest segment via glob instead of parsing ls output;
# segment names are timestamps, so lexicographic order == chronological order.
# Also fail with a clear message when no segment exists yet.
shopt -s nullglob
segments=(/opt/at-search/crawl/segments/*/)
if (( ${#segments[@]} == 0 )); then
  echo "[FEHLER] Keine Segmente unter /opt/at-search/crawl/segments gefunden." >&2
  exit 1
fi
SEG="$(basename "${segments[-1]}")"
"${NUTCH_HOME}/bin/nutch" index /opt/at-search/crawl/crawldb -linkdb /opt/at-search/crawl/linkdb "/opt/at-search/crawl/segments/${SEG}" -deleteGone
/opt/at-search/venv/bin/python /opt/at-search/enrich/enrich_index.py
EOF
 +
# Thin wrapper: run the enrichment script with the project venv's Python.
cat > /usr/local/bin/at-search-enrich <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
exec /opt/at-search/venv/bin/python /opt/at-search/enrich/enrich_index.py
EOF

# Drop and recreate the index with empty mappings-compatible settings.
# NOTE(review): this JSON duplicates the mapping from step [9/14] — keep the
# two in sync when either changes.
cat > /usr/local/bin/at-search-reset-index <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
curl -s -X DELETE http://127.0.0.1:9200/at_web >/dev/null 2>&1 || true
curl -s -X PUT http://127.0.0.1:9200/at_web -H 'Content-Type: application/json' -d '{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0
  },
  "mappings": {
    "properties": {
      "url": {"type": "keyword"},
      "host": {"type": "keyword"},
      "site": {"type": "keyword"},
      "title": {"type": "text"},
      "content": {"type": "text"},
      "digest": {"type": "keyword"},
      "tstamp": {"type": "date", "format": "strict_date_optional_time||epoch_millis"},
      "url_depth": {"type": "integer"},
      "is_root": {"type": "boolean"},
      "content_length": {"type": "integer"},
      "domain_class": {"type": "keyword"},
      "host_rank": {"type": "float"}
    }
  }
}' | jq .
EOF
 +
# CLI search: builds a function_score query with jq (safe interpolation of the
# user's search term via --arg) and prints the top 10 hits.
# NOTE(review): the url_depth factor gives *deeper* URLs a higher score and
# root pages are excluded outright via must_not is_root — confirm intended.
cat > /usr/local/bin/at-search-query <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
if [[ $# -lt 1 ]]; then
  echo 'Nutzung: at-search-query "suchbegriff"'
  exit 1
fi
QUERY="$*"
BODY="$(jq -n --arg q "$QUERY" '{
  size: 10,
  query: {
    function_score: {
      query: {
        bool: {
          must: [
            {
              multi_match: {
                query: $q,
                fields: ["title^8","content^2","site","host"],
                type: "best_fields"
              }
            }
          ],
          must_not: [
            { term: { is_root: true } },
            { regexp: { url: ".*/(tag|tags|category|categories|thema|themen|archiv|archive|suche|search|index|author|feed|rss)(/.*)?" } },
            { regexp: { url: ".*/(page|seite)/[0-9]+/?" } }
          ],
          should: [
            { match_phrase: { title: { query: $q, boost: 10 } } }
          ]
        }
      },
      functions: [
        { filter: { term: { domain_class: "gov" } }, weight: 3.0 },
        { filter: { term: { domain_class: "edu" } }, weight: 2.5 },
        { filter: { term: { domain_class: "news" } }, weight: 1.5 },
        { field_value_factor: { field: "host_rank", factor: 1.0, missing: 1.0 } },
        { field_value_factor: { field: "url_depth", factor: 0.25, missing: 0 } },
        { field_value_factor: { field: "content_length", factor: 0.0005, modifier: "log1p", missing: 1 } }
      ],
      score_mode: "sum",
      boost_mode: "sum"
    }
  },
  _source: ["title","url","host","site","url_depth","is_root","domain_class","host_rank"]
}')"
curl -fsS "http://127.0.0.1:9200/at_web/_search" \
  -H 'Content-Type: application/json' \
  -d "${BODY}" | jq .
EOF
 +
# Quick health overview: cluster banner, document count, crawl segments.
cat > /usr/local/bin/at-search-status <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
echo "=== OpenSearch ==="
curl -s http://127.0.0.1:9200 | jq .
echo
echo "=== Count ==="
curl -s http://127.0.0.1:9200/at_web/_count | jq .
echo
echo "=== Segments ==="
ls -1 /opt/at-search/crawl/segments 2>/dev/null || true
EOF
 +
# Make every installed wrapper executable.
for tool in at-search-crawl at-search-reindex-last at-search-query \
            at-search-status at-search-enrich at-search-reset-index; do
  chmod +x "/usr/local/bin/${tool}"
done

echo "[14/14] Setze finale Rechte"
# Re-apply ownership after the Nutch extraction; data dir stays UID 1000
# for the container, and the Nutch launchers must be executable.
chown -R "${REAL_USER}:${REAL_GROUP}" "${APP_ROOT}"
chown -R 1000:1000 "${APP_ROOT}/opensearch/data"
chmod +x "${APP_ROOT}/nutch/bin/nutch" "${APP_ROOT}/nutch/bin/crawl"
 +
# Closing banner with the suggested first commands (no variables inside, so
# the unquoted EOF delimiter is harmless here).
cat <<EOF

==== FERTIG ====

Jetzt:
  at-search-status
  at-search-crawl 5
  at-search-status
  at-search-query "wien"

EOF
 +</code>
opensearch.txt · Zuletzt geändert: 2026/04/12 15:27 von jango