Benutzer-Werkzeuge

Webseiten-Werkzeuge


opensearch

Unterschiede

Hier werden die Unterschiede zwischen zwei Versionen angezeigt.

Link zu dieser Vergleichsansicht

Beide Seiten der vorigen Revision Vorhergehende Überarbeitung
Nächste Überarbeitung
Vorhergehende Überarbeitung
opensearch [2026/04/12 11:53]
jango
opensearch [2026/04/12 15:27] (aktuell)
jango [v2]
Zeile 142: Zeile 142:
 ====Production==== ====Production====
  
 +===v1===
 Run as root Run as root
 <code bash> <code bash>
Zeile 157: Zeile 158:
 NUTCH_VERSION="1.22" NUTCH_VERSION="1.22"
 OPENSEARCH_VERSION="1.3.20" OPENSEARCH_VERSION="1.3.20"
-JAVA_PKG="openjdk-11-jre-headless+REAL_USER="${SUDO_USER:-root}
-OS_UID="1000" +REAL_GROUP="$(id -gn "$REAL_USER")"
-OS_GID="1000"+
  
-echo "[1/13Systempakete installieren"+echo "[1/12Installiere Pakete"
 export DEBIAN_FRONTEND=noninteractive export DEBIAN_FRONTEND=noninteractive
 apt-get update apt-get update
Zeile 177: Zeile 177:
   docker.io \   docker.io \
   docker-compose \   docker-compose \
-  "${JAVA_PKG}"+  openjdk-11-jdk-headless
  
 systemctl enable docker systemctl enable docker
 systemctl restart docker systemctl restart docker
  
-echo "[2/13Host-Kernel-Setting für OpenSearch setzen"+echo "[2/12Setze vm.max_map_count"
 if grep -q '^vm.max_map_count=' /etc/sysctl.conf; then if grep -q '^vm.max_map_count=' /etc/sysctl.conf; then
   sed -i 's/^vm.max_map_count=.*/vm.max_map_count=262144/' /etc/sysctl.conf   sed -i 's/^vm.max_map_count=.*/vm.max_map_count=262144/' /etc/sysctl.conf
Zeile 190: Zeile 190:
 sysctl -w vm.max_map_count=262144 >/dev/null sysctl -w vm.max_map_count=262144 >/dev/null
  
-echo "[3/13Verzeichnisstruktur anlegen"+echo "[3/12Setze JAVA_HOME" 
 +JAVA_HOME_PATH="$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")" 
 +cat > /etc/profile.d/at-search-java.sh <<EOF 
 +export JAVA_HOME=${JAVA_HOME_PATH} 
 +EOF 
 +chmod 0644 /etc/profile.d/at-search-java.sh 
 +export JAVA_HOME="${JAVA_HOME_PATH}" 
 + 
 +echo "[4/12] Lege Verzeichnisse an"
 mkdir -p \ mkdir -p \
-  "${APP_ROOT}/"{opensearch,seeds,crawl,logs,bin,downloads} \+  "${APP_ROOT}/"{opensearch,seeds,crawl,logs,bin,downloads,hadoop-tmp,hadoop-staging} \
   "${APP_ROOT}/opensearch/data"   "${APP_ROOT}/opensearch/data"
  
-echo "[4/13Rechte für OpenSearch-Datenverzeichnis korrekt setzen" +echo "[5/12Schreibe OpenSearch-Compose"
-chown -R "${OS_UID}:${OS_GID}" "${APP_ROOT}/opensearch/data" +
-chmod 775 "${APP_ROOT}/opensearch/data" +
- +
-echo "[5/13] Docker-Compose für OpenSearch schreiben"+
 cat > "${APP_ROOT}/opensearch/docker-compose.yml" <<EOF cat > "${APP_ROOT}/opensearch/docker-compose.yml" <<EOF
 version: "3.8" version: "3.8"
Zeile 230: Zeile 234:
 EOF EOF
  
-echo "[6/13] OpenSearch starten"+echo "[6/12Setze Rechte" 
 +chown -R "${REAL_USER}:${REAL_GROUP}" "${APP_ROOT}" 
 +chown -R 1000:1000 "${APP_ROOT}/opensearch/data" 
 +chmod 775 "${APP_ROOT}/opensearch/data" 
 +chmod 775 "${APP_ROOT}/hadoop-tmp" "${APP_ROOT}/hadoop-staging" 
 + 
 +echo "[7/12] Starte OpenSearch"
 cd "${APP_ROOT}/opensearch" cd "${APP_ROOT}/opensearch"
 docker-compose up -d docker-compose up -d
  
-echo "Warte auf OpenSearch ..." 
 ok=0 ok=0
 for i in $(seq 1 120); do for i in $(seq 1 120); do
Zeile 245: Zeile 254:
  
 if [[ "${ok}" != "1" ]]; then if [[ "${ok}" != "1" ]]; then
-  echo +  echo "[FEHLER] OpenSearch kam nicht hoch:"
-  echo "[FEHLER] OpenSearch kam nicht hoch. Letzte Logs:"+
   docker-compose logs --tail=200 opensearch || true   docker-compose logs --tail=200 opensearch || true
   exit 1   exit 1
 fi fi
  
-echo "[7/13Basis-Index anlegen"+echo "[8/12Lege Index at_web neu an" 
 +curl -fsS -X DELETE "http://127.0.0.1:9200/at_web" >/dev/null 2>&1 || true
 curl -fsS -X PUT "http://127.0.0.1:9200/at_web" \ curl -fsS -X PUT "http://127.0.0.1:9200/at_web" \
   -H 'Content-Type: application/json' \   -H 'Content-Type: application/json' \
Zeile 261: Zeile 270:
     "mappings": {     "mappings": {
       "properties": {       "properties": {
-        "url":      { "type": "keyword" }, +        "url": {"type": "keyword"}, 
-        "host":     { "type": "keyword" }, +        "host": {"type": "keyword"}, 
-        "site":     { "type": "keyword" }, +        "site": {"type": "keyword"}, 
-        "title":    { "type": "text" }, +        "title": {"type": "text"}, 
-        "content":  { "type": "text" }, +        "content": {"type": "text"}, 
-        "type":     { "type": "keyword" }, +        "digest": {"type": "keyword"}, 
-        "digest":   { "type": "keyword" }, +        "tstamp": {"type": "date", "format": "strict_date_optional_time||epoch_millis"}
-        "segment":  { "type": "keyword" }, +
-        "tstamp":   { "type": "date", "format": "strict_date_optional_time||epoch_millis" }+
       }       }
     }     }
-  }' >/dev/null || true+  }' >/dev/null
  
-echo "[8/13] Apache Nutch herunterladen"+echo "[9/12Lade Apache Nutch"
 cd "${APP_ROOT}/downloads" cd "${APP_ROOT}/downloads"
 if [[ ! -f "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" ]]; then if [[ ! -f "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" ]]; then
Zeile 284: Zeile 291:
 tar -xzf "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" -C "${APP_ROOT}" tar -xzf "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" -C "${APP_ROOT}"
 ln -sfn "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}" "${APP_ROOT}/nutch" ln -sfn "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}" "${APP_ROOT}/nutch"
-chmod +x "${APP_ROOT}/nutch/bin/nutch" 
  
-echo "[9/13] JAVA_HOME setzen" +mkdir -"${APP_ROOT}/nutch/logs" "${APP_ROOT}/crawl"
-JAVA_HOME_PATH="$(readlink -f /usr/bin/java | sed 's:bin/java::')" +
-cat > /etc/profile.d/at-search-java.sh <<EOF +
-export JAVA_HOME=${JAVA_HOME_PATH} +
-EOF +
-chmod 0644 /etc/profile.d/at-search-java.sh +
-export JAVA_HOME="${JAVA_HOME_PATH}"+
  
-echo "[10/13] Nutch konfigurieren"+echo "[10/12Konfiguriere Nutch + Hadoop local paths"
 cat > "${APP_ROOT}/nutch/conf/nutch-site.xml" <<'EOF' cat > "${APP_ROOT}/nutch/conf/nutch-site.xml" <<'EOF'
 <?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
 <configuration> <configuration>
- 
   <property>   <property>
     <name>http.agent.name</name>     <name>http.agent.name</name>
Zeile 311: Zeile 310:
   <property>   <property>
     <name>plugin.includes</name>     <name>plugin.includes</name>
-    <value>protocol-httpclient|urlfilter-regex|parse-(html|tika)|index-(basic|anchor|more|metadata)|indexer-opensearch1x|urlnormalizer-(pass|regex|basic)|scoring-opic</value>+    <value>protocol-httpclient|urlfilter-regex|parse-(html|tika)|index-(basic|anchor|more|metadata)|indexer-opensearch-1x|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
   </property>   </property>
  
Zeile 363: Zeile 362:
     <value>UTF-8</value>     <value>UTF-8</value>
   </property>   </property>
 +</configuration>
 +EOF
  
 +cat > "${APP_ROOT}/nutch/conf/core-site.xml" <<EOF
 +<?xml version="1.0"?>
 +<configuration>
 +  <property>
 +    <name>fs.defaultFS</name>
 +    <value>file:///</value>
 +  </property>
 +  <property>
 +    <name>hadoop.tmp.dir</name>
 +    <value>${APP_ROOT}/hadoop-tmp</value>
 +  </property>
 +</configuration>
 +EOF
 +
 +cat > "${APP_ROOT}/nutch/conf/mapred-site.xml" <<EOF
 +<?xml version="1.0"?>
 +<configuration>
 +  <property>
 +    <name>mapreduce.framework.name</name>
 +    <value>local</value>
 +  </property>
 +  <property>
 +    <name>mapreduce.jobtracker.staging.root.dir</name>
 +    <value>${APP_ROOT}/hadoop-staging</value>
 +  </property>
 +  <property>
 +    <name>mapreduce.job.reduces</name>
 +    <value>1</value>
 +  </property>
 </configuration> </configuration>
 EOF EOF
Zeile 375: Zeile 405:
  
 cat > "${APP_ROOT}/nutch/conf/index-writers.xml" <<'EOF' cat > "${APP_ROOT}/nutch/conf/index-writers.xml" <<'EOF'
 +<?xml version="1.0" encoding="UTF-8"?>
 <?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
 <writers xmlns="http://lucene.apache.org/nutch" <writers xmlns="http://lucene.apache.org/nutch"
Zeile 380: Zeile 411:
          xsi:schemaLocation="http://lucene.apache.org/nutch index-writers.xsd">          xsi:schemaLocation="http://lucene.apache.org/nutch index-writers.xsd">
  
-  <writer id="indexer_opensearch_1x_1"+  <writer id="opensearch"
           class="org.apache.nutch.indexwriter.opensearch1x.OpenSearch1xIndexWriter">           class="org.apache.nutch.indexwriter.opensearch1x.OpenSearch1xIndexWriter">
     <parameters>     <parameters>
-      <param name="host" value="localhost"/>+      <param name="host" value="127.0.0.1"/>
       <param name="port" value="9200"/>       <param name="port" value="9200"/>
       <param name="scheme" value="http"/>       <param name="scheme" value="http"/>
       <param name="index" value="at_web"/>       <param name="index" value="at_web"/>
-      <param name="max.bulk.docs" value="250"/> +      <param name="username" value=""/> 
-      <param name="max.bulk.size" value="2500500"/>+      <param name="password" value=""/> 
 +      <param name="trust.store.path" value=""/> 
 +      <param name="trust.store.password" value=""/> 
 +      <param name="trust.store.type" value="JKS"/> 
 +      <param name="key.store.path" value=""/> 
 +      <param name="key.store.password" value=""/> 
 +      <param name="key.store.type" value="JKS"/> 
 +      <param name="max.bulk.docs" value="100"/> 
 +      <param name="max.bulk.size" value="1048576"/>
       <param name="exponential.backoff.millis" value="100"/>       <param name="exponential.backoff.millis" value="100"/>
-      <param name="exponential.backoff.retries" value="10"/>+      <param name="exponential.backoff.retries" value="5"/>
       <param name="bulk.close.timeout" value="600"/>       <param name="bulk.close.timeout" value="600"/>
     </parameters>     </parameters>
Zeile 404: Zeile 443:
     </mapping>     </mapping>
   </writer>   </writer>
 +
 </writers> </writers>
 EOF EOF
  
-echo "[11/13] Österreich-Seeds schreiben" 
 cat > "${APP_ROOT}/seeds/seed.txt" <<'EOF' cat > "${APP_ROOT}/seeds/seed.txt" <<'EOF'
 https://www.oesterreich.gv.at/ https://www.oesterreich.gv.at/
Zeile 421: Zeile 460:
 EOF EOF
  
-echo "[12/13Helper-Skripte schreiben+echo "[11/12Schreibe Wrapper
-cat > "${APP_ROOT}/bin/run-crawl.sh" <<'EOF'+cat > /usr/local/bin/at-search-crawl <<'EOF'
 #!/usr/bin/env bash #!/usr/bin/env bash
 set -Eeuo pipefail set -Eeuo pipefail
- +export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}"
-export JAVA_HOME="${JAVA_HOME:-$(readlink -f /usr/bin/java | sed 's:bin/java::')}"+
 export NUTCH_HOME="/opt/at-search/nutch" export NUTCH_HOME="/opt/at-search/nutch"
 export NUTCH_CONF_DIR="${NUTCH_HOME}/conf" export NUTCH_CONF_DIR="${NUTCH_HOME}/conf"
- +export HADOOP_CONF_DIR="${NUTCH_HOME}/conf
-ROUNDS="${1:-3}" +export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp
-CRAWL_DIR="/opt/at-search/crawl+mkdir -p /opt/at-search/crawl /opt/at-search/nutch/logs /opt/at-search/hadoop-tmp /opt/at-search/hadoop-staging 
-SEED_DIR="/opt/at-search/seeds" +exec "${NUTCH_HOME}/bin/crawl" -i -s /opt/at-search/seeds /opt/at-search/crawl 2
- +
-mkdir -p "${CRAWL_DIR}" +
- +
-cd "${NUTCH_HOME}" +
-exec "${NUTCH_HOME}/bin/crawl" -i -s "${SEED_DIR}" "${CRAWL_DIR}" "${ROUNDS}"+
 EOF EOF
-chmod +x "${APP_ROOT}/bin/run-crawl.sh" 
  
-cat > "${APP_ROOT}/bin/search.sh" <<'EOF'+cat > /usr/local/bin/at-search-reindex-last <<'EOF'
 #!/usr/bin/env bash #!/usr/bin/env bash
 set -Eeuo pipefail set -Eeuo pipefail
 +export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}"
 +export NUTCH_HOME="/opt/at-search/nutch"
 +export NUTCH_CONF_DIR="${NUTCH_HOME}/conf"
 +export HADOOP_CONF_DIR="${NUTCH_HOME}/conf"
 +export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp"
 +SEG="$(ls -1 /opt/at-search/crawl/segments | sort | tail -n1)"
 +exec "${NUTCH_HOME}/bin/nutch" index /opt/at-search/crawl/crawldb -linkdb /opt/at-search/crawl/linkdb "/opt/at-search/crawl/segments/${SEG}" -deleteGone
 +EOF
  
 +cat > /usr/local/bin/at-search-query <<'EOF'
 +#!/usr/bin/env bash
 +set -Eeuo pipefail
 if [[ $# -lt 1 ]]; then if [[ $# -lt 1 ]]; then
-  echo "Nutzung: $0 \"suchbegriff\""+  echo 'Nutzung: at-search-query "suchbegriff"'
   exit 1   exit 1
 fi fi
- 
 QUERY="$*" QUERY="$*"
 BODY="$(jq -n --arg q "$QUERY" '{ BODY="$(jq -n --arg q "$QUERY" '{
Zeile 461: Zeile 503:
   _source: ["title","url","host","site"]   _source: ["title","url","host","site"]
 }')" }')"
- 
 curl -fsS "http://127.0.0.1:9200/at_web/_search" \ curl -fsS "http://127.0.0.1:9200/at_web/_search" \
   -H 'Content-Type: application/json' \   -H 'Content-Type: application/json' \
   -d "${BODY}" | jq .   -d "${BODY}" | jq .
 EOF EOF
-chmod +x "${APP_ROOT}/bin/search.sh" 
  
-cat > /usr/local/bin/at-search-crawl <<'EOF'+cat > /usr/local/bin/at-search-status <<'EOF'
 #!/usr/bin/env bash #!/usr/bin/env bash
-exec /opt/at-search/bin/run-crawl.sh "$@"+set -Eeuo pipefail 
 +echo "=== OpenSearch ===" 
 +curl -s http://127.0.0.1:9200 | jq . 
 +echo 
 +echo "=== Count ===" 
 +curl -s http://127.0.0.1:9200/at_web/_count | jq . 
 +echo 
 +echo "=== Segments ===" 
 +ls -1 /opt/at-search/crawl/segments 2>/dev/null || true
 EOF EOF
-chmod +x /usr/local/bin/at-search-crawl 
  
-cat > /usr/local/bin/at-search-query <<'EOF' +chmod +x /usr/local/bin/at-search-crawl 
-#!/usr/bin/env bash +chmod +x /usr/local/bin/at-search-reindex-last
-exec /opt/at-search/bin/search.sh "$@" +
-EOF+
 chmod +x /usr/local/bin/at-search-query chmod +x /usr/local/bin/at-search-query
 +chmod +x /usr/local/bin/at-search-status
  
-echo "[13/13Fertig"+echo "[12/12Setze finale Rechte" 
 +chown -R "${REAL_USER}:${REAL_GROUP}" "${APP_ROOT}" 
 +chown -R 1000:1000 "${APP_ROOT}/opensearch/data" 
 +chmod +x "${APP_ROOT}/nutch/bin/nutch" "${APP_ROOT}/nutch/bin/crawl"
  
-cat <<'EOF'+cat <<EOF
  
-==== INSTALLATION FERTIG ====+==== FERTIG ====
  
-Jetzt genau das ausführen:+Jetzt: 
 +  at-search-status 
 +  at-search-crawl 
 +  at-search-status 
 +  at-search-query "wien"
  
-1) +EOF 
-curl http://127.0.0.1:9200+</code>
  
-2) +<code bash> 
-at-search-crawl 2 +curl http://127.0.0.1:9200
- +
-3)+
 curl http://127.0.0.1:9200/at_web/_count | jq curl http://127.0.0.1:9200/at_web/_count | jq
  
-4)+at-search-status 
 +at-search-crawl 2 
 +at-search-status
 at-search-query "wien" at-search-query "wien"
 +at-search-reindex-last
 +</code>
  
-Wichtige Pfade: +<code bash> 
-  /opt/at-search/nutch +cp /opt/at-search/seeds/seed.txt /opt/at-search/seeds/seed.txt.bak 
-  /opt/at-search/seeds/seed.txt +cat > /opt/at-search/seeds/seed.txt <<'EOF' 
-  /opt/at-search/crawl +https://www.univie.ac.at/ 
-  /opt/at-search/opensearch/docker-compose.yml +https://studieren.univie.ac.at/ 
- +https://www.tuwien.at/ 
-Logs+https://www.tuwien.at/studium/
-  cd /opt/at-search/opensearch && docker-compose logs -f+
 EOF EOF
 +at-search-crawl 3
 +mv /opt/at-search/seeds/seed.txt.bak /opt/at-search/seeds/seed.txt
 </code> </code>
 +
 +===v2===
  
 <code bash> <code bash>
-curl http://127.0.0.1:9200 +#!/usr/bin/env bash 
-sudo at-search-crawl 2 +set -Eeuo pipefail 
-curl http://127.0.0.1:9200/at_web/_count | jq + 
-sudo at-search-query "wien"+trap 'echo; echo "[FEHLER] setup.sh abgebrochen in Zeile $LINENO"; exit 1' ERR 
 + 
 +if [[ "${EUID}" -ne 0 ]]; then 
 +  echo "Bitte mit sudo/root ausführen." 
 +  exit 1 
 +fi 
 + 
 +APP_ROOT="/opt/at-search" 
 +NUTCH_VERSION="1.22" 
 +OPENSEARCH_VERSION="1.3.20" 
 +REAL_USER="${SUDO_USER:-root}" 
 +REAL_GROUP="$(id -gn "$REAL_USER" 2>/dev/null || echo root)" 
 + 
 +echo "[1/14] Installiere Pakete" 
 +export DEBIAN_FRONTEND=noninteractive 
 +apt-get update 
 +apt-get install -y \ 
 +  curl \
 +  wget \ 
 +  jq \ 
 +  tar \ 
 +  gzip \ 
 +  unzip \ 
 +  ca-certificates \ 
 +  gnupg \ 
 +  lsb-release \ 
 +  software-properties-common \ 
 +  docker.io \ 
 +  docker-compose \ 
 +  openjdk-11-jdk-headless \ 
 +  python3 \ 
 +  python3-venv \ 
 +  python3-pip 
 + 
 +systemctl enable docker 
 +systemctl restart docker 
 + 
 +echo "[2/14] Setze vm.max_map_count" 
 +if grep -q '^vm.max_map_count=' /etc/sysctl.conf; then 
 +  sed -i 's/^vm.max_map_count=.*/vm.max_map_count=262144/' /etc/sysctl.conf 
 +else 
 +  echo 'vm.max_map_count=262144' >> /etc/sysctl.conf 
 +fi 
 +sysctl -w vm.max_map_count=262144 >/dev/null 
 + 
 +echo "[3/14] Setze JAVA_HOME" 
 +JAVA_HOME_PATH="$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")" 
 +cat > /etc/profile.d/at-search-java.sh <<EOF 
 +export JAVA_HOME=${JAVA_HOME_PATH} 
 +EOF 
 +chmod 0644 /etc/profile.d/at-search-java.sh 
 +export JAVA_HOME="${JAVA_HOME_PATH}" 
 + 
 +echo "[4/14] Lege Verzeichnisse an" 
 +mkdir -p \ 
 +  "${APP_ROOT}/"{opensearch,seeds,crawl,logs,bin,downloads,hadoop-tmp,hadoop-staging,enrich} \ 
 +  "${APP_ROOT}/opensearch/data" 
 + 
 +echo "[5/14] Python-Venv für Enrichment" 
 +python3 -m venv "${APP_ROOT}/venv" 
 +"${APP_ROOT}/venv/bin/pip" install --upgrade pip 
 +"${APP_ROOT}/venv/bin/pip" install requests 
 + 
 +echo "[6/14] Schreibe OpenSearch-Compose" 
 +cat > "${APP_ROOT}/opensearch/docker-compose.yml" <<EOF 
 +version: "3.8" 
 + 
 +services: 
 +  opensearch: 
 +    image: opensearchproject/opensearch:${OPENSEARCH_VERSION} 
 +    container_name: at-opensearch 
 +    restart: unless-stopped 
 +    environment: 
 +      - cluster.name=at-search-cluster 
 +      - node.name=at-opensearch 
 +      - discovery.type=single-node 
 +      - bootstrap.memory_lock=true 
 +      - OPENSEARCH_JAVA_OPTS=-Xms1g -Xmx1g 
 +      - DISABLE_INSTALL_DEMO_CONFIG=true 
 +      - DISABLE_SECURITY_PLUGIN=true 
 +    ulimits: 
 +      memlock: 
 +        soft: -1 
 +        hard: -1 
 +      nofile: 
 +        soft: 65536 
 +        hard: 65536 
 +    volumes: 
 +      - ${APP_ROOT}/opensearch/data:/usr/share/opensearch/data 
 +    ports: 
 +      - "9200:9200" 
 +      - "9600:9600" 
 +EOF 
 + 
 +echo "[7/14] Setze Rechte" 
 +chown -R "${REAL_USER}:${REAL_GROUP}" "${APP_ROOT}" 
 +chown -R 1000:1000 "${APP_ROOT}/opensearch/data" 
 +chmod 775 "${APP_ROOT}/opensearch/data" 
 +chmod 775 "${APP_ROOT}/hadoop-tmp" "${APP_ROOT}/hadoop-staging" 
 + 
 +echo "[8/14] Starte OpenSearch" 
 +cd "${APP_ROOT}/opensearch" 
 +docker-compose up -d 
 + 
 +ok=0 
 +for i in $(seq 1 120); do 
 +  if curl -fsS "http://127.0.0.1:9200" >/dev/null 2>&1; then 
 +    ok=1 
 +    break 
 +  fi 
 +  sleep 2 
 +done 
 + 
 +if [[ "${ok}" != "1" ]]; then 
 +  echo "[FEHLER] OpenSearch kam nicht hoch:" 
 +  docker-compose logs --tail=200 opensearch || true 
 +  exit 1 
 +fi 
 + 
 +echo "[9/14] Lege Index at_web neu an" 
 +curl -fsS -X DELETE "http://127.0.0.1:9200/at_web" >/dev/null 2>&1 || true 
 +curl -fsS -X PUT "http://127.0.0.1:9200/at_web"
 +  -H 'Content-Type: application/json'
 +  -d '{ 
 +    "settings": {
 +      "number_of_shards": 1,
 +      "number_of_replicas": 0
 +    },
 +    "mappings": {
 +      "properties": {
 +        "url": {"type": "keyword"}, 
 +        "host": {"type": "keyword"}, 
 +        "site": {"type": "keyword"}, 
 +        "title": {"type": "text"}, 
 +        "content": {"type": "text"}, 
 +        "digest": {"type": "keyword"}, 
 +        "tstamp": {"type": "date", "format": "strict_date_optional_time||epoch_millis"}, 
 +        "url_depth": {"type": "integer"}, 
 +        "is_root": {"type": "boolean"}, 
 +        "content_length": {"type": "integer"}, 
 +        "domain_class": {"type": "keyword"}, 
 +        "host_rank": {"type": "float"}
 +      } 
 +    } 
 +  }' >/dev/null 
 + 
 +echo "[10/14] Lade Apache Nutch" 
 +cd "${APP_ROOT}/downloads" 
 +if [[ ! -f "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" ]]; then 
 +  wget -O "apache-nutch-${NUTCH_VERSION}-bin.tar.gz"
 +    "https://dlcdn.apache.org/nutch/${NUTCH_VERSION}/apache-nutch-${NUTCH_VERSION}-bin.tar.gz" 
 +fi 
 + 
 +rm -rf "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}" 
 +tar -xzf "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" -C "${APP_ROOT}" 
 +ln -sfn "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}" "${APP_ROOT}/nutch" 
 + 
 +mkdir -p "${APP_ROOT}/nutch/logs" "${APP_ROOT}/crawl" 
 + 
 +echo "[11/14] Konfiguriere Nutch + Hadoop" 
 +cat > "${APP_ROOT}/nutch/conf/nutch-site.xml" <<'EOF' 
 +<?xml version="1.0" encoding="UTF-8"?> 
 +<configuration> 
 +  <property> 
 +    <name>http.agent.name</name> 
 +    <value>ATSearchBot</value> 
 +  </property> 
 + 
 +  <property> 
 +    <name>http.robots.agents</name> 
 +    <value>ATSearchBot,*</value> 
 +  </property> 
 + 
 +  <property> 
 +    <name>plugin.includes</name> 
 +    <value>protocol-httpclient|urlfilter-regex|parse-(html|tika)|index-(basic|anchor|more|metadata)|indexer-opensearch-1x|urlnormalizer-(pass|regex|basic)|scoring-opic</value> 
 +  </property> 
 + 
 +  <property> 
 +    <name>fetcher.threads.fetch</name> 
 +    <value>10</value> 
 +  </property> 
 + 
 +  <property> 
 +    <name>http.content.limit</name> 
 +    <value>1048576</value> 
 +  </property> 
 + 
 +  <property> 
 +    <name>db.ignore.external.links</name> 
 +    <value>false</value> 
 +  </property> 
 + 
 +  <property> 
 +    <name>db.ignore.internal.links</name> 
 +    <value>false</value> 
 +  </property> 
 + 
 +  <property> 
 +    <name>db.ignore.external.links.mode</name> 
 +    <value>byHost</value> 
 +  </property> 
 + 
 +  <property> 
 +    <name>generate.max.per.host</name> 
 +    <value>200</value> 
 +  </property> 
 + 
 +  <property> 
 +    <name>generate.max.per.domain</name> 
 +    <value>1000</value> 
 +  </property> 
 + 
 +  <property> 
 +    <name>generate.count.mode</name> 
 +    <value>byDomain</value> 
 +  </property> 
 + 
 +  <property> 
 +    <name>fetcher.parse</name> 
 +    <value>true</value> 
 +  </property> 
 + 
 +  <property> 
 +    <name>parser.character.encoding.default</name> 
 +    <value>UTF-8</value> 
 +  </property> 
 +</configuration> 
 +EOF 
 + 
 +cat > "${APP_ROOT}/nutch/conf/core-site.xml" <<EOF 
 +<?xml version="1.0"?> 
 +<configuration> 
 +  <property> 
 +    <name>fs.defaultFS</name> 
 +    <value>file:///</value> 
 +  </property> 
 +  <property> 
 +    <name>hadoop.tmp.dir</name> 
 +    <value>${APP_ROOT}/hadoop-tmp</value> 
 +  </property> 
 +</configuration> 
 +EOF 
 + 
 +cat > "${APP_ROOT}/nutch/conf/mapred-site.xml" <<EOF 
 +<?xml version="1.0"?> 
 +<configuration> 
 +  <property> 
 +    <name>mapreduce.framework.name</name> 
 +    <value>local</value> 
 +  </property> 
 +  <property> 
 +    <name>mapreduce.jobtracker.staging.root.dir</name> 
 +    <value>${APP_ROOT}/hadoop-staging</value> 
 +  </property> 
 +  <property> 
 +    <name>mapreduce.job.reduces</name> 
 +    <value>1</value> 
 +  </property> 
 +</configuration> 
 +EOF 
 + 
 +cat > "${APP_ROOT}/nutch/conf/regex-urlfilter.txt" <<'EOF' 
 ++^https?://([a-z0-9-]+\.)*(at|co\.at|gv\.at|ac\.at)(/.*)?
 +-^https?://.+\.(7z|avi|bin|css|doc|docx|gif|gz|ico|iso|jar|jpeg|jpg|js|mov|mp3|mp4|pdf|png|rar|svg|tar|tgz|webp|woff|woff2|xls|xlsx|zip)(\?.*)?
 +-^https?://.*(\?|&)(utm_|fbclid=|gclid=|reply=|share=|sort=|filter=|session=).*$ 
 +-^https?://.*/(tag|tags|category|categories|thema|themen|archiv|archive|suche|search|index|author|feed|rss)(/.*)?
 +-^https?://.*/(page|seite)/[0-9]+/?
 +-. 
 +EOF 
 + 
 +cat > "${APP_ROOT}/nutch/conf/index-writers.xml" <<'EOF' 
 +<?xml version="1.0" encoding="UTF-8"?> 
 +<writers xmlns="http://lucene.apache.org/nutch" 
 +         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
 +         xsi:schemaLocation="http://lucene.apache.org/nutch index-writers.xsd"> 
 + 
 +  <writer id="opensearch" 
 +          class="org.apache.nutch.indexwriter.opensearch1x.OpenSearch1xIndexWriter"> 
 +    <parameters> 
 +      <param name="host" value="127.0.0.1"/> 
 +      <param name="port" value="9200"/> 
 +      <param name="scheme" value="http"/> 
 +      <param name="index" value="at_web"/> 
 +      <param name="username" value=""/> 
 +      <param name="password" value=""/> 
 +      <param name="trust.store.path" value=""/> 
 +      <param name="trust.store.password" value=""/> 
 +      <param name="trust.store.type" value="JKS"/> 
 +      <param name="key.store.path" value=""/> 
 +      <param name="key.store.password" value=""/> 
 +      <param name="key.store.type" value="JKS"/> 
 +      <param name="max.bulk.docs" value="100"/> 
 +      <param name="max.bulk.size" value="1048576"/> 
 +      <param name="exponential.backoff.millis" value="100"/> 
 +      <param name="exponential.backoff.retries" value="5"/> 
 +      <param name="bulk.close.timeout" value="600"/> 
 +    </parameters> 
 + 
 +    <mapping> 
 +      <copy> 
 +        <field source="title" dest="title,search"/> 
 +      </copy> 
 +      <rename /> 
 +      <remove> 
 +        <field source="segment"/> 
 +      </remove> 
 +    </mapping> 
 +  </writer> 
 + 
 +</writers> 
 +EOF 
 + 
 +cat > "${APP_ROOT}/seeds/seed.txt" <<'EOF' 
 +https://www.derstandard.at/ 
 +https://orf.at/ 
 +https://www.parlament.gv.at/ 
 +https://www.tuwien.at/ 
 +https://www.univie.ac.at/ 
 +https://www.wko.at/ 
 +https://www.arbeiterkammer.at/ 
 +https://www.ams.at/ 
 +https://www.wien.gv.at/ 
 +https://www.oesterreich.gv.at/ 
 +EOF 
 + 
 +echo "[12/14] Schreibe Enrichment-Script" 
 +cat > "${APP_ROOT}/enrich/enrich_index.py" <<'EOF' 
 +#!/usr/bin/env python3 
 +import json 
 +from urllib.parse import urlparse 
 +import requests 
 + 
 +OS_URL = "http://127.0.0.1:9200" 
 +INDEX = "at_web" 
 +BATCH = 200 
 + 
 +def domain_class(host: str) -> str: 
 +    host = (host or "").lower() 
 +    if host.endswith(".gv.at") or host == "www.parlament.gv.at": 
 +        return "gov" 
 +    if host.endswith(".ac.at"): 
 +        return "edu" 
 +    if "orf.at" in host or "derstandard.at" in host: 
 +        return "news" 
 +    if "wko.at" in host: 
 +        return "commerce" 
 +    return "general" 
 + 
 +def host_rank(host: str) -> float: 
 +    host = (host or "").lower() 
 +    if host.endswith(".gv.at") or host == "www.parlament.gv.at": 
 +        return 3.0 
 +    if host.endswith(".ac.at"): 
 +        return 2.5 
 +    if host == "www.orf.at": 
 +        return 2.0 
 +    if host == "www.derstandard.at": 
 +        return 1.8 
 +    if host == "www.wko.at": 
 +        return 1.4 
 +    return 1.0 
 + 
 +def compute_fields(src: dict) -> dict: 
 +    url = src.get("url", "") or "" 
 +    content = src.get("content", "") or "" 
 +    parsed = urlparse(url) 
 +    path = parsed.path or "/" 
 +    stripped = path.strip("/")
 +    depth = 0 if stripped == "" else len([p for p in stripped.split("/") if p]) 
 +    is_root = stripped == "" 
 +    return { 
 +        "url_depth": depth, 
 +        "is_root": is_root, 
 +        "content_length": len(content), 
 +        "domain_class": domain_class(src.get("host", "")), 
 +        "host_rank": host_rank(src.get("host", "")), 
 +    } 
 + 
 +def main(): 
 +    session = requests.Session() 
 +    search_after = None 
 +    total = 0 
 + 
 +    while True: 
 +        body = { 
 +            "size": BATCH, 
 +            "sort": [{"_id": "asc"}], 
 +            "_source": ["url", "host", "content"], 
 +            "query": {"match_all": {}} 
 +        } 
 +        if search_after is not None: 
 +            body["search_after"] = search_after 
 + 
 +        r = session.post(f"{OS_URL}/{INDEX}/_search", json=body, timeout=30) 
 +        r.raise_for_status() 
 +        data = r.json() 
 +        hits = data.get("hits", {}).get("hits", []) 
 +        if not hits: 
 +            break 
 + 
 +        lines = [] 
 +        for hit in hits: 
 +            doc_id = hit["_id"]
 +            src = hit.get("_source", {}) 
 +            fields = compute_fields(src) 
 +            meta = {"update": {"_index": INDEX, "_id": doc_id}} 
 +            doc = {"doc": fields} 
 +            lines.append(json.dumps(meta)) 
 +            lines.append(json.dumps(doc)) 
 +            total += 1 
 + 
 +        payload = "\n".join(lines) + "\n" 
 +        br = session.post( 
 +            f"{OS_URL}/_bulk", 
 +            data=payload, 
 +            headers={"Content-Type": "application/x-ndjson"}, 
 +            timeout=60, 
 +        ) 
 +        br.raise_for_status() 
 + 
 +        search_after = hits[-1]["sort"]
 + 
 +    print(f"Enriched documents: {total}")
 + 
 +if __name__ == "__main__": 
 +    main() 
 +EOF 
 +chmod +x "${APP_ROOT}/enrich/enrich_index.py" 
 + 
 +echo "[13/14] Schreibe Wrapper" 
 +cat > /usr/local/bin/at-search-crawl <<'EOF' 
 +#!/usr/bin/env bash 
 +set -Eeuo pipefail 
 +export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}" 
 +export NUTCH_HOME="/opt/at-search/nutch" 
 +export NUTCH_CONF_DIR="${NUTCH_HOME}/conf" 
 +export HADOOP_CONF_DIR="${NUTCH_HOME}/conf" 
 +export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp" 
 + 
 +ROUNDS="${1:-5}" 
 + 
 +mkdir -p /opt/at-search/crawl /opt/at-search/nutch/logs /opt/at-search/hadoop-tmp /opt/at-search/hadoop-staging 
 + 
 +"${NUTCH_HOME}/bin/crawl" -i -s /opt/at-search/seeds /opt/at-search/crawl "${ROUNDS}" 
 +/opt/at-search/venv/bin/python /opt/at-search/enrich/enrich_index.py 
 +EOF 
 + 
 +cat > /usr/local/bin/at-search-reindex-last <<'EOF' 
 +#!/usr/bin/env bash 
 +set -Eeuo pipefail 
 +export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}" 
 +export NUTCH_HOME="/opt/at-search/nutch" 
 +export NUTCH_CONF_DIR="${NUTCH_HOME}/conf" 
 +export HADOOP_CONF_DIR="${NUTCH_HOME}/conf" 
 +export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp" 
 + 
 +SEG="$(ls -1 /opt/at-search/crawl/segments | sort | tail -n1)" 
 +"${NUTCH_HOME}/bin/nutch" index /opt/at-search/crawl/crawldb -linkdb /opt/at-search/crawl/linkdb "/opt/at-search/crawl/segments/${SEG}" -deleteGone 
 +/opt/at-search/venv/bin/python /opt/at-search/enrich/enrich_index.py 
 +EOF 
 + 
 +cat > /usr/local/bin/at-search-enrich <<'EOF' 
 +#!/usr/bin/env bash 
 +set -Eeuo pipefail 
 +exec /opt/at-search/venv/bin/python /opt/at-search/enrich/enrich_index.py 
 +EOF 
 + 
 +cat > /usr/local/bin/at-search-reset-index <<'EOF' 
 +#!/usr/bin/env bash 
 +set -Eeuo pipefail 
 +curl -s -X DELETE http://127.0.0.1:9200/at_web >/dev/null 2>&1 || true 
 +curl -s -X PUT http://127.0.0.1:9200/at_web -H 'Content-Type: application/json' -d '{ 
 +  "settings": {
 +    "number_of_shards": 1,
 +    "number_of_replicas": 0
 +  },
 +  "mappings": {
 +    "properties": {
 +      "url": {"type": "keyword"}, 
 +      "host": {"type": "keyword"}, 
 +      "site": {"type": "keyword"}, 
 +      "title": {"type": "text"}, 
 +      "content": {"type": "text"}, 
 +      "digest": {"type": "keyword"}, 
 +      "tstamp": {"type": "date", "format": "strict_date_optional_time||epoch_millis"}, 
 +      "url_depth": {"type": "integer"}, 
 +      "is_root": {"type": "boolean"}, 
 +      "content_length": {"type": "integer"}, 
 +      "domain_class": {"type": "keyword"}, 
 +      "host_rank": {"type": "float"}
 +    } 
 +  } 
 +}' | jq . 
 +EOF 
 + 
 +cat > /usr/local/bin/at-search-query <<'EOF' 
 +#!/usr/bin/env bash 
 +set -Eeuo pipefail 
 +if [[ $# -lt 1 ]]; then 
 +  echo 'Nutzung: at-search-query "suchbegriff"' 
 +  exit 1 
 +fi 
 +QUERY="$*" 
 +BODY="$(jq -n --arg q "$QUERY" '{ 
 +  size: 10, 
 +  query: { 
 +    function_score: {
 +      query: { 
 +        bool: { 
 +          must: [ 
 +            { 
 +              multi_match: {
 +                query: $q, 
 +                fields: ["title^8","content^2","site","host"], 
 +                type: "best_fields" 
 +              } 
 +            } 
 +          ], 
 +          must_not: [ 
 +            { term: { is_root: true } }, 
 +            { regexp: { url: ".*/(tag|tags|category|categories|thema|themen|archiv|archive|suche|search|index|author|feed|rss)(/.*)?" } }, 
 +            { regexp: { url: ".*/(page|seite)/[0-9]+/?" } } 
 +          ], 
 +          should: [ 
 +            { match_phrase: { title: { query: $q, boost: 10 } } } 
 +          ] 
 +        } 
 +      }, 
 +      functions: [ 
 +        { filter: { term: { domain_class: "gov" } }, weight: 3.0 }, 
 +        { filter: { term: { domain_class: "edu" } }, weight: 2.5 }, 
 +        { filter: { term: { domain_class: "news" } }, weight: 1.5 }, 
 +        { field_value_factor: { field: "host_rank", factor: 1.0, missing: 1.0 } }, 
 +        { field_value_factor: { field: "url_depth", factor: 0.25, missing: 0 } }, 
 +        { field_value_factor: { field: "content_length", factor: 0.0005, modifier: "log1p", missing: 1 } } 
 +      ], 
 +      score_mode: "sum", 
 +      boost_mode: "sum" 
 +    } 
 +  }, 
 +  _source: ["title","url","host","site","url_depth","is_root","domain_class","host_rank"]
 +}')" 
 +curl -fsS "http://127.0.0.1:9200/at_web/_search"
 +  -H 'Content-Type: application/json'
 +  -d "${BODY}" | jq . 
 +EOF 
 + 
 +cat > /usr/local/bin/at-search-status <<'EOF' 
 +#!/usr/bin/env bash 
 +set -Eeuo pipefail 
 +echo "=== OpenSearch ===" 
 +curl -s http://127.0.0.1:9200 | jq . 
 +echo 
 +echo "=== Count ===" 
 +curl -s http://127.0.0.1:9200/at_web/_count | jq . 
 +echo 
 +echo "=== Segments ===" 
 +ls -1 /opt/at-search/crawl/segments 2>/dev/null || true 
 +EOF 
 + 
 +chmod +x /usr/local/bin/at-search-crawl 
 +chmod +x /usr/local/bin/at-search-reindex-last 
 +chmod +x /usr/local/bin/at-search-query 
 +chmod +x /usr/local/bin/at-search-status 
 +chmod +x /usr/local/bin/at-search-enrich 
 +chmod +x /usr/local/bin/at-search-reset-index 
 + 
 +echo "[14/14] Setze finale Rechte" 
 +chown -R "${REAL_USER}:${REAL_GROUP}" "${APP_ROOT}" 
 +chown -R 1000:1000 "${APP_ROOT}/opensearch/data" 
 +chmod +x "${APP_ROOT}/nutch/bin/nutch" "${APP_ROOT}/nutch/bin/crawl" 
 + 
 +cat <<EOF 
 + 
 +==== FERTIG ==== 
 + 
 +Jetzt: 
 +  at-search-status 
 +  at-search-crawl 5 
 +  at-search-status 
 +  at-search-query "wien" 
 + 
 +EOF
 </code> </code>
opensearch.1775987588.txt.gz · Zuletzt geändert: 2026/04/12 11:53 von jango