Hier werden die Unterschiede zwischen zwei Versionen angezeigt.
| Beide Seiten der vorigen Revision · Vorhergehende Überarbeitung | |||
|
opensearch [2026/04/12 14:16] jango [Production] |
opensearch [2026/04/12 15:27] (aktuell) jango [v2] |
||
|---|---|---|---|
| Zeile 568: | Zeile 568: | ||
| ===v2=== | ===v2=== | ||
| + | |||
| + | <code bash> | ||
| + | # | ||
| + | set -Eeuo pipefail | ||
| + | |||
| + | trap 'echo; echo " | ||
| + | |||
| + | if [[ " | ||
| + | echo "Bitte mit sudo/root ausführen." | ||
| + | exit 1 | ||
| + | fi | ||
| + | |||
| + | APP_ROOT="/ | ||
| + | NUTCH_VERSION=" | ||
| + | OPENSEARCH_VERSION=" | ||
| + | REAL_USER=" | ||
| + | REAL_GROUP=" | ||
| + | |||
| + | echo " | ||
| + | export DEBIAN_FRONTEND=noninteractive | ||
| + | apt-get update | ||
| + | apt-get install -y \ | ||
| + | curl \ | ||
| + | wget \ | ||
| + | jq \ | ||
| + | tar \ | ||
| + | gzip \ | ||
| + | unzip \ | ||
| + | ca-certificates \ | ||
| + | gnupg \ | ||
| + | lsb-release \ | ||
| + | software-properties-common \ | ||
| + | docker.io \ | ||
| + | docker-compose \ | ||
| + | openjdk-11-jdk-headless \ | ||
| + | python3 \ | ||
| + | python3-venv \ | ||
| + | python3-pip | ||
| + | |||
| + | systemctl enable docker | ||
| + | systemctl restart docker | ||
| + | |||
| + | echo " | ||
| + | if grep -q ' | ||
| + | sed -i ' | ||
| + | else | ||
| + | echo ' | ||
| + | fi | ||
| + | sysctl -w vm.max_map_count=262144 >/ | ||
| + | |||
| + | echo " | ||
| + | JAVA_HOME_PATH=" | ||
| + | cat > / | ||
| + | export JAVA_HOME=${JAVA_HOME_PATH} | ||
| + | EOF | ||
| + | chmod 0644 / | ||
| + | export JAVA_HOME=" | ||
| + | |||
| + | echo " | ||
| + | mkdir -p \ | ||
| + | " | ||
| + | " | ||
| + | |||
| + | echo " | ||
| + | python3 -m venv " | ||
| + | " | ||
| + | " | ||
| + | |||
| + | echo " | ||
| + | cat > " | ||
| + | version: " | ||
| + | |||
| + | services: | ||
| + | opensearch: | ||
| + | image: opensearchproject/ | ||
| + | container_name: | ||
| + | restart: unless-stopped | ||
| + | environment: | ||
| + | - cluster.name=at-search-cluster | ||
| + | - node.name=at-opensearch | ||
| + | - discovery.type=single-node | ||
| + | - bootstrap.memory_lock=true | ||
| + | - OPENSEARCH_JAVA_OPTS=-Xms1g -Xmx1g | ||
| + | - DISABLE_INSTALL_DEMO_CONFIG=true | ||
| + | - DISABLE_SECURITY_PLUGIN=true | ||
| + | ulimits: | ||
| + | memlock: | ||
| + | soft: -1 | ||
| + | hard: -1 | ||
| + | nofile: | ||
| + | soft: 65536 | ||
| + | hard: 65536 | ||
| + | volumes: | ||
| + | - ${APP_ROOT}/ | ||
| + | ports: | ||
| + | - " | ||
| + | - " | ||
| + | EOF | ||
| + | |||
| + | echo " | ||
| + | chown -R " | ||
| + | chown -R 1000:1000 " | ||
| + | chmod 775 " | ||
| + | chmod 775 " | ||
| + | |||
| + | echo " | ||
| + | cd " | ||
| + | docker-compose up -d | ||
| + | |||
| + | ok=0 | ||
| + | for i in $(seq 1 120); do | ||
| + | if curl -fsS " | ||
| + | ok=1 | ||
| + | break | ||
| + | fi | ||
| + | sleep 2 | ||
| + | done | ||
| + | |||
| + | if [[ " | ||
| + | echo " | ||
| + | docker-compose logs --tail=200 opensearch || true | ||
| + | exit 1 | ||
| + | fi | ||
| + | |||
| + | echo " | ||
| + | curl -fsS -X DELETE " | ||
| + | curl -fsS -X PUT " | ||
| + | -H ' | ||
| + | -d '{ | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | }, | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | } | ||
| + | } | ||
| + | }' >/ | ||
| + | |||
| + | echo " | ||
| + | cd " | ||
| + | if [[ ! -f " | ||
| + | wget -O " | ||
| + | " | ||
| + | fi | ||
| + | |||
| + | rm -rf " | ||
| + | tar -xzf " | ||
| + | ln -sfn " | ||
| + | |||
| + | mkdir -p " | ||
| + | |||
| + | echo " | ||
| + | cat > " | ||
| + | <?xml version=" | ||
| + | < | ||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | </ | ||
| + | EOF | ||
| + | |||
| + | cat > " | ||
| + | <?xml version=" | ||
| + | < | ||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | </ | ||
| + | EOF | ||
| + | |||
| + | cat > " | ||
| + | <?xml version=" | ||
| + | < | ||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | < | ||
| + | < | ||
| + | < | ||
| + | </ | ||
| + | </ | ||
| + | EOF | ||
| + | |||
| + | cat > " | ||
| + | +^https?:// | ||
| + | -^https?:// | ||
| + | -^https?:// | ||
| + | -^https?:// | ||
| + | -^https?:// | ||
| + | -. | ||
| + | EOF | ||
| + | |||
| + | cat > " | ||
| + | <?xml version=" | ||
| + | <writers xmlns=" | ||
| + | | ||
| + | | ||
| + | |||
| + | <writer id=" | ||
| + | class=" | ||
| + | < | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | <param name=" | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | < | ||
| + | <field source=" | ||
| + | </ | ||
| + | <rename /> | ||
| + | < | ||
| + | <field source=" | ||
| + | </ | ||
| + | </ | ||
| + | </ | ||
| + | |||
| + | </ | ||
| + | EOF | ||
| + | |||
| + | cat > " | ||
| + | https:// | ||
| + | https:// | ||
| + | https:// | ||
| + | https:// | ||
| + | https:// | ||
| + | https:// | ||
| + | https:// | ||
| + | https:// | ||
| + | https:// | ||
| + | https:// | ||
| + | EOF | ||
| + | |||
| + | echo " | ||
| + | cat > " | ||
| + | # | ||
| + | import json | ||
| + | from urllib.parse import urlparse | ||
| + | import requests | ||
| + | |||
| + | OS_URL = " | ||
| + | INDEX = " | ||
| + | BATCH = 200 | ||
| + | |||
| + | def domain_class(host: | ||
| + | host = (host or "" | ||
| + | if host.endswith(" | ||
| + | return " | ||
| + | if host.endswith(" | ||
| + | return " | ||
| + | if " | ||
| + | return " | ||
| + | if " | ||
| + | return " | ||
| + | return " | ||
| + | |||
| + | def host_rank(host: | ||
| + | host = (host or "" | ||
| + | if host.endswith(" | ||
| + | return 3.0 | ||
| + | if host.endswith(" | ||
| + | return 2.5 | ||
| + | if host == " | ||
| + | return 2.0 | ||
| + | if host == " | ||
| + | return 1.8 | ||
| + | if host == " | ||
| + | return 1.4 | ||
| + | return 1.0 | ||
| + | |||
| + | def compute_fields(src: | ||
| + | url = src.get(" | ||
| + | content = src.get(" | ||
| + | parsed = urlparse(url) | ||
| + | path = parsed.path or "/" | ||
| + | stripped = path.strip("/" | ||
| + | depth = 0 if stripped == "" | ||
| + | is_root = stripped == "" | ||
| + | return { | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | } | ||
| + | |||
| + | def main(): | ||
| + | session = requests.Session() | ||
| + | search_after = None | ||
| + | total = 0 | ||
| + | |||
| + | while True: | ||
| + | body = { | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | } | ||
| + | if search_after is not None: | ||
| + | body[" | ||
| + | |||
| + | r = session.post(f" | ||
| + | r.raise_for_status() | ||
| + | data = r.json() | ||
| + | hits = data.get(" | ||
| + | if not hits: | ||
| + | break | ||
| + | |||
| + | lines = [] | ||
| + | for hit in hits: | ||
| + | doc_id = hit[" | ||
| + | src = hit.get(" | ||
| + | fields = compute_fields(src) | ||
| + | meta = {" | ||
| + | doc = {" | ||
| + | lines.append(json.dumps(meta)) | ||
| + | lines.append(json.dumps(doc)) | ||
| + | total += 1 | ||
| + | |||
| + | payload = " | ||
| + | br = session.post( | ||
| + | f" | ||
| + | data=payload, | ||
| + | headers={" | ||
| + | timeout=60, | ||
| + | ) | ||
| + | br.raise_for_status() | ||
| + | |||
| + | search_after = hits[-1][" | ||
| + | |||
| + | print(f" | ||
| + | |||
| + | if __name__ == " | ||
| + | main() | ||
| + | EOF | ||
| + | chmod +x " | ||
| + | |||
| + | echo " | ||
| + | cat > / | ||
| + | # | ||
| + | set -Eeuo pipefail | ||
| + | export JAVA_HOME=" | ||
| + | export NUTCH_HOME="/ | ||
| + | export NUTCH_CONF_DIR=" | ||
| + | export HADOOP_CONF_DIR=" | ||
| + | export HADOOP_OPTS=" | ||
| + | |||
| + | ROUNDS=" | ||
| + | |||
| + | mkdir -p / | ||
| + | |||
| + | " | ||
| + | / | ||
| + | EOF | ||
| + | |||
| + | cat > / | ||
| + | # | ||
| + | set -Eeuo pipefail | ||
| + | export JAVA_HOME=" | ||
| + | export NUTCH_HOME="/ | ||
| + | export NUTCH_CONF_DIR=" | ||
| + | export HADOOP_CONF_DIR=" | ||
| + | export HADOOP_OPTS=" | ||
| + | |||
| + | SEG=" | ||
| + | " | ||
| + | / | ||
| + | EOF | ||
| + | |||
| + | cat > / | ||
| + | # | ||
| + | set -Eeuo pipefail | ||
| + | exec / | ||
| + | EOF | ||
| + | |||
| + | cat > / | ||
| + | # | ||
| + | set -Eeuo pipefail | ||
| + | curl -s -X DELETE http:// | ||
| + | curl -s -X PUT http:// | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | }, | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | } | ||
| + | } | ||
| + | }' | jq . | ||
| + | EOF | ||
| + | |||
| + | cat > / | ||
| + | # | ||
| + | set -Eeuo pipefail | ||
| + | if [[ $# -lt 1 ]]; then | ||
| + | echo ' | ||
| + | exit 1 | ||
| + | fi | ||
| + | QUERY=" | ||
| + | BODY=" | ||
| + | size: 10, | ||
| + | query: { | ||
| + | function_score: | ||
| + | query: { | ||
| + | bool: { | ||
| + | must: [ | ||
| + | { | ||
| + | multi_match: | ||
| + | query: $q, | ||
| + | fields: [" | ||
| + | type: " | ||
| + | } | ||
| + | } | ||
| + | ], | ||
| + | must_not: [ | ||
| + | { term: { is_root: true } }, | ||
| + | { regexp: { url: " | ||
| + | { regexp: { url: " | ||
| + | ], | ||
| + | should: [ | ||
| + | { match_phrase: | ||
| + | ] | ||
| + | } | ||
| + | }, | ||
| + | functions: [ | ||
| + | { filter: { term: { domain_class: | ||
| + | { filter: { term: { domain_class: | ||
| + | { filter: { term: { domain_class: | ||
| + | { field_value_factor: | ||
| + | { field_value_factor: | ||
| + | { field_value_factor: | ||
| + | ], | ||
| + | score_mode: " | ||
| + | boost_mode: " | ||
| + | } | ||
| + | }, | ||
| + | _source: [" | ||
| + | }' | ||
| + | curl -fsS " | ||
| + | -H ' | ||
| + | -d " | ||
| + | EOF | ||
| + | |||
| + | cat > / | ||
| + | # | ||
| + | set -Eeuo pipefail | ||
| + | echo "=== OpenSearch ===" | ||
| + | curl -s http:// | ||
| + | echo | ||
| + | echo "=== Count ===" | ||
| + | curl -s http:// | ||
| + | echo | ||
| + | echo "=== Segments ===" | ||
| + | ls -1 / | ||
| + | EOF | ||
| + | |||
| + | chmod +x / | ||
| + | chmod +x / | ||
| + | chmod +x / | ||
| + | chmod +x / | ||
| + | chmod +x / | ||
| + | chmod +x / | ||
| + | |||
| + | echo " | ||
| + | chown -R " | ||
| + | chown -R 1000:1000 " | ||
| + | chmod +x " | ||
| + | |||
| + | cat <<EOF | ||
| + | |||
| + | ==== FERTIG ==== | ||
| + | |||
| + | Jetzt: | ||
| + | at-search-status | ||
| + | at-search-crawl 5 | ||
| + | at-search-status | ||
| + | at-search-query " | ||
| + | |||
| + | EOF | ||
| + | </code> | ||