<code python>
import requests
from urllib.parse import urljoin, urlparse, urldefrag, parse_qs
from bs4 import BeautifulSoup

# Lists of unwanted query parameters and file extensions
# (example values; the concrete entries depend on the site being crawled)
blocked_query_params = ['do', 'rev', 'sort', 'sessionid']
blocked_extensions = ['.pdf', '.jpg', '.png', '.gif', '.zip']

def is_blocked_url(url):
    """
    Check whether a URL contains unwanted query parameters or file extensions.
    """
    # Parse the URL
    parsed_url = urlparse(url)

    # Check for unwanted query parameters
    query_params = parse_qs(parsed_url.query)
    for param in query_params:
        if param in blocked_query_params:
            return True

    # Check for unwanted file extensions
    if any(url.endswith(ext) for ext in blocked_extensions):
        return True

    return False

def crawl(start_url, base_url):
    """
    Crawl the given URL, extract links and process them.
    """
    visited_urls = set()
    queue = [start_url]

    while queue:
        current_url = queue.pop(0)

        # Parse the URL and strip any fragment
        full_url, _ = urldefrag(current_url)

        # Skip URLs that were already visited
        if full_url in visited_urls:
            continue

        # Filter out unwanted URLs
        if is_blocked_url(full_url):
            continue

        # Mark the URL as visited
        visited_urls.add(full_url)

        print(f"Crawling: {full_url}")

        # Fetch the HTML page
        try:
            response = requests.get(full_url)
            if response.status_code != 200:
                print(f"Error fetching {full_url}: status code {response.status_code}")
                continue

            page_content = response.text
            soup = BeautifulSoup(page_content, 'html.parser')

            # Find all links on the page
            a_links = soup.find_all('a')

            # Process the links
            for link in a_links:
                href = link.get('href')
                if href:
                    # Build an absolute URL
                    next_url = urljoin(base_url, href)
                    if next_url not in visited_urls:
                        queue.append(next_url)

        except requests.RequestException as e:
            print(f"Error fetching {full_url}: {e}")

    print(f"Crawling finished, {len(visited_urls)} pages visited.")

# Start URL and base URL for the crawl
start_url = 'https://example.com/'  # placeholder; replace with the site to crawl
base_url = start_url

# Crawl the site
crawl(start_url, base_url)
</code>
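
A quick way to sanity-check the filter is to call ''is_blocked_url'' directly with a few sample URLs. The URLs below are made-up examples, and the results assume the example entries in ''blocked_query_params'' and ''blocked_extensions'' shown above; run this after the definitions from the crawler.

<code python>
# Minimal check of the filter logic (run after the definitions above).
# The URLs are made-up examples; results depend on the example block lists.
print(is_blocked_url('https://example.com/page?do=edit'))        # True: 'do' is a blocked query parameter
print(is_blocked_url('https://example.com/files/handbook.pdf'))  # True: '.pdf' is a blocked extension
print(is_blocked_url('https://example.com/start'))               # False: nothing blocked
</code>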