"""Crawl a sitemap (and any nested sitemap indexes) and print every page URL found."""

import time
from collections import deque
from xml.etree import ElementTree

import requests


def fetch_sitemap(url, timeout=10.0):
    """Download a sitemap and return its raw bytes.

    Raises requests.HTTPError on a non-2xx response instead of silently
    handing an HTML error page to the XML parser, and bounds the request
    with a timeout so the crawl cannot hang forever.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content


def parse_sitemap(content):
    """Extract every <loc> URL from a sitemap or sitemap-index document.

    Works for both namespaced and namespace-free documents: the namespace
    URI is taken from the root tag only when one is actually present.
    Returns a list of URL strings.
    """
    root = ElementTree.fromstring(content)
    # A namespaced root tag looks like "{http://...}urlset".
    if root.tag.startswith('{'):
        ns = {'sm': root.tag.split('}')[0].strip('{')}
        prefix = 'sm:'
    else:
        ns = {}
        prefix = ''
    urls = []
    # A sitemap index holds <sitemap> entries; a regular sitemap holds <url>.
    entries = root.findall(f'.//{prefix}sitemap', ns) + root.findall(f'.//{prefix}url', ns)
    for entry in entries:
        loc = entry.find(f'{prefix}loc', ns)
        # Guard against both a missing <loc> and an empty <loc/> (text is None).
        if loc is not None and loc.text:
            urls.append(loc.text.strip())
    return urls


def get_all_urls(start_url):
    """Breadth-first crawl from start_url, following nested .xml sitemaps.

    Returns the list of all non-sitemap URLs discovered. Sleeps one second
    between fetches to avoid hammering the server.
    """
    to_crawl = deque([start_url])  # deque: O(1) popleft vs list.pop(0)'s O(n)
    crawled = set()
    all_urls = []
    while to_crawl:
        url = to_crawl.popleft()
        if url in crawled:
            continue
        print(f"Fetching: {url}")
        content = fetch_sitemap(url)
        for u in parse_sitemap(content):
            if u.endswith('.xml'):
                # Nested sitemap: queue it unless already fetched or queued.
                if u not in crawled and u not in to_crawl:
                    to_crawl.append(u)
            else:
                all_urls.append(u)
        crawled.add(url)
        time.sleep(1)  # be polite to the server: wait 1 second between requests
    return all_urls


if __name__ == "__main__":
    # Entry-point sitemap; nested sitemap indexes are followed automatically.
    start_url = "http://blog.livedoor.jp/hogehoge/sitemap.xml"

    all_urls = get_all_urls(start_url)

    print(f"Total URLs found: {len(all_urls)}")
    for url in all_urls:
        print(url)