import time
from collections import deque
from xml.etree import ElementTree

import requests
def fetch_sitemap(url, timeout=10):
    """Download a sitemap document and return the raw response body as bytes.

    Args:
        url: Absolute URL of the sitemap (or sitemap index) to fetch.
        timeout: Seconds to wait for the server before giving up; without
            this, requests.get() can block indefinitely.

    Returns:
        The raw response body (bytes), suitable for ElementTree.fromstring.

    Raises:
        requests.HTTPError: On a non-2xx status, so the crawler does not
            try to parse an error page as sitemap XML.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content
def parse_sitemap(content):
    """Extract every <loc> URL from a sitemap or sitemap-index document.

    Handles both namespaced (the usual sitemaps.org schema) and plain,
    namespace-less XML: the original code derived the namespace from the
    root tag unconditionally, which produced a bogus mapping (and zero
    matches) when the document had no namespace at all.

    Args:
        content: Sitemap XML as bytes or str.

    Returns:
        List of URL strings, whitespace-stripped, in document order
        (<sitemap> entries first, then <url> entries).
    """
    root = ElementTree.fromstring(content)
    if root.tag.startswith('{'):
        # Namespaced document: root.tag looks like '{uri}urlset'.
        ns = {'sm': root.tag[1:].split('}')[0]}
        sitemap_path, url_path, loc_path = './/sm:sitemap', './/sm:url', 'sm:loc'
    else:
        # No namespace: query with bare tag names.
        ns = {}
        sitemap_path, url_path, loc_path = './/sitemap', './/url', 'loc'
    urls = []
    # A sitemap index holds <sitemap> children; a regular sitemap holds <url>.
    for entry in root.findall(sitemap_path, ns) + root.findall(url_path, ns):
        loc = entry.find(loc_path, ns)
        # Skip malformed entries with a missing or empty <loc>; strip the
        # surrounding whitespace that real-world sitemaps often contain.
        if loc is not None and loc.text:
            urls.append(loc.text.strip())
    return urls
def get_all_urls(start_url):
    """Breadth-first crawl of nested sitemaps, collecting every page URL.

    Links ending in '.xml' are treated as nested sitemaps and queued for
    crawling; everything else is collected as a page URL. Visit order and
    output order are identical to the original implementation, but the
    queue is a deque (O(1) popleft vs list.pop(0)'s O(n)) and dedup uses
    a set instead of an O(n) scan of the pending list per candidate.

    Args:
        start_url: URL of the root sitemap (or sitemap index).

    Returns:
        List of all non-sitemap URLs discovered, in discovery order.
    """
    to_crawl = deque([start_url])
    queued = {start_url}  # every sitemap URL ever enqueued; O(1) dedup
    crawled = set()
    all_urls = []
    while to_crawl:
        url = to_crawl.popleft()
        if url in crawled:
            continue
        print(f"Fetching: {url}")
        content = fetch_sitemap(url)
        for u in parse_sitemap(content):
            if u.endswith('.xml'):
                # Nested sitemap: schedule it exactly once.
                if u not in queued:
                    queued.add(u)
                    to_crawl.append(u)
            else:
                all_urls.append(u)
        crawled.add(url)
        time.sleep(1)  # be polite: pause 1s between requests
    return all_urls
if __name__ == "__main__":
    # Guard so the crawl only runs when executed as a script,
    # not when this module is imported.
    # Root sitemap URL to start crawling from.
    start_url = "http://blog.livedoor.jp/hogehoge/sitemap.xml"
    # Collect every page URL reachable through nested sitemaps.
    all_urls = get_all_urls(start_url)
    # Report the results.
    print(f"Total URLs found: {len(all_urls)}")
    for url in all_urls:
        print(url)