"""
collect_taito_gov.py
台東区公式サイト（city.taito.lg.jp / t-navi.city.taito.lg.jp）を
「言問散歩の営業視点」で定期監視し、新規掲載（補助金・行事・施策・新着）の
差分だけを taito-gov-updates.json に整形出力する。

役割（第2層＝定期監視層）:
    - 区政インテリジェンスの「最新情報を他に先駆けて掴む」ためのツール。
    - 静的キュレーション層（area-intel.json の taito-gov セクション）とは別。
      こちらは"差分アラート"、あちらは"分析済みの営業フック"。

実行方法:
    py -3 collect_taito_gov.py            # 通常実行（差分のみ追記）
    py -3 collect_taito_gov.py --dry      # フェッチして件数だけ表示（保存しない）

冪等設計:
    - _taito_gov_snapshot.json に前回の (url,title) 集合を保持し、純粋な差分検知。
    - 既存 taito-gov-updates.json の id と重複しないものだけ先頭に追記。
    - フェッチ失敗・0件でも既存データを壊さない（GR13 ログは stderr）。

GR19 Zero-Fabrication:
    - 取得した実ページのリンク・見出しのみを出力。本文の要約・talkTrack 付与は
      area-intel 合成側（人/AI）が行う。ここでは事実（タイトル+URL+取得日）だけ。

依存: 標準ライブラリのみ（urllib, json, re, hashlib, datetime, pathlib）。
    ※ requests 不要。
"""

import json
import re
import sys
import time
import hashlib
import datetime
import pathlib
import urllib.request
import urllib.parse
import urllib.error

# Directory containing this script; the output and snapshot files live beside it.
SCRIPT_DIR = pathlib.Path(__file__).parent.resolve()
# Feed of newly detected links that main() writes.
OUTPUT_FILE = SCRIPT_DIR / "taito-gov-updates.json"
# Per-source list of the "title|url" keys seen on the previous run.
SNAPSHOT_FILE = SCRIPT_DIR / "_taito_gov_snapshot.json"

# Browser-like User-Agent header sent by fetch_html() with every request.
UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)
# ISO-formatted local date, evaluated once at import time; stamped into each
# item's "found" field and the feed's "updated" field.
TODAY = datetime.date.today().isoformat()
# Timeout handed to urlopen() for each page fetch (seconds).
FETCH_TIMEOUT = 12

# ─────────────────────────────────────────────
# Monitored pages (SOURCES)
#   id       … key under which this source's link keys are stored in the snapshot
#   label    … human-readable name; copied to each item's "source_label" and
#              shown in fetch warnings
#   url      … index page to fetch (only URLs confirmed to exist)
#   base     … base used to turn relative links into absolute URLs
#   category … output category (classification of the sales hook)
#   priority … 1 = top priority (advertising / subsidies), 2 = high, 3 = reference
#   include  … keep only new links whose title contains at least one of these
#              words (None keeps everything)
#   ★ Append here the real URLs of subsidy lists / event calendars confirmed by
#     the research results (track B) to bring them into operation. Do not add
#     unverified URLs (avoids 404s and fabrication).
# ─────────────────────────────────────────────
SOURCES = [
    # ★ Top priority: subsidies for businesses (Taito City Industry Promotion
    #   Agency, a public-interest foundation). Directly tied to ad production
    #   and sales channels.
    {
        "id": "sangyo-jodan-josei",
        "label": "産業振興事業団 助成金一覧",
        "url": "https://taito-sangyo.jp/subsidy/subsidy-1/",
        "base": "https://taito-sangyo.jp/",
        "category": "補助金・助成",
        "priority": 1,
        "include": ["補助", "助成", "支援", "募集", "プロモ", "販路", "デジタル", "展示会", "創業", "承継"],
    },
    # ★ Ward shopping-street support (vacant shops, rent, renovation = where
    #   newly opening advertisers come from).
    {
        "id": "taito-shotengai",
        "label": "台東区 商店街支援",
        "url": "https://www.city.taito.lg.jp/bunka_kanko/shotengaishinko/shotengaishinko/index.html",
        "base": "https://www.city.taito.lg.jp/",
        "category": "補助金・商店街",
        "priority": 1,
        "include": ["補助", "助成", "支援", "空き店舗", "家賃", "改修", "募集", "商店街"],
    },
    # Ward SME / business-management support top page (watch for newly added
    # subsidies and startup support).
    {
        "id": "taito-keiei",
        "label": "台東区 中小企業支援",
        "url": "https://www.city.taito.lg.jp/bunka_kanko/jigyoukeiei/index.html",
        "base": "https://www.city.taito.lg.jp/",
        "category": "産業振興",
        "priority": 2,
        "include": None,
    },
    # Mayor's policy statements (primary source for upcoming priority measures,
    # new schemes and budget policy = the most important look-ahead).
    {
        "id": "taito-shoshin",
        "label": "区長所信表明",
        "url": "https://www.city.taito.lg.jp/kusei/kucho/shoshinhyomei/index.html",
        "base": "https://www.city.taito.lg.jp/",
        "category": "区政方針",
        "priority": 1,
        "include": None,
    },
    # Ward event calendar (confirmed event dates = input for issue planning and
    # sales timing).
    {
        "id": "taito-event",
        "label": "台東区 イベントカレンダー",
        "url": "https://www.city.taito.lg.jp/event/index.html",
        "base": "https://www.city.taito.lg.jp/",
        "category": "行事・催事",
        "priority": 2,
        "include": None,
    },
    # Culture / tourism / industry top page (awaiting publication of the tourism
    # promotion policy = the biggest look-ahead item).
    {
        "id": "taito-bunka-kanko",
        "label": "文化・観光・産業",
        "url": "https://www.city.taito.lg.jp/bunka_kanko/index.html",
        "base": "https://www.city.taito.lg.jp/",
        "category": "観光・文化・産業",
        "priority": 2,
        "include": None,
    },
]

# Link texts to drop: noise irrelevant to sales and the ward-office top page's
# navigation links. Applied with .search() to the cleaned link text, so a match
# anywhere in the text excludes the link; only the last alternative is anchored
# to the end of the text.
EXCLUDE_TITLE = re.compile(
    r"(このページ|お問い合わせ|サイトマップ|個人情報|アクセシビリティ|"
    r"よくある|外国語|English|中文|한국|RSS|閉じる|トップへ|ページの先頭|"
    r"音声読み上げ|文字拡大|キーワードから|人生の岐路|防災気象|ハザードマップ|"
    r"夜間・休日|犯罪発生|資源・ごみ|ごみと資源|粗大ごみ|プラスチック分別|"
    r"めぐりん|戸籍|混雑状況|診療案内|休室・休館|たいとうマップ|"
    r"配信登録|過去の配信|イベント検索|医療・介護情報|外国人の方へ|"
    r"組織から|目的から|ライフイベント|広報たいとう|区議会|入札|採用情報|"
    r"パブリックコメント募集一覧$)"
)
# Words that matter for subsidies / sales. A title containing any of them is
# tagged with the matching words and promoted to priority 1.
HOT_WORDS = ["補助", "助成", "支援金", "商店街", "宣伝", "広告", "PR", "販路",
             "キャッシュレス", "インバウンド", "創業", "DX", "観光", "賑わい",
             "リニューアル", "改装", "募集"]


def make_id(title: str, url: str) -> str:
    """Return a short, stable identifier for a link.

    The id is the first 10 hex digits of the MD5 digest of ``title`` directly
    followed by ``url`` (UTF-8 encoded). It is used only for de-duplication,
    not for anything security-related.
    """
    digest = hashlib.md5()
    digest.update(f"{title}{url}".encode("utf-8"))
    return digest.hexdigest()[:10]


def fetch_html(url: str) -> str:
    """Download ``url`` and return its body decoded to text.

    The request carries the module's browser-like User-Agent and is bounded by
    FETCH_TIMEOUT. The body is decoded by trying UTF-8, CP932 and EUC-JP in
    that order; if none decodes cleanly, UTF-8 with replacement characters is
    used so the caller always receives a string.

    Errors raised by ``urlopen`` (network failures, HTTP errors, timeouts)
    propagate to the caller.
    """
    request = urllib.request.Request(url, headers={"User-Agent": UA})
    with urllib.request.urlopen(request, timeout=FETCH_TIMEOUT) as response:
        payload = response.read()

    decoded = None
    for codec in ("utf-8", "cp932", "euc-jp"):
        try:
            decoded = payload.decode(codec)
        except UnicodeDecodeError:
            continue
        break
    if decoded is None:
        # Last resort: never fail on encoding, accept U+FFFD substitutions.
        decoded = payload.decode("utf-8", errors="replace")
    return decoded


# An <a ...> element with a quoted href: group 1 is the raw href value,
# group 2 the element's inner HTML (DOTALL lets it span several lines).
A_TAG = re.compile(r'<a\s+[^>]*href=["\']([^"\']+)["\'][^>]*>(.*?)</a>', re.I | re.S)
# Any markup tag; used to strip inline elements out of link text.
TAG_STRIP = re.compile(r"<[^>]+>")
# Runs of whitespace. For str patterns \s is Unicode-aware, so NBSP and the
# ideographic space (U+3000) are collapsed too.
WS = re.compile(r"\s+")


def clean_text(s: str) -> str:
    """Turn an HTML fragment into plain, single-line text.

    Steps, in order: remove markup tags, decode HTML character references,
    collapse every whitespace run to one space, and trim both ends.

    Character references are decoded in a single pass with
    ``html.unescape``. This fixes two defects of chained ``str.replace``
    decoding: double-unescaping (``&amp;lt;`` must become the text ``&lt;``,
    not ``<``) and silently leaving numeric or less common named references
    (``&#39;``, ``&#x30C6;``, ``&hellip;``) in the title.

    Note: titles that contained such references now come out differently, so
    they are reported once more as new on the first run after this change.
    """
    # Function-scope import: other functions in this file use ``html`` as a
    # variable name, so the module is deliberately not bound at module level.
    from html import unescape

    text = TAG_STRIP.sub("", s)
    # Tags are stripped *before* decoding so that escaped markup such as
    # "&lt;b&gt;" survives as literal text instead of being removed as a tag.
    text = unescape(text)
    return WS.sub(" ", text).strip()


# Character references explicitly terminated by ";". Only these are decoded in
# href values, so a raw query string such as "?a=1&region=2" is left untouched.
_HREF_CHARREF = re.compile(r"&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);")
# href prefixes that never lead to a fetchable page.
_NON_PAGE_PREFIXES = ("#", "javascript:", "mailto:", "tel:")


def extract_links(html: str, base: str):
    """Extract de-duplicated ``{"title", "url"}`` dicts from the anchors in ``html``.

    Args:
        html: Page markup to scan.
        base: URL against which relative hrefs are resolved.

    Returns:
        A list of dicts in document order, one per distinct (title, url) pair.

    A link is skipped when its cleaned text is shorter than 6 characters,
    matches EXCLUDE_TITLE, has an in-page / javascript / mailto / tel href, or
    does not resolve to an http(s) URL.

    The href is taken from the raw attribute value, so its character
    references are decoded first: ``href="p.cgi?a=1&amp;b=2"`` must yield
    ``...?a=1&b=2``, not a URL containing a literal ``&amp;``. URLs that
    contained such references therefore change once, and those links are
    reported as new on the first run after this change.
    """
    # The ``html`` parameter shadows the module of the same name, so import the
    # one function needed under its own name.
    from html import unescape

    found = []
    seen = set()
    for match in A_TAG.finditer(html):
        title = clean_text(match.group(2))
        if len(title) < 6:  # also covers the empty string
            continue
        if EXCLUDE_TITLE.search(title):
            continue
        href = _HREF_CHARREF.sub(
            lambda ref: unescape(ref.group(0)), match.group(1)
        ).strip()
        # Scheme names are case-insensitive, so compare in lower case.
        if not href or href.lower().startswith(_NON_PAGE_PREFIXES):
            continue
        absu = urllib.parse.urljoin(base, href)
        if not absu.lower().startswith("http"):
            continue
        key = (title, absu)
        if key in seen:
            continue
        seen.add(key)
        found.append({"title": title, "url": absu})
    return found


def hot_tags(title: str):
    """Return the HOT_WORDS entries that occur in ``title``, in HOT_WORDS order.

    Matching is a plain, case-sensitive substring test.
    """
    matched = []
    for word in HOT_WORDS:
        if word in title:
            matched.append(word)
    return matched


def load_json(path, default):
    """Read and parse the UTF-8 JSON file at ``path``.

    Returns ``default`` (the very object passed in) when the file does not
    exist, or when reading/parsing fails; in the failure case a warning is
    written to stderr. This function never raises for a bad file.
    """
    if not path.exists():
        return default
    try:
        text = path.read_text(encoding="utf-8")
        return json.loads(text)
    except Exception as err:
        # Deliberately best-effort: an unreadable file is treated as absent.
        print(f"[warn] {path.name} 読込失敗: {err}", file=sys.stderr)
        return default


def _write_json_atomic(path, payload):
    """Write ``payload`` as pretty-printed UTF-8 JSON, replacing ``path`` in one step.

    The data goes to a sibling ``<name>.tmp`` file first and is then renamed
    over the target, so an interrupted run cannot leave a half-written file
    that the next run would treat as unreadable.
    """
    tmp = path.with_name(path.name + ".tmp")
    tmp.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    tmp.replace(path)


def main():
    """Fetch every source, detect links not seen before, and update the feed.

    For each entry in SOURCES the index page is fetched, its links are
    extracted and (optionally) filtered by the source's ``include`` words, and
    every link whose "title|url" key is absent from the previous snapshot
    becomes a candidate item. Candidates whose id is already in the output
    file are dropped. New items are placed ahead of the existing ones, sorted
    by priority, and the feed is capped at 300 items.

    With ``--dry`` on the command line the results are only printed to stderr
    and nothing is saved. A source that fails to fetch is skipped and keeps
    its previous snapshot.
    """
    dry = "--dry" in sys.argv
    snapshot = load_json(SNAPSHOT_FILE, {})          # {source_id: ["title|url", ...]}
    existing = load_json(OUTPUT_FILE, {"_meta": {}, "items": []})
    existing_items = existing.get("items", [])
    # Tolerate hand-edited / malformed entries instead of dying on a KeyError.
    existing_ids = {
        it["id"] for it in existing_items if isinstance(it, dict) and "id" in it
    }

    new_items = []
    new_snapshot = dict(snapshot)

    for idx, src in enumerate(SOURCES):
        if idx:
            # Be polite to the servers: pause between consecutive requests,
            # including after a request that failed.
            time.sleep(1)
        try:
            page_html = fetch_html(src["url"])
        except Exception as e:
            print(f"[warn] fetch失敗 {src['label']} ({src['url']}): {e}", file=sys.stderr)
            continue
        # Resolve hrefs against the fetched page's own URL. Document-relative
        # links ("detail.html", "../x/") only resolve correctly relative to the
        # page; joining them onto the site root would produce URLs that do not
        # exist. Absolute and root-relative links come out the same either way.
        # Links that used to resolve wrongly are reported once more with their
        # corrected URL.
        page_url = urllib.parse.urljoin(src.get("base") or "", src["url"])
        links = extract_links(page_html, page_url)
        # "include" filter: keep only titles containing one of the words.
        if src.get("include"):
            links = [l for l in links
                     if any(w in l["title"] for w in src["include"])]
        prev = set(snapshot.get(src["id"], []))
        cur_keys = []
        for l in links:
            k = f"{l['title']}|{l['url']}"
            cur_keys.append(k)
            if k in prev:
                continue  # already seen on the previous run -> not a diff
            tags = hot_tags(l["title"])
            # Any hot word promotes the item to top priority.
            prio = 1 if (src["priority"] == 1 or tags) else src["priority"]
            # Trim the displayed headline to 80 characters (tidies links that
            # swallowed body text). The diff key and id use the full title.
            disp = l["title"]
            if len(disp) > 80:
                disp = disp[:78].rstrip("　 、。・") + "…"
            item = {
                "id": make_id(l["title"], l["url"]),
                "title": disp,
                "url": l["url"],
                "source_label": src["label"],
                "category": src["category"],
                "priority": prio,
                "tags": tags,
                "found": TODAY,
            }
            if item["id"] not in existing_ids:
                new_items.append(item)
                existing_ids.add(item["id"])
        # Replace this source's snapshot with the current key list (pure diff).
        new_snapshot[src["id"]] = cur_keys

    print(f"[info] 新規検出: {len(new_items)} 件", file=sys.stderr)
    for it in new_items[:20]:
        flag = "★" if it["priority"] == 1 else " "
        print(f"  {flag} [{it['category']}] {it['title']}", file=sys.stderr)

    if dry:
        print("[dry] 保存せず終了", file=sys.stderr)
        return

    # Output: new items first, highest priority leading (the sort is stable, so
    # discovery order is kept within a priority), followed by the older items.
    merged = sorted(new_items, key=lambda x: x["priority"]) + existing_items
    out = {
        "_meta": {
            "title": "台東区 区政アラート（新着監視・営業先読み）",
            "purpose": "台東区公式サイトの新規掲載（補助金・行事・施策）を他に先駆けて掴む差分フィード",
            "updated": TODAY,
            "fabricationNote": "GR19準拠。区公式ページの実在リンク・見出しのみ。talkTrack付与はarea-intel合成側で行う。",
            "note": "★=広告/補助金など営業最優先。area-intel.json の taito-gov セクションへ昇格させる候補。",
        },
        "items": merged[:300],
    }
    # Feed first, snapshot second: if the run dies in between, the next run
    # re-detects the same links but the id check keeps them from duplicating.
    _write_json_atomic(OUTPUT_FILE, out)
    _write_json_atomic(SNAPSHOT_FILE, new_snapshot)
    print(f"[ok] {OUTPUT_FILE.name} 更新（全{len(out['items'])}件）", file=sys.stderr)


if __name__ == "__main__":
    main()