#!/usr/bin/env python3
"""Does Google search interest lead or lag the Bitcoin price?

Companion analysis for the blog post "The Google Trends Case for Bitcoin
Investment" and its 2026 update. Reproducible: data is in ./data, the figure
goes to the post's image folder.

Inputs (./data):
  gtrends_2009_2026.csv          monthly Google Trends ("bitcoin","ethereum"), 2009-2026
  btc_daily.csv, eth_daily.csv   daily close (Nasdaq export, 2019-2026)

Price history starts 2019-03, so the quantitative window is 2019-2026, which
covers two cycles: the 2021 top and the 2024-2025 top.
"""
import os
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# Directory containing this script; every other path is resolved relative to it.
HERE = os.path.dirname(os.path.abspath(__file__))
# Input CSVs are read from ./data next to the script.
DATA = os.path.join(HERE, "data")
# Destination of the rendered figure: two directory levels above this script,
# under public/images/blog/googletrends/.
OUT = os.path.join(HERE, "..", "..", "public", "images", "blog", "googletrends",
                   "bitcoin-price-vs-search-2026.png")


def load_search(col):
    """Return one column of the Google Trends CSV as a time-indexed Series.

    Reads ``gtrends_2009_2026.csv`` from DATA, parses its ``Time`` column to
    datetimes, and returns the column named ``col`` indexed by ``Time``.
    """
    csv_path = os.path.join(DATA, "gtrends_2009_2026.csv")
    trends = pd.read_csv(csv_path)
    trends["Time"] = pd.to_datetime(trends["Time"])
    by_time = trends.set_index("Time")
    return by_time[col]


def load_price(fname):
    """Return the monthly mean close for the daily price file ``fname`` in DATA.

    ``Date`` is parsed as MM/DD/YYYY and used as a sorted index. ``Close/Last``
    is converted to numbers, with unparseable entries coerced to NaN, and then
    averaged per calendar month (result labelled at month start).
    """
    csv_path = os.path.join(DATA, fname)
    daily = pd.read_csv(csv_path)
    daily["Date"] = pd.to_datetime(daily["Date"], format="%m/%d/%Y")
    by_day = daily.set_index("Date").sort_index()
    close = pd.to_numeric(by_day["Close/Last"], errors="coerce")
    monthly = close.resample("MS")
    return monthly.mean()


def cross_corr(search, price, max_lag=6):
    """Correlate period-over-period changes of search and price at several lags.

    Lag k > 0 means search lags price by k rows (months for monthly data): the
    search change at row t is paired with the price change at row t - k.
    Lag k < 0 means search leads price.

    Args:
        search: search-interest Series.
        price: price Series on the same index as ``search``.
        max_lag: largest shift, in rows, tested in each direction.

    Returns:
        ``(rows, best)``: ``rows`` is a list of ``(lag, r)`` for every lag in
        ``-max_lag..max_lag``; ``best`` is the row with the highest r, with NaN
        correlations ranked below every real value.
    """
    inf = float("inf")
    ds = search.pct_change(fill_method=None)
    dp = price.pct_change(fill_method=None)
    # A change measured from a zero base is +/-inf, and one inf turns the whole
    # Pearson correlation into NaN. Treat such changes as missing so they are
    # dropped pairwise like any other gap.
    ds = ds.mask(ds.abs() == inf)
    dp = dp.mask(dp.abs() == inf)

    rows = [(k, ds.corr(dp.shift(k))) for k in range(-max_lag, max_lag + 1)]
    best = max(rows, key=lambda row: row[1] if pd.notna(row[1]) else -inf)
    return rows, best


def peaks(s, p, lo, hi):
    """Return ``(price peak label, search peak label)`` within ``lo..hi``.

    The window mask is built from the index of ``s`` (bounds inclusive) and
    applied to both series, so ``s`` and ``p`` must share the same index.
    """
    stamps = s.index
    in_window = (stamps >= lo) & (stamps <= hi)
    price_top = p[in_window].idxmax()
    search_top = s[in_window].idxmax()
    return price_top, search_top


def analyze(name, col, price_file):
    """Print the lead/lag summary for one asset and return its aligned data.

    Loads the search column ``col`` and the price file ``price_file``, keeps
    only the dates present in both, and prints the level and change
    correlations, the best lag from ``cross_corr``, and the price-top versus
    search-peak timing for each of the two cycle windows.

    Returns:
        ``(search, price, rows, best_lag)``: the aligned series, the list of
        ``(lag, r)`` rows, and the lag with the highest correlation.
    """
    search = load_search(col)
    price = load_price(price_file)
    common = search.index.intersection(price.index)
    search = search.loc[common]
    price = price.loc[common]

    rows, best = cross_corr(search, price)
    best_lag, best_r = best

    print(f"\n== {name}  ({common.min().date()}..{common.max().date()}, {len(common)} mo) ==")
    r_levels = search.corr(price)
    r_changes = search.pct_change(fill_method=None).corr(price.pct_change(fill_method=None))
    print(f"  r levels={r_levels:.2f}  r changes={r_changes:.2f}"
          f"  best lag={best_lag:+d}mo (r={best_r:.2f})")

    cycles = (
        ("2021 cycle", "2020-06", "2022-12"),
        ("2024-25 cycle", "2023-01", "2026-06"),
    )
    for label, start, end in cycles:
        price_top, search_top = peaks(search, price, start, end)
        # Whole months from the price top to the search peak (negative = search first).
        gap = (search_top.to_period('M') - price_top.to_period('M')).n
        print(f"  {label}: price top {price_top.date()} (${price[price_top]:,.0f})  search peak {search_top.date()} "
              f"({int(search[search_top])})  -> search {gap:+d} mo vs top")
    return search, price, rows, best_lag


def main():
    """Print both asset summaries and write the Bitcoin figure to OUT."""
    search, price, lag_rows, best_lag = analyze("Bitcoin", "bitcoin", "btc_daily.csv")
    # Ethereum is only reported on stdout; its results are not plotted.
    analyze("Ethereum", "ethereum", "eth_daily.csv")

    fig, (price_ax, lag_ax) = plt.subplots(
        1, 2, figsize=(12, 4.6), gridspec_kw={"width_ratios": [2.3, 1]}
    )

    # Left panel: price on a log axis with search interest on a twin linear axis.
    price_ax.set_title("Bitcoin price vs. 'bitcoin' search interest (2019-2026)")
    price_ax.semilogy(price.index, price.values, color="#f7931a", lw=2)
    price_ax.set_ylabel("BTC price (USD, log)", color="#b8650f")
    search_ax = price_ax.twinx()
    search_ax.plot(search.index, search.values, color="#2563eb", lw=2)
    search_ax.set_ylabel("'bitcoin' search interest (0-100)", color="#2563eb")

    # Mark the price top (dashed) and search peak (dotted) of each cycle window.
    cycle_windows = (("2020-06", "2022-12"), ("2023-01", "2026-06"))
    for start, end in cycle_windows:
        price_top, search_top = peaks(search, price, start, end)
        price_ax.axvline(price_top, color="#b8650f", ls="--", lw=1)
        search_ax.axvline(search_top, color="#2563eb", ls=":", lw=1)

    # Right panel: correlation per lag, with the best lag highlighted in green.
    lags = [k for k, _ in lag_rows]
    corrs = [r for _, r in lag_rows]
    bar_colors = ["#16a34a" if k == best_lag else "#9ca3af" for k in lags]
    lag_ax.set_title("Lead/lag (MoM changes)")
    lag_ax.bar(lags, corrs, color=bar_colors)
    lag_ax.axvline(0, color="#444", lw=0.8)
    lag_ax.set_xlabel("lag (months): >0 = search lags price")
    lag_ax.set_ylabel("correlation")

    fig.tight_layout()
    out_dir = os.path.dirname(OUT)
    os.makedirs(out_dir, exist_ok=True)
    fig.savefig(OUT, dpi=130)
    # Report the output path relative to the directory two levels above this script.
    base_dir = os.path.join(HERE, '..', '..')
    print(f"\nwrote {os.path.relpath(OUT, base_dir)}")


# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
