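# NOTE(review): the next three lines look like file-viewer page residue, not code;
# they are kept as comments so the module parses.
# Corin1998's picture
# Update data.py
# eaf7cc5 verified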
from __future__ import annotations
import os
import json
from datetime import datetime
import pandas as pd
# Default to a location that is writable (on HF, /app can be read-only).
DEFAULT_WRITABLE_DIR = "/tmp/adcopy_data"

# The DATA_DIR environment variable, when set, overrides the default directory.
DATA_DIR = os.getenv("DATA_DIR", DEFAULT_WRITABLE_DIR)

# Event log (CSV) and metadata (JSON) both live directly inside DATA_DIR.
LOG_PATH = os.path.join(DATA_DIR, "events.csv")
META_PATH = os.path.join(DATA_DIR, "meta.json")

# Column names, in the order they are written to the event log.
SCHEMA = [
    "ts",
    "date",
    "medium",
    "creative",
    "is_control",
    "impressions",
    "clicks",
    "conversions",
    "cost",
    "features_json",
]
def _ensure_storage():
    """Create the data directory and seed its files when they are missing.

    A header-only CSV (columns from SCHEMA) is written to LOG_PATH and a
    small JSON document holding a ``created_at`` timestamp is written to
    META_PATH. Files that already exist are left untouched.
    """
    os.makedirs(DATA_DIR, exist_ok=True)

    log_missing = not os.path.exists(LOG_PATH)
    if log_missing:
        header_only = pd.DataFrame(columns=SCHEMA)
        header_only.to_csv(LOG_PATH, index=False)

    # Nothing more to do once the metadata file is already in place.
    if os.path.exists(META_PATH):
        return

    with open(META_PATH, "w", encoding="utf-8") as meta_file:
        meta = {"created_at": datetime.utcnow().isoformat()}
        json.dump(meta, meta_file)
# Prepare storage at import time: this creates the data directory and the
# seed files if they do not exist yet.
_ensure_storage()
def read_events() -> pd.DataFrame:
    """Load the event log from LOG_PATH and normalise its column types.

    Returns:
        The logged events. An empty log is returned exactly as read. For a
        non-empty log, ``date`` is parsed and rendered back as a date string,
        ``is_control`` and the count columns have missing values replaced by
        0 and are cast to int, ``cost`` is filled with 0.0 and cast to float,
        and a missing ``features_json`` becomes ``"{}"``.
    """
    _ensure_storage()
    events = pd.read_csv(LOG_PATH)
    if events.empty:
        return events

    # Round-trip through datetime so only the date part survives, as text.
    parsed_dates = pd.to_datetime(events["date"])
    events["date"] = parsed_dates.dt.date.astype(str)

    # The flag and the counters share the same "missing means zero" rule.
    for int_col in ("is_control", "impressions", "clicks", "conversions"):
        filled = events[int_col].fillna(0)
        events[int_col] = filled.astype(int)

    events["cost"] = events["cost"].fillna(0.0).astype(float)
    events["features_json"] = events["features_json"].fillna("{}")
    return events
def append_events(rows: pd.DataFrame) -> None:
    """Append event rows to the CSV log at LOG_PATH.

    Columns listed in SCHEMA but absent from ``rows`` are filled with
    defaults before writing: ``features_json`` -> ``"{}"``, ``ts`` -> the
    current UTC timestamp (ISO format), ``date`` -> the current UTC date,
    the count columns and ``is_control`` -> 0, ``cost`` -> 0.0, and any
    other column -> None. Columns not in SCHEMA are dropped, and rows are
    written in SCHEMA order without a header.

    Args:
        rows: Events to append. The frame is not modified; defaults are
            filled in on a copy.
    """
    _ensure_storage()

    # Read the clock once so a defaulted "ts" and "date" can never disagree
    # (two separate readings could straddle midnight).
    now = datetime.utcnow()
    defaults = {
        "features_json": "{}",
        "ts": now.isoformat(),
        "date": now.date().isoformat(),
        "impressions": 0,
        "clicks": 0,
        "conversions": 0,
        "is_control": 0,
        "cost": 0.0,
    }

    # Work on a copy: adding columns to the caller's frame would be a
    # surprising side effect and can raise SettingWithCopyWarning on slices.
    out = rows.copy()
    for col in SCHEMA:
        if col not in out.columns:
            # Columns without a specific default (e.g. medium) become None.
            out[col] = defaults.get(col)

    out[SCHEMA].to_csv(LOG_PATH, mode="a", header=False, index=False)
def aggregate(levels=("medium", "creative")) -> pd.DataFrame:
    """Sum the logged metrics per group.

    Events are grouped by ``levels`` plus ``is_control`` (rows whose group
    keys are missing are kept), and impressions, clicks, conversions and
    cost are summed within each group.

    Args:
        levels: Column name(s) to group by. A single column may be given as
            a plain string. ``"is_control"`` is always part of the grouping
            and is not duplicated if it is also listed here.

    Returns:
        One row per group with the group-key columns followed by the summed
        metrics. When the log is empty, an empty frame with those same
        columns is returned.
    """
    # A bare string would otherwise be splatted into single characters.
    if isinstance(levels, str):
        levels = (levels,)

    # Order-preserving de-duplication, so that listing "is_control" (or any
    # column twice) in ``levels`` does not produce duplicate group keys.
    keys = list(dict.fromkeys([*levels, "is_control"]))
    metrics = ("impressions", "clicks", "conversions", "cost")

    # read_events() prepares storage itself, so no separate call is needed.
    df = read_events()
    if df.empty:
        return pd.DataFrame(columns=[*keys, *metrics])

    grouped = df.groupby(keys, dropna=False)
    totals = grouped.agg(**{name: (name, "sum") for name in metrics})
    return totals.reset_index()