The Financial Effects of Hospital System Affiliation¶
This project answers the following question: What is the effect on hospital financial performance when a hospital becomes affiliated with (or switches into) a hospital system?
Understanding this effect is important because hospital system affiliation is often motivated by financial pressures, access to capital, and potential efficiency gains. At the same time, hospitals that choose to join systems may already differ systematically from those that remain independent, making careful empirical analysis essential.
To address this question, I use a hospital-level panel dataset and an event-study framework that compares hospitals before and after system entry, while controlling for hospital fixed effects, time effects, and other control variables. This approach allows me to isolate changes in financial performance that coincide with the timing of system affiliation, under the standard parallel-trends assumption.
I find that hospital net income margins increase by approximately 2.3 percentage points in the year a hospital joins a system. This effect is meaningful: the estimated increase amounts to roughly 60% of the average net margin observed among treated hospitals before system entry.
0. Department of Health Care Access and Information (HCAI) Pivot Data¶
The Hospital Annual Financial Data Pivot Tables dataset from HCAI contains facility-level annual financial and utilization information for hospitals licensed in California. This publicly available data is submitted by hospitals as part of required annual financial disclosure reporting and includes detailed revenue, expense, balance sheet, utilization, and capacity measures. It enables comparative and trend analysis of hospital financial performance across years, including profitability, payer mix, operating costs, and resource utilization.
I downloaded the "Pivot Table - Hospital Annual Selected File" for each year between 1995 and 2024:
- Some Key Fields:
Hospital identifiers and structure
- fac_no: unique HCAI hospital facility identifier.
- fac_name: hospital name.
- county: county in which the hospital is located.
- hsa_name: health service area.
- hfpa: health facility planning area.
Time variables
- year: reporting year (constructed using file year).
- beg_date: beginning of the report period.
- end_date: end of the report period.
System affiliation and treatment variables (I construct these)
- system (≥ 3 hosp): reported health system name (non-missing indicates system affiliation).
- in_system: indicator for system affiliation.
- switch_system: indicator for transition from independent to system-affiliated.
- treated: an indicator for hospitals that ever enter a system.
- event_year: first year of system entry.
- event_time: years relative to system entry.
Financial performance variables
- net_pt_rev: net patient revenue.
- oth_op_rev: other operating revenue.
- net_income: net income.
- net_frm_op: net income from operations.
- tot_op_exp: total operating expenses.
Capacity and utilization
- bed_lic: number of licensed hospital beds.
- dis_tot: total inpatient discharges.
- vis_tot: total outpatient visits.
Payer-specific utilization
- day_mcal_tr: medi-cal traditional patient days.
- day_mcal_mc: medi-cal managed care patient days.
- day_mcar_tr: medicare traditional patient days.
- day_mcar_mc: medicare managed care patient days.
- day_tot: total patient days.
First, let's load libraries and take care of some general housekeeping.
#Housekeeping
import sys
import os
import re
#Import functions stored in a folder (custom)
#sys.path.append("directoryname")
import time
import pandas as pd
import numpy as np
from glob import glob
import seaborn as sns
from scipy import stats
import pyreadstat
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
clear = lambda: os.system('cls')
clear()
#Define main path and output path to export results
os.chdir('G:/My Drive/Projects/Diff in Diff')
out_path = "./Output/"
os.makedirs(out_path, exist_ok=True)
Let's import and append all years of the HCAI data.
1. Importing the HCAI Financial Pivot Data¶
Given the number of annual files, I load each year's file separately and then append everything into a single panel. To achieve this, I:
- Start with all the excel files saved into a single path.
- Load (and append) the Excel files sequentially. Note: I handle 2024 separately because it comes as an .xlsm file rather than .xls. I locate it explicitly, read the same Data sheet, and assign year = 2024 so it matches the structure of the earlier files.
- Concatenate everything into a single dataset: first I combine the 1995–2023 files from the list into one DataFrame, then append the 2024 file to produce a unified dataset (hcai_data) with hospital-year observations.
#First, let's define filepaths
raw_path = "./Data/"
os.makedirs(raw_path, exist_ok=True)
excel_files = glob(os.path.join(raw_path, "*.xls"))
print(f"Found {len(excel_files)} raw postings files.")
#Append all data
dfs = []
for file in excel_files:
print("Loading:", file)
#Extract Years
match = re.search(r'(\d{4})', file)
year = int(match.group(1))
df = pd.read_excel(file,sheet_name="Data")
df["year"] = year
dfs.append(df)
#ADD 2024 (comes as an .xlsm instead of .xls)
file_excel_2024 = glob(os.path.join(raw_path, "hafd2024pivot.xlsm"))
excel_2024 = pd.read_excel(file_excel_2024[0], sheet_name="Data")
excel_2024["year"] = 2024
# Append into a single DataFrame
hcai_data_1995_2023 = pd.concat(dfs, ignore_index=True)
hcai_data = pd.concat([hcai_data_1995_2023, excel_2024],ignore_index=True )
#hcai_data_clean = hcai_data.clean_names()
print("Combined shape:", hcai_data.shape)
display(hcai_data)
Found 29 raw postings files.
Loading: ./Data\hafd2023pivot.xls Loading: ./Data\hafd2019pivot.xls Loading: ./Data\hafd2018pivot.xls Loading: ./Data\hafd2017pivot.xls Loading: ./Data\hafd2013pivot.xls Loading: ./Data\hafd2012pivot.xls Loading: ./Data\hafd2014pivot.xls Loading: ./Data\hafd2016pivot.xls Loading: ./Data\hafd2015pivot.xls Loading: ./Data\hafd2008pivot.xls Loading: ./Data\hafd2007pivot.xls Loading: ./Data\hafd2006pivot.xls Loading: ./Data\hafd2005pivot.xls Loading: ./Data\hafd2004pivot.xls Loading: ./Data\hafd2003pivot.xls Loading: ./Data\hafd2002pivot.xls Loading: ./Data\hafd2001pivot.xls Loading: ./Data\hafd2000pivot.xls Loading: ./Data\hafd1999pivot.xls Loading: ./Data\hafd1998pivot.xls Loading: ./Data\hafd1997pivot.xls Loading: ./Data\hafd1996pivot.xls Loading: ./Data\hafd1995pivot.xls Loading: ./Data\hafd2011pivot.xls Loading: ./Data\hafd2010pivot.xls Loading: ./Data\hafd2009pivot.xls Loading: ./Data\hafd2020pivot.xls Loading: ./Data\hafd2021pivot.xls Loading: ./Data\hafd2022pivot.xls Combined shape: (13518, 328)
| FAC_NO | FAC_NAME | BEG_DATE | END_DATE | DAY_PER | county name | HSA name | HFPA | TYPE_CNTRL | SYSTEM (≥ 3 HOSP) | ... | TOT_LTDEBT | CUR_MAT | ner a/r (calc) | nurs fte | prod fte | paid fte | hsa name | Health System | COUNTY | HSA | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 106010735.0 | ALAMEDA HOSPITAL | 2022-07-01 00:00:00 | 2023-06-30 00:00:00 | 365.0 | Alameda | 05 - EAST BAY | 417.0 | City/County | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 106010739.0 | ALTA BATES SUMMIT MEDICAL CENTER - ALTA BATES ... | 2023-01-01 00:00:00 | 2023-12-31 00:00:00 | 365.0 | Alameda | 05 - EAST BAY | 415.0 | Non-Profit | SUTTER HEALTH | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 106010776.0 | UCSF BENIOFF CHILDREN'S HOSPITAL OAKLAND | 2022-07-01 00:00:00 | 2023-06-30 00:00:00 | 365.0 | Alameda | 05 - EAST BAY | 417.0 | Non-Profit | UNIVERSITY OF CALIFORNIA | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 106010846.0 | HIGHLAND HOSPITAL | 2022-07-01 00:00:00 | 2023-06-30 00:00:00 | 365.0 | Alameda | 05 - EAST BAY | 417.0 | City/County | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 106010887.0 | KINDRED HOSPITAL - SAN FRANCISCO BAY AREA | 2023-01-01 00:00:00 | 2023-12-31 00:00:00 | 365.0 | Alameda | 05 - EAST BAY | 421.0 | Investor | SCIONHEALTH | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13513 | 106444013.0 | WATSONVILLE COMMUNITY HOSPITAL | 2024-01-01 00:00:00 | 2024-12-31 00:00:00 | 366.0 | Santa Cruz | 08 - MID-COAST | 711.0 | Non-Profit | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 13514 | 106301379.0 | WEST ANAHEIM MEDICAL CENTER | 2024-01-01 00:00:00 | 2024-12-31 00:00:00 | 366.0 | Orange | 13 - ORANGE COUNTY | 1012.0 | Investor | PRIME HEALTHCARE SERVICES | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 13515 | 106190883.0 | WHITTIER HOSPITAL MEDICAL CENTER | 2023-07-01 00:00:00 | 2024-06-30 00:00:00 | 366.0 | Los Angeles | 11 - LOS ANGELES COUNTY | 919.0 | Investor | AHMC HEALTHCARE, INC. | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 13516 | 106571086.0 | WOODLAND MEMORIAL HOSPITAL | 2023-07-01 00:00:00 | 2024-06-30 00:00:00 | 366.0 | Yolo | 02 - GOLDEN EMPIRE | 313.0 | Non-Profit | DIGNITY HEALTH | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 13517 | 106380939.0 | ZUCKERBERG SAN FRANCISCO GENERAL HOSPITAL & TR... | 2023-07-01 00:00:00 | 2024-06-30 00:00:00 | 366.0 | San Francisco | 04 - WEST BAY | 423.0 | City/County | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
13518 rows × 328 columns
2. Define the Estimator, Treatment, and Outcome Variables¶
In more technical terms, the goal of this project is to estimate the Average Treatment Effect (ATE) of hospital system affiliation on financial performance. Conceptually, the ATE compares a hospital’s financial outcomes after joining a system to the outcomes that same hospital would have experienced had it remained independent. Formally, let:
- $Y_{it}(1)$ denote the potential outcome for hospital $i$ in year $t$ if the hospital is affiliated with a system.
- $Y_{it}(0)$ denote the potential outcome for hospital $i$ in year $t$ if the hospital is not affiliated with a system.
The ATE is defined as:
$$ \text{ATE} = \mathbb{E}\left[ Y_{it}(1) - Y_{it}(0) \right] $$
The main problem to solve is that $Y_{it}(0)$ is not observed for hospitals after they join a system, so a counterfactual must be estimated. To achieve this, I explain the technique below.
2.1 Difference-in-Differences and Event-Study Identification¶
I estimate the ATE using a difference-in-differences (DiD) event-study design, which exploits variation in the timing of hospital system entry across hospitals. In practice, because counterfactual outcomes are only constructed for hospitals that actually join systems, the estimates are best interpreted as average effects among treated hospitals (the ATT).
Let:
- $D_{it} = 1$ if hospital $i$ has joined a system by year $t$, and $0$ otherwise
- $\alpha_i$ denote hospital fixed effects
- $\lambda_t$ denote year fixed effects
The event-study specification is:
$$ Y_{it} = \alpha_i + \lambda_t + \sum_{k \neq -1} \beta_k \cdot \mathbf{1}\{\text{event\_time}_{it} = k\} + \varepsilon_{it} $$
where:
- $\text{event\_time}_{it} = t - t_i^{*}$
- $t_i^{*}$ is the year hospital $i$ first joins a system
- $k$ indexes years relative to system entry
- the year immediately before treatment ($k = -1$) is omitted and serves as the reference period
Under the parallel trends assumption, the post-treatment coefficients $\beta_k$ identify the causal effect of hospital system affiliation.
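As a quick illustration of this logic, the following simulation (synthetic data, not the HCAI sample; all names and parameter values are made up for the example) shows that a two-way fixed-effects regression recovers a known treatment effect when parallel trends hold by construction:

```python
# Illustrative simulation: two-way fixed effects recover a known
# treatment effect when treated and control units share common trends.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
n_hosp, n_years, true_effect = 60, 10, 0.02

rows = []
for i in range(n_hosp):
    alpha_i = rng.normal(0, 0.05)      # hospital fixed effect
    treated = i < n_hosp // 2          # half the hospitals join a system
    t_star = 6                         # common entry year, for simplicity
    for t in range(n_years):
        lam_t = 0.005 * t              # year effect common to all hospitals
        post = int(treated and t >= t_star)
        y = alpha_i + lam_t + true_effect * post + rng.normal(0, 0.01)
        rows.append({"hosp": i, "year": t, "post": post, "y": y})
df = pd.DataFrame(rows)

m = smf.ols("y ~ post + C(hosp) + C(year)", data=df).fit()
print(round(m.params["post"], 3))  # close to the true effect of 0.02
```

Because the simulated year effects are common to all hospitals, parallel trends hold exactly; giving treated hospitals a steeper trend instead would bias the estimate, which is why the pre-trend checks below matter.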
2.2 Treatment Definition and Outcome Variables¶
The treatment is defined as a hospital joining a health system operating three or more hospitals in year $t$.
I construct an indicator for system affiliation, identify the first year of entry into a system, and define event time as years relative to this entry.
I also study the following financial outcomes:
- Operating margin: net income from operations / total operating revenue.
- Net income margin: net income / total revenue (including non-operating revenue).
- Cost-to-charge ratio: operating expenses net of other operating revenue, divided by gross patient revenue.
- Cost intensity: total operating expenses / total discharges.
I implement these definitions in the code below.
#Define Treatment variable
panel_df = hcai_data.sort_values(["FAC_NO", "year"])
panel_df["system"] = panel_df["SYSTEM (≥ 3 HOSP)"].fillna("INDEPENDENT_or_<3")
panel_df["system_lag"] = panel_df.groupby("FAC_NO")["system"].shift(1)
panel_df["system_raw"] = panel_df["SYSTEM (≥ 3 HOSP)"]
panel_df["in_system"] = panel_df["system_raw"].notna().astype(int)
panel_df["in_system_lag"] = panel_df.groupby("FAC_NO")["in_system"].shift(1)
# Event: first time entering a system (0 -> 1)
panel_df["switch_system"] = ((panel_df["in_system_lag"] == 0) & (panel_df["in_system"] == 1)).astype(int)
#Define treated indicator: hospitals that ever switch into a system
panel_df["treated"] = panel_df.groupby("FAC_NO")["switch_system"].transform("max")
#Compute the event year per hospital and add to the main dataset
event_year = (
panel_df
.loc[panel_df["switch_system"] == 1]
.groupby("FAC_NO")["year"]
.min()
.rename("event_year")
)
print(panel_df["treated"].value_counts())
panel_df = panel_df.merge(event_year, on="FAC_NO", how="left")
panel_df["event_time"] = panel_df["year"] - panel_df["event_year"]
#Define Outcome Variables
panel_df["tot_op_rev"] = panel_df["NET_PT_REV"] + panel_df["OTH_OP_REV"]
# 1) Operating margin = Net income from operations / total operating revenue
panel_df["operating_margin"] = panel_df["NET_FRM_OP"]/ panel_df["tot_op_rev"]
# 2) Net income margin = Net income / (net patient rev + other op rev + nonop rev)
panel_df["total_revenue_incl_nonop"] = panel_df["tot_op_rev"] + panel_df["NONOP_REV"]
panel_df["net_income_margin"] = panel_df["NET_INCOME"]/ panel_df["total_revenue_incl_nonop"]
# 3) Cost-to-charge ratio = (Total op exp - other op rev) / gross patient revenue
panel_df["cost_to_charge_ratio"] = (panel_df["TOT_OP_EXP"] - panel_df["OTH_OP_REV"])/ panel_df["GR_PT_REV"]
# 4) Cost Intensity
panel_df["cost_per_discharge"] = np.where(
(panel_df["DIS_TOT"].isna()) | (panel_df["DIS_TOT"] == 0),
np.nan,
panel_df["TOT_OP_EXP"] / panel_df["DIS_TOT"]
)
#Additional variables
#Payment Mix
panel_df["mcal_share"] = (panel_df["DAY_MCAL_TR"] + panel_df["DAY_MCAL_MC"]) / panel_df["DAY_TOT"]
panel_df["mcar_share"] = (panel_df["DAY_MCAR_TR"] + panel_df["DAY_MCAR_MC"]) / panel_df["DAY_TOT"]
print(panel_df)
treated
1.0 8124
0.0 5393
Name: count, dtype: int64
FAC_NO FAC_NAME BEG_DATE \
0 106010735.0 ALAMEDA HOSPITAL NaN
1 106010735.0 ALAMEDA HOSPITAL NaN
2 106010735.0 ALAMEDA HOSPITAL NaN
3 106010735.0 ALAMEDA HOSPITAL NaN
4 106010735.0 ALAMEDA HOSPITAL NaN
... ... ... ...
13513 106580996.0 ADVENTIST HEALTH AND RIDEOUT 2021-01-01 00:00:00
13514 106580996.0 ADVENTIST HEALTH AND RIDEOUT 2023-01-01 00:00:00
13515 106580996.0 ADVENTIST HEALTH AND RIDEOUT 2022-01-01 00:00:00
13516 106580996.0 ADVENTIST HEALTH AND RIDEOUT 2024-01-01 00:00:00
13517 NaN NaN NaT
END_DATE DAY_PER county name HSA name HFPA \
0 1995-12-31 00:00:00 365.0 ALAMEDA NaN 417.0
1 1996-12-31 00:00:00 366.0 ALAMEDA NaN 417.0
2 1997-12-31 00:00:00 365.0 ALAMEDA NaN 417.0
3 1998-12-31 00:00:00 365.0 ALAMEDA NaN 417.0
4 1999-12-31 00:00:00 365.0 ALAMEDA 05 - EAST BAY 417.0
... ... ... ... ... ...
13513 2021-12-31 00:00:00 365.0 Yuba 02 - GOLDEN EMPIRE 227.0
13514 2023-12-31 00:00:00 365.0 Yuba 02 - GOLDEN EMPIRE 227.0
13515 2022-12-31 00:00:00 365.0 Yuba 02 - GOLDEN EMPIRE 227.0
13516 2024-12-31 00:00:00 366.0 Yuba 02 - GOLDEN EMPIRE 227.0
13517 NaT NaN NaN NaN NaN
TYPE_CNTRL SYSTEM (≥ 3 HOSP) ... event_year event_time \
0 NON-PROFIT NaN ... 2017.0 -22.0
1 NON-PROFIT NaN ... 2017.0 -21.0
2 NON-PROFIT NaN ... 2017.0 -20.0
3 NON-PROFIT NaN ... 2017.0 -19.0
4 NON-PROFIT NaN ... 2017.0 -18.0
... ... ... ... ... ...
13513 Non-Profit ADVENTIST HEALTH SYSTEMS ... 2018.0 3.0
13514 Non-Profit ADVENTIST HEALTH SYSTEMS ... 2018.0 5.0
13515 Non-Profit ADVENTIST HEALTH SYSTEMS ... 2018.0 6.0
13516 Non-Profit ADVENTIST HEALTH SYSTEMS ... 2018.0 NaN
13517 NaN NaN ... NaN NaN
tot_op_rev operating_margin total_revenue_incl_nonop \
0 35267808.0 0.001672 35938244.0
1 36513839.0 0.010444 37122489.0
2 35752157.0 0.003077 36849945.0
3 35188110.0 0.013413 36526628.0
4 33433033.0 -0.068330 34449131.0
... ... ... ...
13513 450759624.0 -0.151103 453738388.0
13514 493945531.0 -0.027518 495147269.0
13515 448045987.0 -0.056035 449259078.0
13516 495639486.0 -0.060706 496932628.0
13517 NaN NaN NaN
net_income_margin cost_to_charge_ratio cost_per_discharge mcal_share \
0 0.019696 0.336585 9178.532586 NaN
1 0.026074 0.311648 8377.580339 NaN
2 0.032098 0.298755 8504.449773 NaN
3 0.049567 0.282768 8454.972479 NaN
4 -0.036819 0.288248 9454.075966 NaN
... ... ... ... ...
13513 -0.190936 0.265128 43091.991446 0.310778
13514 -0.067256 0.230475 46877.060589 0.271021
13515 -0.116736 0.242527 45768.240762 0.292008
13516 -0.098090 0.227167 46230.016444 0.313916
13517 NaN NaN NaN NaN
mcar_share
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
... ...
13513 0.545992
13514 0.577480
13515 0.553161
13516 0.543731
13517 NaN
[13518 rows x 345 columns]
3. Descriptive Statistics¶
Now let's clean missing values and run some descriptive statistics to look at the data that we are dealing with:
#Now let's drop na, summarise the outcome and treatment variables
outcomes = ["operating_margin","net_income_margin","cost_to_charge_ratio","cost_per_discharge"]
panel_df_clean = (
panel_df
.replace([np.inf, -np.inf], np.nan)
.dropna(subset=outcomes)
)
# Let's winsorize the outcome variables too (clip at the 2nd and 98th percentiles)
for v in outcomes:
lo, hi = panel_df_clean[v].quantile([0.02, 0.98])
panel_df_clean[v] = panel_df_clean[v].clip(lo, hi)
print("Descriptive Statistics for Outcome Variables:")
print(panel_df_clean[outcomes].describe())
#Now lets summarise the treatment variable
N_obs = len(panel_df_clean)
N_hosp = panel_df_clean["FAC_NO"].nunique()
N_events = panel_df_clean["switch_system"].sum()
event_rate = panel_df_clean["switch_system"].mean()
ever_treated = panel_df_clean.groupby("FAC_NO")["switch_system"].max()
N_ever_treated = ever_treated.sum()
switch_years = panel_df_clean.loc[panel_df_clean.switch_system == 1, "year"]
#Summary Statistics for treatment variable.
treatment_summary = pd.DataFrame({
"Statistic": [
"Hospital-years",
"Hospitals",
"Switch events",
"Event rate (hospital-year)",
"Hospitals ever switching",
"First switch year",
"Last switch year"
],
"Value": [
N_obs,
N_hosp,
int(N_events),
round(event_rate, 4),
int(N_ever_treated),
int(switch_years.min()) if len(switch_years) > 0 else None,
int(switch_years.max()) if len(switch_years) > 0 else None
]
})
print("Descriptive Statistics for Treatment Variable:")
print(treatment_summary)
print("Descriptive Statistics for Event Timing:")
print(panel_df_clean["event_time"].describe())
Descriptive Statistics for Outcome Variables:
operating_margin net_income_margin cost_to_charge_ratio \
count 12802.000000 12802.000000 12802.000000
mean -0.017996 0.023994 0.405265
std 0.176492 0.125268 0.231116
min -0.769381 -0.403427 0.128554
25% -0.061050 -0.023347 0.238315
50% 0.010111 0.030855 0.331640
75% 0.073290 0.092650 0.499828
max 0.265380 0.291121 1.037215
cost_per_discharge
count 12802.000000
mean 31945.535524
std 41274.135209
min 4155.582886
25% 10478.387466
50% 18590.900462
75% 34746.293307
max 237197.147220
Descriptive Statistics for Treatment Variable:
Statistic Value
0 Hospital-years 12802.0000
1 Hospitals 646.0000
2 Switch events 288.0000
3 Event rate (hospital-year) 0.0225
4 Hospitals ever switching 268.0000
5 First switch year 2017.0000
6 Last switch year 2024.0000
Descriptive Statistics for Event Timing:
count 7253.000000
mean -8.014339
std 8.735105
min -28.000000
25% -15.000000
50% -8.000000
75% -1.000000
max 7.000000
Name: event_time, dtype: float64
The analysis sample consists of 12,802 hospital-year observations across 646 hospitals (between 1995 and 2024), after trimming extreme values at the 2nd and 98th percentiles.
On average, hospitals exhibit a negative operating margin (−1.8%) but a positive net income margin (2.4%), indicating that non-operating revenues play an important role in overall financial performance. The large dispersion in both margins highlights substantial heterogeneity across hospitals. The cost-to-charge ratio averages 0.41, while cost per discharge averages approximately $31,900, with wide variation across facilities, reflecting differences in scale, case mix, and intensity of care.
System entry is relatively rare, with 288 entry events affecting 268 hospitals, corresponding to a 2.25% hospital-year event rate. Most hospitals enter a system only once, consistent with system affiliation being a largely irreversible decision. Entry events occur between 2017 and 2024, concentrating treatment in the later part of the sample.
The distribution of event timing shows a long pre-treatment window (median event time of −8 years) and several post-treatment years for many hospitals. This supports the use of an event-study design, allowing for credible assessment of pre-trends and dynamic treatment effects.
3.1 Pre-Treatment Differences¶
Before estimating treatment effects, it is important to examine whether hospitals that eventually join a system differ systematically from hospitals that remain independent prior to treatment. These pre-treatment differences provide insight into selection into system affiliation and help assess the plausibility of the identifying assumptions underlying the event-study design.
pre = panel_df_clean[(panel_df_clean["event_time"] < 0) | (panel_df_clean["treated"] == 0)]
#print(pre)
balance = (
pre
.groupby("treated")[outcomes]
.agg(["mean", "std"])
)
print(balance)
#Standardized Differences
def std_diff(x_t, x_c):
return (x_t.mean() - x_c.mean()) / np.sqrt(0.5 * (x_t.var() + x_c.var()))
std_diffs = {
v: std_diff(pre.loc[pre.treated == 1, v],
pre.loc[pre.treated == 0, v])
for v in outcomes
}
print(std_diffs)
operating_margin net_income_margin \
mean std mean std
treated
0.0 -0.063276 0.201035 0.003888 0.128696
1.0 0.014162 0.147510 0.039296 0.115774
cost_to_charge_ratio cost_per_discharge
mean std mean std
treated
0.0 0.510327 0.264233 40478.690211 55878.828597
1.0 0.343155 0.164460 19818.031044 18744.281899
{'operating_margin': 0.4392019913980789, 'net_income_margin': 0.2892660723932149, 'cost_to_charge_ratio': -0.7596093472387215, 'cost_per_discharge': -0.4957438000843327}
The balance table shows that, prior to system entry, hospitals that eventually join a system differ meaningfully from never-treated hospitals:
- Treated hospitals have higher pre-treatment operating margins (1.4% vs. −6.3%) and higher net income margins (3.9% vs. 0.4%).
- Treated hospitals exhibit lower cost-to-charge ratios (0.34 vs. 0.51), suggesting lower costs relative to charges.
- Treated hospitals have substantially lower cost per discharge (about $19,800 vs. $40,500), consistent with differences in scale, efficiency, or case mix.
These differences indicate that hospitals entering systems are not observationally identical to those that remain independent even before treatment.
The standardized differences quantify the magnitude of these gaps relative to within-group variation:
- Operating margin: 0.44
- Net income margin: 0.29
- Cost-to-charge ratio: −0.76
- Cost per discharge: −0.50
Values of this magnitude indicate moderate to large pre-treatment imbalance, particularly for cost-based measures. This pattern suggests non-random selection into system affiliation, with treated hospitals appearing financially stronger and more cost-efficient prior to entry.
In short, affiliating hospitals look different even before affiliation (hospitals that join systems appear more profitable and more cost-efficient across all metrics before treatment), so identification must rely on trends rather than levels. Fortunately, difference-in-differences and event-study designs do not require treated and control hospitals to have identical pre-treatment levels. Instead, identification relies on the parallel trends assumption: absent treatment, outcomes for treated and control hospitals would have evolved similarly over time.
Accordingly, the presence of pre-treatment level differences heightens the importance of:
- Hospital fixed effects to absorb time-invariant differences
- Event-study pre-trend tests to assess whether treated and control hospitals followed similar trends prior to system entry.
4. Event-study pre-trends and dynamic effects¶
To study both pre-trends and dynamic post-treatment effects, I estimate an event-study difference-in-differences model using a symmetric window around the treatment event. Specifically, I focus on a ±3-year window around the first year in which a hospital enters a multi-hospital system, so event time takes values $k \in \{-3,-2,-1,0,1,2,3\}$.
I construct indicator variables for each event time $k$, but omit $k = -1$ (the year immediately prior to system entry). Omitting this period normalizes the model so that it serves as the baseline, which implies that all event-study coefficients are interpreted relative to the year just before treatment. This normalization is standard in event-study designs and facilitates a clear interpretation of both pre-treatment and post-treatment dynamics.
The event-study regression includes hospital fixed effects to absorb time-invariant differences across hospitals—such as persistent differences in size, mission, or market position—and year fixed effects to absorb shocks common to all hospitals in a given year, including policy changes and macroeconomic conditions.
I estimate two specifications. Spec 1 (baseline) includes only the event-time indicators and fixed effects. Spec 2 (controls) augments the baseline specification with time-varying hospital controls capturing scale, utilization, and payer mix, including licensed beds (BED_LIC), total discharges (DIS_TOT), total outpatient visits (VIS_TOT), and the shares of patient days attributable to Medi-Cal and Medicare (mcal_share), (mcar_share). These controls account for observable changes in hospital activity and payer composition over time.
Standard errors are clustered at the hospital level to allow for serial correlation in outcomes within hospitals. The primary credibility check in this framework is the pattern of the lead coefficients (event times $k < 0$): estimates close to zero support the parallel trends assumption. The post-treatment coefficients (event times $k \ge 0$) trace out the dynamic effects of system entry on hospital financial outcomes.
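One implementation detail used in the estimation code: the cluster groups must be aligned with the rows statsmodels actually keeps after patsy drops missing values. This step can be isolated in a toy example (the column names here are illustrative):

```python
# Sketch: align cluster groups with the estimation sample when some
# rows are dropped for missing values before clustering standard errors.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

df = pd.DataFrame({
    "y":     [1.0, 2.0, np.nan, 4.0, 5.0, 6.0],
    "x":     [0.0, 1.0, 2.0,    3.0, 4.0, 5.0],
    "group": ["a", "a", "b",    "b", "c", "c"],
})

m = smf.ols("y ~ x", data=df).fit()      # patsy silently drops the NaN row
used_idx = m.model.data.row_labels       # index of the rows actually used
groups_used = df.loc[used_idx, "group"]  # cluster labels for those rows only
m_cl = m.get_robustcov_results(cov_type="cluster", groups=groups_used)
print(len(used_idx), m_cl.bse.shape)
```

Skipping the `row_labels` lookup and passing the full `group` column would raise a length mismatch whenever rows with missing outcomes or controls are dropped during fitting.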
event_window = (-3, 3)
# Keep treated hospitals + never-treated controls
es_df = panel_df_clean.copy()
column_names_list = es_df.columns.tolist()
#print(column_names_list)
# Keep treated hospitals within the window + never-treated controls
es_df = es_df[
(es_df["event_time"].between(event_window[0], event_window[1])) |
(es_df["treated"] == 0)
].copy()
#This function defines the variable coefficient names
def evname(k):
if k < 0:
return f"event_m{abs(k)}"
elif k > 0:
return f"event_p{k}"
else:
return "event_0"
# Create dummies, omit -1 as reference
for k in range(event_window[0], event_window[1] + 1):
if k == -1:
continue
es_df[evname(k)] = (es_df["event_time"] == k).astype(int)
event_terms = " + ".join([evname(k) for k in range(event_window[0], event_window[1] + 1) if k != -1])
#controls = ["BED_LIC", "DIS_TOT", "VIS_TOT"] # Control Variables
controls = ["BED_LIC", "DIS_TOT", "VIS_TOT","mcal_share","mcar_share"] # Control Variables
#Spec1_baseline uses only event dummies + FE.
#Spec2_controls adds your controls list (if it’s empty, it collapses to the baseline).
specs = {
"Spec1_baseline": "", # no extra controls
"Spec2_controls": (" + " + " + ".join(controls)) if len(controls) > 0 else ""
}
results = {} # store fitted models
models = {}
#Run this to loop through all outcomes and specs
for y in outcomes:
for spec_name, extra in specs.items():
formula = f"{y} ~ {event_terms}{extra} + C(FAC_NO) + C(year)"
# If you’re using local market-by-year FE, add e.g.:
# formula += " + C(HFPA):C(year)"
# 1) fit without robust SE first (lets patsy drop rows)
m = smf.ols(formula, data=es_df).fit()
# 2) align clusters to estimation sample
used_idx = m.model.data.row_labels
groups_used = es_df.loc[used_idx, "FAC_NO"]
# 3) clustered SE
m_cl = m.get_robustcov_results(cov_type="cluster", groups=groups_used)
models[(y, spec_name)] = m_cl
# 4) Convert params + SEs to pandas objects
param_names = m_cl.model.exog_names
coef = pd.Series(m_cl.params, index=param_names)
se = pd.Series(m_cl.bse, index=param_names)
p_values = pd.Series(m_cl.pvalues, index=param_names)
ci = pd.DataFrame(
m_cl.conf_int(alpha=0.05),
index=param_names,
columns=["ci_lower", "ci_upper"]
)
#print(ci)
# 5) Keep only event-study coefficients
event_table = pd.DataFrame({
"coef": coef,
"se": se,
"p-value": p_values,
"ci_lower": ci["ci_lower"],
"ci_upper": ci["ci_upper"]
}).loc[lambda d: d.index.str.startswith("event_")]
results[(y, spec_name)] = event_table
print("\n" + "=" * 100)
print(f"Outcome: {y} | {spec_name}")
print(f"Formula: {formula}")
print(event_table.round(4))
====================================================================================================
Outcome: operating_margin | Spec1_baseline
Formula: operating_margin ~ event_m3 + event_m2 + event_0 + event_p1 + event_p2 + event_p3 + C(FAC_NO) + C(year)
coef se p-value ci_lower ci_upper
event_m3 0.0134 0.0153 0.3782 -0.0165 0.0434
event_m2 0.0050 0.0149 0.7392 -0.0243 0.0342
event_0 0.0235 0.0156 0.1314 -0.0071 0.0542
event_p1 0.0048 0.0174 0.7809 -0.0294 0.0391
event_p2 -0.0201 0.0173 0.2452 -0.0540 0.0138
event_p3 0.0073 0.0178 0.6815 -0.0276 0.0422
====================================================================================================
Outcome: operating_margin | Spec2_controls
Formula: operating_margin ~ event_m3 + event_m2 + event_0 + event_p1 + event_p2 + event_p3 + BED_LIC + DIS_TOT + VIS_TOT + mcal_share + mcar_share + C(FAC_NO) + C(year)
coef se p-value ci_lower ci_upper
event_m3 0.0093 0.0157 0.5557 -0.0216 0.0402
event_m2 -0.0012 0.0148 0.9330 -0.0304 0.0279
event_0 0.0248 0.0159 0.1203 -0.0065 0.0561
event_p1 0.0050 0.0176 0.7759 -0.0296 0.0397
event_p2 -0.0158 0.0177 0.3721 -0.0505 0.0189
event_p3 0.0155 0.0180 0.3888 -0.0198 0.0508
====================================================================================================
Outcome: net_income_margin | Spec1_baseline
Formula: net_income_margin ~ event_m3 + event_m2 + event_0 + event_p1 + event_p2 + event_p3 + C(FAC_NO) + C(year)
coef se p-value ci_lower ci_upper
event_m3 0.0178 0.0119 0.1333 -0.0055 0.0411
event_m2 0.0019 0.0105 0.8551 -0.0187 0.0225
event_0 0.0228 0.0115 0.0471 0.0003 0.0453
event_p1 -0.0117 0.0125 0.3468 -0.0362 0.0127
event_p2 -0.0135 0.0125 0.2809 -0.0380 0.0111
event_p3 0.0104 0.0141 0.4588 -0.0172 0.0380
====================================================================================================
Outcome: net_income_margin | Spec2_controls
Formula: net_income_margin ~ event_m3 + event_m2 + event_0 + event_p1 + event_p2 + event_p3 + BED_LIC + DIS_TOT + VIS_TOT + mcal_share + mcar_share + C(FAC_NO) + C(year)
coef se p-value ci_lower ci_upper
event_m3 0.0165 0.0122 0.1775 -0.0075 0.0405
event_m2 -0.0011 0.0107 0.9183 -0.0222 0.0200
event_0 0.0237 0.0117 0.0429 0.0008 0.0466
event_p1 -0.0115 0.0125 0.3598 -0.0361 0.0131
event_p2 -0.0114 0.0127 0.3687 -0.0363 0.0135
event_p3 0.0146 0.0142 0.3034 -0.0133 0.0426
====================================================================================================
Outcome: cost_to_charge_ratio | Spec1_baseline
Formula: cost_to_charge_ratio ~ event_m3 + event_m2 + event_0 + event_p1 + event_p2 + event_p3 + C(FAC_NO) + C(year)
coef se p-value ci_lower ci_upper
event_m3 -0.0015 0.0113 0.8946 -0.0237 0.0207
event_m2 0.0016 0.0097 0.8678 -0.0174 0.0206
event_0 -0.0004 0.0076 0.9557 -0.0154 0.0146
event_p1 0.0071 0.0087 0.4162 -0.0100 0.0242
event_p2 0.0033 0.0094 0.7222 -0.0151 0.0217
event_p3 0.0067 0.0106 0.5296 -0.0142 0.0275
====================================================================================================
Outcome: cost_to_charge_ratio | Spec2_controls
Formula: cost_to_charge_ratio ~ event_m3 + event_m2 + event_0 + event_p1 + event_p2 + event_p3 + BED_LIC + DIS_TOT + VIS_TOT + mcal_share + mcar_share + C(FAC_NO) + C(year)
coef se p-value ci_lower ci_upper
event_m3 0.0005 0.0113 0.9667 -0.0217 0.0226
event_m2 0.0052 0.0091 0.5649 -0.0126 0.0231
event_0 -0.0010 0.0078 0.8965 -0.0163 0.0143
event_p1 0.0071 0.0086 0.4140 -0.0099 0.0241
event_p2 0.0012 0.0093 0.8988 -0.0171 0.0195
event_p3 0.0019 0.0107 0.8607 -0.0191 0.0228
====================================================================================================
Outcome: cost_per_discharge | Spec1_baseline
Formula: cost_per_discharge ~ event_m3 + event_m2 + event_0 + event_p1 + event_p2 + event_p3 + C(FAC_NO) + C(year)
coef se p-value ci_lower ci_upper
event_m3 1654.9555 1711.0039 0.3338 -1705.2177 5015.1286
event_m2 -250.1380 1856.2393 0.8928 -3895.5331 3395.2571
event_0 -1415.7636 1027.8029 0.1689 -3434.2251 602.6979
event_p1 -2781.8773 1703.3519 0.1029 -6127.0228 563.2682
event_p2 -2619.0611 1687.5564 0.1212 -5933.1866 695.0643
event_p3 -3743.0870 2133.4759 0.0799 -7932.9361 446.7621
====================================================================================================
Outcome: cost_per_discharge | Spec2_controls
Formula: cost_per_discharge ~ event_m3 + event_m2 + event_0 + event_p1 + event_p2 + event_p3 + BED_LIC + DIS_TOT + VIS_TOT + mcal_share + mcar_share + C(FAC_NO) + C(year)
coef se p-value ci_lower ci_upper
event_m3 1960.9430 1731.5409 0.2579 -1440.6717 5362.5577
event_m2 433.5700 1818.8000 0.8117 -3139.4653 4006.6053
event_0 -1522.7253 1035.7051 0.1421 -3557.3696 511.9190
event_p1 -2749.9656 1713.0429 0.1090 -6115.2410 615.3099
event_p2 -2879.2718 1681.2241 0.0874 -6182.0391 423.4955
event_p3 -4541.1149 2139.6542 0.0343 -8744.4688 -337.7611
The event-study estimates provide a direct test of the parallel trends assumption underlying the difference-in-differences design. Across all outcomes, the pre-treatment coefficients (event times −3 and −2) are small and statistically insignificant, with no systematic upward or downward pattern. This absence of differential pre-trends supports the credibility of the identification strategy and suggests that treated and control hospitals followed similar trajectories prior to system entry.
Post-treatment dynamics differ across outcomes. For operating margin, cost-to-charge ratios, and cost per discharge, the estimated effects are generally imprecise and statistically insignificant, indicating limited evidence of sustained changes in operating efficiency or cost structure following system affiliation. While some point estimates suggest short-run declines in costs, these effects are not robust across specifications.
In contrast, net income margin shows a clear and robust response at the time of treatment. The coefficient at event time 0 is positive and statistically significant in both specifications, indicating an immediate improvement in overall profitability upon system entry. This effect fades in subsequent years, pointing to a short-run financial stabilization effect rather than a persistent improvement. Given that net income margin is the only outcome with a statistically significant treatment effect, the remainder of the analysis focuses on this outcome.
Now, I plot the estimated event-study coefficients for net income margin together with their 95% confidence intervals, providing a visual summary of the regression results.
First, I define a helper function that converts the event-time variable names (e.g., event_m3, event_0, event_p2) into their corresponding numeric values, so that coefficients can be plotted along a continuous event-time axis. For each specification (Spec1 baseline and Spec2 with controls), the code extracts the estimated coefficients and confidence intervals from the regression output and orders them by event time. Then, I plot the results.
#Helper to convert event_m5/event_p2/event_0 to numeric event time
def parse_event_time(name: str) -> int:
if name == "event_0":
return 0
m = re.match(r"event_m(\d+)$", name)
if m:
return -int(m.group(1))
m = re.match(r"event_p(\d+)$", name)
if m:
return int(m.group(1))
raise ValueError(f"Unrecognized event term: {name}")
colors = {
"Spec1_baseline": "tab:blue",
"Spec2_controls": "tab:red"
}
for spec in specs:
tbl = results[("net_income_margin", spec)].copy()
tbl["k"] = [parse_event_time(ix) for ix in tbl.index]
tbl = tbl.sort_values("k")
# point estimate + 95% CI
y = tbl["coef"].to_numpy()
se = tbl["se"].to_numpy()
lower = tbl["ci_lower"].to_numpy()
upper = tbl["ci_upper"].to_numpy()
yerr = np.vstack([y - lower, upper - y])
plt.errorbar(
tbl["k"],
y,
yerr=yerr,
fmt="o",
color=colors.get(spec, "black"),
capsize=4,
markersize=5,
linewidth=1.5,
label=spec.replace("_", " ").title()
)
plt.axhline(0, color="black", linewidth=1)
plt.axvline(-1, linestyle="--", color="black", linewidth=1)
plt.xlabel("Event time (years relative to system entry)")
plt.ylabel("Effect on Net Income Margin")
plt.title("Event Study: Net Income Margin", fontsize=13)
plt.legend(frameon=False)
plt.tight_layout()
plt.show()
The plot displays point estimates for each event-time coefficient, with vertical error bars representing the 95% confidence intervals. Different colors distinguish the two specifications, allowing a direct comparison between the baseline model and the model that includes additional hospital controls. The horizontal line at zero represents no effect, while the vertical dashed line at event time −1 marks the omitted baseline period, so all coefficients are interpreted relative to the year immediately before system entry.
This event-study plot visually confirms the regression findings. The pre-treatment coefficients (event times less than zero) are clustered around zero and their confidence intervals overlap zero, indicating no evidence of differential pre-trends prior to system entry. This supports the parallel trends assumption underlying the difference-in-differences design.
Now, let's turn to the robustness checks.
5. Robustness Checks¶
I will test:
- Pre-trends ("lead") joint test: This test formally assesses the parallel trends assumption, which is the key identifying condition for the event-study design.
- To achieve this, I jointly test whether the estimated coefficients on the pre-treatment event-time indicators (the leads) are equal to zero.
- Failure to reject this null hypothesis indicates that treated and control hospitals followed similar trends before system entry, supporting the validity of the difference-in-differences approach.
- Placebo "fake event year" for never-treated controls: as a falsification exercise, I assign artificial treatment dates to hospitals that never enter the system and re-estimate the event-study model.
- Because these hospitals never experience the treatment, the estimated effects around the placebo event should be statistically indistinguishable from zero.
- Finding no systematic effects in this placebo test increases confidence that the main results are not driven by spurious timing or mechanical features of the estimation procedure.
- “Shifted event" placebo (pretend treatment happens earlier): The final robustness check shifts the treatment timing earlier than the true event date for treated hospitals and re-estimates the event-study model.
- If the estimated effects reflect genuine causal impacts of system entry, no treatment-like pattern should emerge around this artificially shifted event.
- This test helps rule out the possibility that the main results are driven by underlying trends or anticipatory dynamics rather than the timing of system affiliation itself.
Let's start testing the hypothesis of parallel trends.
5.1 Pre-trends ("lead") joint test.¶
The null hypothesis is that all pre-treatment event-study coefficients (the leads) are jointly equal to zero:
$$ H_0: \beta_{k} = 0 \quad \text{for all } k < -1 $$
where $\beta_{k}$ denotes the coefficient on the indicator for event time $k$ (relative to the omitted baseline period $k = -1$). Here, I test the null for $k = -3, -2$. The alternative hypothesis is that at least one pre-treatment coefficient differs from zero:
$$ H_1: \exists \, k < -1 \text{ such that } \beta_{k} \neq 0 $$
Rejection of the null would indicate that treated and control hospitals were on differential trends before treatment, violating the parallel trends assumption. The test is implemented as a Wald test of linear restrictions on the estimated regression coefficients. Under the null hypothesis, the Wald statistic is asymptotically chi-squared distributed:
$$ W \sim \chi^2(q) $$
where $q$ is the number of pre-treatment coefficients being tested jointly; in this case, $q = 2$. In practice, statsmodels reports the equivalent finite-sample F statistic $W/q$. Lastly, standard errors are clustered at the hospital level, so the test accounts for serial correlation in outcomes within hospitals over time. For more information about the test, see this overview from the Tilburg University Science Hub.
# Test the pre-trend coefficients jointly
def lead_terms(event_window=(-5, 5)):
leads = []
for k in range(event_window[0], 0):
if k == -1:
continue
leads.append(f"event_m{abs(k)}")
return leads
leads = lead_terms(event_window=event_window)
print(leads)
restriction = ", ".join([f"{term} = 0" for term in leads])
print(restriction)
spec_list = list(specs.keys())
spec1 = models[("net_income_margin",spec_list[0])]
print(spec1.wald_test(restriction))
spec2 = models[("net_income_margin",spec_list[1])]
print(spec2.wald_test(restriction))
['event_m3', 'event_m2']
event_m3 = 0, event_m2 = 0
<F test: F=array([[1.41014099]]), p=0.2449034646127719, df_denom=610, df_num=2>
<F test: F=array([[1.45541918]]), p=0.23424407837869188, df_denom=524, df_num=2>
The joint Wald tests fail to reject the null hypothesis that the pre-treatment coefficients are equal to zero in both specifications of the Net Profit Margin. For the baseline model, the test yields an F-statistic of 1.41 with a p-value of 0.245, while the specification with controls yields an F-statistic of 1.46 with a p-value of 0.234. In both cases, the p-values are well above conventional significance thresholds (i.e $\alpha=0.05$ and $\alpha=0.1$), indicating no statistical evidence of differential pre-treatment trends in net income margins between treated and control hospitals. These results support the parallel trends assumption and reinforce the credibility of the event-study estimates for net income margin.
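As a quick sanity check, the reported p-values can be recomputed directly from the F distribution, using the F statistics and degrees of freedom shown in the test output above (a minimal sketch, assuming scipy is available):

```python
# Recompute the Wald test p-values from the F distribution.
# The F statistics and degrees of freedom are taken from the output above.
from scipy.stats import f

p_baseline = f.sf(1.41014099, dfn=2, dfd=610)  # baseline specification
p_controls = f.sf(1.45541918, dfn=2, dfd=524)  # specification with controls
print(round(p_baseline, 3), round(p_controls, 3))  # → 0.245 0.234
```

Both survival probabilities match the p-values reported by statsmodels, confirming that the tests use the F form of the Wald statistic with hospital-clustered degrees of freedom.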
5.2 Placebo "fake event year" for never-treated controls:¶
Now, let's run the placebo: a "fake event year" for never-treated controls. The idea is to assign each never-treated hospital a random year from its observed years as a placebo event, build placebo event dummies, and run the event study. A successful test should report effects that are close to zero and statistically insignificant.
Formally, for hospitals that are never treated, I randomly assign a placebo event year $ \tilde{t}_i $ from the set of years in which hospital $i$ is observed. I then define a placebo event-time variable:
$$ \widetilde{\text{event\_time}}_{it} = t - \tilde{t}_i $$
and estimate the following placebo event-study specification:
$$ Y_{it} = \alpha_i + \lambda_t + \sum_{k \neq -1} \widetilde{\beta}_k \cdot \mathbf{1}\{\widetilde{\text{event\_time}}_{it} = k\} + \varepsilon_{it} $$
where $Y_{it}$ denotes the outcome of interest (net income margin), and the coefficients $\widetilde{\beta}_k $ capture changes in outcomes around the placebo event.
Because these hospitals never experience the treatment, the null hypothesis is that all placebo coefficients are equal to zero:
$$ H_0: \widetilde{\beta}_k = 0 \quad \text{for all } k $$
Failure to reject this null hypothesis indicates that the event-study design does not mechanically generate treatment effects in the absence of true treatment. In other words, the absence of systematic effects around the placebo event increases confidence that the main results are not driven by spurious timing, model overfitting, or coincidental fluctuations in outcomes. Conversely, finding statistically significant placebo effects would suggest that the estimated treatment effects may reflect underlying trends or noise rather than a causal impact of system entry.
See Difference-in-Differences Designs: A Practitioner’s Guide for a detailed overview.
The next block of code constructs the placebo treatment timing for hospitals that never enter a system. For each never-treated hospital, I randomly assign a fake event year drawn from the set of years in which the hospital is observed. Using this artificial timing, I compute a placebo event-time variable and create a set of placebo event-time indicators over a symmetric window of −3 to +3 years (omitting −1 as the baseline). This step mirrors the construction of the true event-study variables, ensuring that any detected effects in the placebo analysis are not driven by differences in specification or variable construction.
# 1) Create placebo event_time for controls
rng = np.random.default_rng(123)
placebo_df = es_df.copy()
controls = placebo_df[placebo_df["treated"] == 0].copy()
fake_event_year = (
controls.groupby("FAC_NO")["year"]
.apply(lambda s: rng.choice(s.values))
.rename("fake_event_year")
)
placebo_df = placebo_df.merge(fake_event_year, on="FAC_NO", how="left")
placebo_df["placebo_event_time"] = np.where(
placebo_df["treated"] == 0,
placebo_df["year"] - placebo_df["fake_event_year"],
np.nan
)
# -------------------------------------------------------------------
# WINDOW = (-3, 3)
window_min, window_max = -3, 3
# 2) Create placebo event dummies for k in [-3,3], omit -1
for col in list(placebo_df.columns):
if col.startswith("pl_event_"):
placebo_df.drop(columns=col, inplace=True)
for k in range(window_min, window_max + 1):
if k == -1:
continue
if k < 0:
name = f"pl_event_m{abs(k)}"
elif k > 0:
name = f"pl_event_p{k}"
else:
name = "pl_event_0"
placebo_df[name] = (placebo_df["placebo_event_time"] == k).astype(int)
# Build terms that match the created columns
pl_event_terms = " + ".join(
[f"pl_event_m{abs(k)}" for k in range(window_min, 0) if k != -1] +
["pl_event_0"] +
[f"pl_event_p{k}" for k in range(1, window_max + 1)]
)
print("Placebo terms:", pl_event_terms)
Placebo terms: pl_event_m3 + pl_event_m2 + pl_event_0 + pl_event_p1 + pl_event_p2 + pl_event_p3
Using the placebo constructed variables, the next code below estimates the placebo event-study regression using only never-treated hospitals. I regress net income margin on the placebo event-time indicators, including hospital and year fixed effects and clustering standard errors at the hospital level. Because these hospitals never experience the actual treatment, any statistically significant coefficients around the placebo event would indicate spurious dynamics or model artifacts. Conversely, the absence of systematic effects provides a falsification check that strengthens the causal interpretation of the main event-study results.
# 3) Run placebo regression (controls only)
placebo_controls = placebo_df[placebo_df["treated"] == 0].copy()
placebo_controls = placebo_controls[
placebo_controls["placebo_event_time"].between(window_min, window_max)
].copy()
# (Sanity check: confirm dummy columns exist)
#missing = [t.strip() for t in pl_event_terms.split("+") if t.strip() not in placebo_controls.columns]
#print("Missing placebo columns (should be empty):", missing)
placebo_models = {}
for spec_name in spec_list:
extra = specs[spec_name]
formula_pl = f"net_income_margin ~ {pl_event_terms}{extra} + C(FAC_NO) + C(year)"
m = smf.ols(formula_pl, data=placebo_controls).fit()
used_idx = m.model.data.row_labels
groups_used = placebo_controls.loc[used_idx, "FAC_NO"].to_numpy()
m_cl = m.get_robustcov_results(cov_type="cluster", groups=groups_used)
placebo_models[spec_name] = m_cl
param_names = m_cl.model.exog_names
coef = pd.Series(m_cl.params, index=param_names)
se = pd.Series(m_cl.bse, index=param_names)
p_values = pd.Series(m_cl.pvalues, index=param_names)
keep = [p for p in param_names if p.startswith("pl_event_")]
out_tbl = pd.DataFrame({"coef": coef[keep], "se": se[keep], "p-value": p_values[keep]})
print("\n" + "-" * 90)
print(f"PLACEBO (fake event year, controls only) | {spec_name}")
print(out_tbl.round(4))
------------------------------------------------------------------------------------------
PLACEBO (fake event year, controls only) | Spec1_baseline
coef se p-value
pl_event_m3 0.0139 0.0094 0.1415
pl_event_m2 -0.0008 0.0084 0.9195
pl_event_0 -0.0026 0.0079 0.7431
pl_event_p1 0.0053 0.0091 0.5599
pl_event_p2 -0.0173 0.0097 0.0758
pl_event_p3 -0.0075 0.0102 0.4612
------------------------------------------------------------------------------------------
PLACEBO (fake event year, controls only) | Spec2_controls
coef se p-value
pl_event_m3 0.0077 0.0116 0.5093
pl_event_m2 -0.0039 0.0110 0.7241
pl_event_0 -0.0000 0.0101 0.9968
pl_event_p1 -0.0021 0.0120 0.8613
pl_event_p2 -0.0193 0.0127 0.1306
pl_event_p3 -0.0195 0.0144 0.1764
The fake event year placebo provides a falsification check by assigning artificial treatment dates to hospitals that never enter a system. Across both specifications, the estimated placebo coefficients are small in magnitude, statistically insignificant, and display no systematic pattern around the placebo event.
More specifically, the coefficients at the placebo event year are essentially zero, and the surrounding lead and lag coefficients fluctuate around zero without evidence of a discontinuity or persistent post-event trend. Although one post-placebo coefficient in the baseline specification is marginally significant at the 10 percent level, this effect disappears once controls are included and does not form part of a coherent dynamic pattern. Overall, these results indicate that the event-study framework does not mechanically generate treatment effects in the absence of true treatment, reinforcing the interpretation that the main net income margin results are driven by the actual timing of system entry rather than spurious correlations.
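The single marginal coefficient is also about what multiple testing alone would produce. A back-of-the-envelope calculation (a sketch under the simplifying assumption that the 12 placebo tests, 6 per specification, are independent) shows that at least one nominally significant estimate at the 10 percent level is quite likely even under the null:

```python
# Chance of at least one false positive among 12 placebo coefficients,
# each tested at the 10% level (simplifying assumption: independent tests).
n_tests, alpha = 12, 0.10
p_at_least_one = 1 - (1 - alpha) ** n_tests
print(round(p_at_least_one, 3))  # → 0.718
```

With roughly a 72 percent chance of at least one spurious rejection, an isolated marginally significant placebo coefficient is unsurprising.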
5.3 "Shifted event" placebo (pretend treatment happens earlier)¶
As a final falsification exercise, I implement a shifted-event placebo in which the treatment date is artificially moved earlier for hospitals that eventually enter a system. This test assesses whether the estimated event-study dynamics could be driven by underlying trends or anticipatory behavior rather than the true timing of system entry.
Formally, for treated hospitals, I redefine the event-time variable by shifting it forward by a fixed number of years (three years in this analysis):
$$ \widetilde{\text{event\_time}}_{it} = \text{event\_time}_{it} + 3 $$
Using this shifted event time, I re-estimate the event-study specification:
$$ Y_{it} = \alpha_i + \lambda_t + \sum_{k \neq -1} \widetilde{\beta}_k \cdot \mathbf{1}\{\widetilde{\text{event\_time}}_{it} = k\} + \varepsilon_{it} $$
where the sample is restricted to treated hospitals only, and the event window is again limited to $k \in \{-3,-2,-1,0,1,2,3\}$, with $k = -1$ omitted as the reference period.
Null hypothesis¶
Because the shifted event does not correspond to the true timing of system entry, the null hypothesis is that all shifted-event coefficients are equal to zero:
$$ H_0: \widetilde{\beta}_k = 0 \quad \text{for all } k $$
Under a valid causal interpretation of the main results, the shifted-event placebo should produce coefficients close to zero and no systematic post-event pattern around the fake treatment date. Observing a treatment-like dynamic in this placebo would suggest that the main event-study estimates may reflect underlying trends or anticipation effects rather than the causal impact of system entry. Conversely, the absence of such patterns strengthens the interpretation that the estimated effects are tied to the true timing of treatment.
In the code below, I focus on hospitals that eventually join a system and artificially shift the treatment date earlier than its true timing. Specifically, I redefine event time by adding three years to the original event-time variable, effectively pretending that system entry occurred three years sooner. I then restrict the sample to treated hospitals and to the same symmetric event window $[-3, 3]$ around this fake event. Using this shifted timeline, I construct a set of shifted event-time indicators (e.g., sh_event_m3, sh_event_m2, sh_event_0, sh_event_p1, sh_event_p2, sh_event_p3), omitting the year immediately before the fake treatment ($k = -1$) as the reference period. This mirrors the construction of the true event-study variables while intentionally misaligning the treatment timing.
shift = 3 # shift event by +n (treat as if event happened n years earlier).
outcome = "net_income_margin"
shift_df = es_df.copy()
#1) Create shifted event_time (treated only)
#If treated==1, then shift_event_time = event_time + shift
#If treated==0, then leave as NaN (we are not using controls in this placebo)
shift_df["shift_event_time"] = np.where(
shift_df["treated"] == 1,
shift_df["event_time"] + shift,
np.nan
)
#2) Restrict to treated hospitals and event window
shift_treated = shift_df[shift_df["treated"] == 1].copy()
shift_treated = shift_treated[
shift_treated["shift_event_time"].between(window_min, window_max)
].copy()
#3) Drop old shifted dummy columns if re-running
for col in list(shift_treated.columns):
if col.startswith("sh_event_"):
shift_treated.drop(columns=col, inplace=True)
#4a) Create shifted event-time dummies (omit -1 as the reference period)
# sh_event_m3, sh_event_m2, sh_event_0, sh_event_p1, sh_event_p2, sh_event_p3 (ex for window size 3)
for k in range(window_min, window_max + 1):
if k == -1:
continue
if k < 0:
name = f"sh_event_m{abs(k)}"
elif k > 0:
name = f"sh_event_p{k}"
else:
name = "sh_event_0"
shift_treated[name] = (shift_treated["shift_event_time"] == k).astype(int)
#4b) Build the RHS event terms string consistent with the created columns
sh_event_terms = " + ".join(
[f"sh_event_m{abs(k)}" for k in range(window_min, 0) if k != -1] +
["sh_event_0"] +
[f"sh_event_p{k}" for k in range(1, window_max + 1)]
)
print("Shifted placebo terms:", sh_event_terms)
print("Shifted placebo sample size:", len(shift_treated))
print("Shifted placebo hospitals:", shift_treated["FAC_NO"].nunique())
#Check for missing shifted (there should be none)
missing = [t.strip() for t in sh_event_terms.split("+") if t.strip() not in shift_treated.columns]
print("Missing shifted dummy columns (should be empty):", missing)
Shifted placebo terms: sh_event_m3 + sh_event_m2 + sh_event_0 + sh_event_p1 + sh_event_p2 + sh_event_p3
Shifted placebo sample size: 1034
Shifted placebo hospitals: 267
Missing shifted dummy columns (should be empty): []
Below, I estimate the shifted-event placebo regression, using net income margin as the outcome and the shifted event-time indicators as regressors, along with hospital and year fixed effects and clustered standard errors at the hospital level. Because the shifted event does not correspond to the true timing of system entry, a valid causal interpretation implies that these placebo coefficients should be close to zero and display no systematic post-event pattern. Observing null and patternless estimates around the fake event supports the interpretation that the main event-study results are tied to the actual timing of system affiliation rather than to underlying trends or anticipatory dynamics.
#5) Run the shifted-event placebo regressions (clustered by hospital)
# Uses: net_income_margin ~ shifted_event_dummies + (any controls) + FE
shift_models = {}
for spec_name in spec_list:
extra = specs[spec_name] # "" for baseline, or " + ..." for controls
formula_sh = f"{outcome} ~ {sh_event_terms}{extra} + C(FAC_NO) + C(year)"
m = smf.ols(formula_sh, data=shift_treated).fit()
# Align clusters to the rows actually used by Patsy/statsmodels
used_idx = m.model.data.row_labels
groups_used = shift_treated.loc[used_idx, "FAC_NO"].to_numpy()
# Cluster-robust covariance
m_cl = m.get_robustcov_results(cov_type="cluster", groups=groups_used)
shift_models[spec_name] = m_cl
#6) Print shifted placebo event coefficients (should be 0)
param_names = m_cl.model.exog_names
coef = pd.Series(m_cl.params, index=param_names)
se = pd.Series(m_cl.bse, index=param_names)
p_values = pd.Series(m_cl.pvalues, index=param_names)
keep = [p for p in param_names if p.startswith("sh_event_")]
out_tbl = pd.DataFrame({"coef": coef[keep], "se": se[keep], "p-value": p_values[keep]})
print("\n" + "-" * 90)
print(f"PLACEBO (shift event earlier by {shift}y, treated only) | {spec_name}")
print(out_tbl.round(4))
------------------------------------------------------------------------------------------
PLACEBO (shift event earlier by 3y, treated only) | Spec1_baseline
coef se p-value
sh_event_m3 0.0000 0.0000 0.0000
sh_event_m2 0.0000 0.0000 0.0000
sh_event_0 0.0163 0.0174 0.3506
sh_event_p1 -0.0278 0.0189 0.1425
sh_event_p2 -0.0392 0.0264 0.1388
sh_event_p3 -0.0783 0.0119 0.0000
------------------------------------------------------------------------------------------
PLACEBO (shift event earlier by 3y, treated only) | Spec2_controls
coef se p-value
sh_event_m3 -0.0000 0.0000 0.0000
sh_event_m2 -0.0000 0.0000 0.0000
sh_event_0 0.0417 0.0231 0.0725
sh_event_p1 0.0104 0.0316 0.7423
sh_event_p2 0.0137 0.0434 0.7528
sh_event_p3 -0.0106 0.0367 0.7723
The shifted-event placebo reassigns the treatment date to occur three years earlier for hospitals that eventually join a system, providing a falsification test for whether the estimated dynamics are driven by the true timing of system entry. In both specifications, the coefficients at the fake treatment date $(k = 0)$ are statistically insignificant, and there is no consistent or sustained post-event pattern across subsequent years. This absence of treatment-like dynamics around the artificially shifted event supports the interpretation that the main event-study results are not driven by underlying trends or anticipatory behavior.
Two pre-period coefficients $(k = -3)$ and $(k = -2)$ are mechanically equal to zero in both specifications, reflecting a lack of data support in those bins after shifting and window restriction rather than meaningful economic effects. In the baseline specification, one post-period coefficient $(k = 3)$ appears statistically significant; however, this effect disappears once controls are included and does not form part of a coherent dynamic pattern. Overall, the shifted-event placebo reinforces the conclusion that the observed increase in net income margin at the time of system entry is tied to the actual treatment timing rather than spurious dynamics.
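The mechanical zeros can be confirmed with a simple bin-count diagnostic; the sketch below uses toy data (the real check would tabulate shift_treated["shift_event_time"] from the code above):

```python
import pandas as pd

# Toy stand-in for the shifted event-time column: after adding +3 to event_time
# and restricting to [-3, 3], the most negative bins can end up empty.
toy = pd.Series([0, 0, 1, 2, 3, 3, 3], name="shift_event_time")
counts = toy.value_counts().reindex(range(-3, 4), fill_value=0)

# Bins with zero observations produce all-zero dummies, so their coefficients
# and standard errors are degenerate in the regression.
empty_bins = counts[counts == 0].index.tolist()
print(empty_bins)  # → [-3, -2, -1]
```

Empty bins identified this way correspond exactly to the event-time indicators whose coefficients and standard errors print as zeros in the placebo tables.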
6. Final Consideration: what does this effect mean?¶
I found that hospitals joining a system increase their net income margin by an average of 2.37 percentage points. Using the confidence interval of the specification that includes full controls (0.0008, 0.0466), I can conclude the following:
- Hospitals that never received treatment have an average net margin of 0.3888 percent, while hospitals that received the treatment had a 3.9296 percent net margin before treatment. Never-treated hospitals are therefore less profitable than treated hospitals.
- Given my assumptions and robustness checks, the effect identified above shows that, upon joining a hospital system, hospitals either:
- Remain virtually the same or become slightly better off (4.0096 = 3.9296 + 0.08, using the lower bound of the confidence interval)
- Increase their margin to 6.2996 percent on average (3.9296 + 2.37), which corresponds to a 60% increase
- Increase their margin to 8.5896 percent or more (3.9296 + 4.66, using the upper bound), which corresponds to a 118% increase
balance_nontreated = balance.loc[0, (slice(None), "mean")]["net_income_margin"]
balance_treated = balance.loc[1, (slice(None), "mean")]["net_income_margin"]
spec_use = results[("net_income_margin", "Spec2_controls")]
coef = spec_use.loc["event_0", "coef"]
low_ci = spec_use.loc["event_0", "ci_lower"]
hi_ci = spec_use.loc["event_0", "ci_upper"]
print("Pre Treatment Mean (Non-Treated): ",balance_nontreated)
print("Pre Treatment Mean (Treated): ",balance_treated)
print("Average Effect of Joining a System: ",coef)
print("Lower CI Effect of Joining a System: ",low_ci)
print("Higher CI Effect of Joining a System: ",hi_ci)
#Calculate now for pre-treated
print("Profit Margin After Effect at the Average Treated Hospital: ",balance_treated+coef)
print("Profit Margin After Effect (Lower CI) at the Average Treated Hospital: ",balance_treated+low_ci)
print("Profit Margin After Effect (Higher CI) at the Average Treated Hospital: ",balance_treated+hi_ci)
Pre Treatment Mean (Non-Treated): 0.003888
Pre Treatment Mean (Treated): 0.039296
Average Effect of Joining a System: 0.023663901778246326
Lower CI Effect of Joining a System: 0.0007558140530176602
Higher CI Effect of Joining a System: 0.04657198950347499
Profit Margin After Effect at the Average Treated Hospital: 0.06296
Profit Margin After Effect (Lower CI) at the Average Treated Hospital: 0.040051
Profit Margin After Effect (Higher CI) at the Average Treated Hospital: 0.085868