|  |  | 
					
						
						|  | """ | 
					
						
						|  | Run Statistical Modeling: Linear regression, diagnostics, p-values, confidence intervals, plots | 
					
						
						|  | """ | 
					
						
						|  | import os | 
					
						
						|  | import sys | 
					
						
						|  | import glob | 
					
						
						|  | sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src')) | 
					
						
						|  | import pandas as pd | 
					
						
						|  | import numpy as np | 
					
						
						|  | import matplotlib.pyplot as plt | 
					
						
						|  | import seaborn as sns | 
					
						
						|  | from sklearn.linear_model import LinearRegression | 
					
						
						|  | from sklearn.model_selection import train_test_split | 
					
						
						|  | from sklearn.preprocessing import StandardScaler | 
					
						
						|  | import statsmodels.api as sm | 
					
						
						|  | from scipy import stats | 
					
						
						|  | from statsmodels.stats.diagnostic import het_breuschpagan | 
					
						
						|  | from statsmodels.stats.outliers_influence import variance_inflation_factor | 
					
						
						|  |  | 
					
						
						|  | def find_latest_data(): | 
					
						
						|  | data_files = glob.glob('data/processed/fred_data_*.csv') | 
					
						
						|  | if not data_files: | 
					
						
						|  | raise FileNotFoundError("No FRED data files found. Run the pipeline first.") | 
					
						
						|  | return max(data_files, key=os.path.getctime) | 
					
						
						|  |  | 
					
						
						|  | def main(): | 
					
						
						|  | print("="*60) | 
					
						
						|  | print("FRED Statistical Modeling: Linear Regression & Diagnostics") | 
					
						
						|  | print("="*60) | 
					
						
						|  | data_file = find_latest_data() | 
					
						
						|  | print(f"Using data file: {data_file}") | 
					
						
						|  | df = pd.read_csv(data_file, index_col=0, parse_dates=True) | 
					
						
						|  | df_clean = df.dropna() | 
					
						
						|  | target_var = 'GDP' | 
					
						
						|  | if target_var not in df_clean.columns: | 
					
						
						|  | print(f"Target variable '{target_var}' not found in data.") | 
					
						
						|  | return | 
					
						
						|  | feature_cols = [col for col in df_clean.columns if col != target_var] | 
					
						
						|  | X = df_clean[feature_cols] | 
					
						
						|  | y = df_clean[target_var] | 
					
						
						|  | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) | 
					
						
						|  |  | 
					
						
						|  | model = LinearRegression() | 
					
						
						|  | model.fit(X_train, y_train) | 
					
						
						|  | y_pred_train = model.predict(X_train) | 
					
						
						|  | y_pred_test = model.predict(X_test) | 
					
						
						|  |  | 
					
						
						|  | r2_train = model.score(X_train, y_train) | 
					
						
						|  | r2_test = model.score(X_test, y_test) | 
					
						
						|  | print(f"R² (Train): {r2_train:.4f} | R² (Test): {r2_test:.4f}") | 
					
						
						|  |  | 
					
						
						|  | print("\nCoefficients:") | 
					
						
						|  | for feature, coef in zip(feature_cols, model.coef_): | 
					
						
						|  | print(f"  {feature}: {coef:.4f}") | 
					
						
						|  | print(f"  Intercept: {model.intercept_:.4f}") | 
					
						
						|  |  | 
					
						
						|  | X_with_const = sm.add_constant(X_train) | 
					
						
						|  | model_sm = sm.OLS(y_train, X_with_const).fit() | 
					
						
						|  | print("\nStatistical Significance:") | 
					
						
						|  | print(model_sm.summary().tables[1]) | 
					
						
						|  |  | 
					
						
						|  | with open('data/exports/regression_summary.txt', 'w') as f: | 
					
						
						|  | f.write(str(model_sm.summary())) | 
					
						
						|  |  | 
					
						
						|  | residuals = y_train - y_pred_train | 
					
						
						|  |  | 
					
						
						|  | _, p_value_norm = stats.normaltest(residuals) | 
					
						
						|  | print(f"Normality test (p-value): {p_value_norm:.4f}") | 
					
						
						|  |  | 
					
						
						|  | vif_data = [] | 
					
						
						|  | for i in range(X_train.shape[1]): | 
					
						
						|  | try: | 
					
						
						|  | vif = variance_inflation_factor(X_train.values, i) | 
					
						
						|  | vif_data.append(vif) | 
					
						
						|  | except: | 
					
						
						|  | vif_data.append(np.nan) | 
					
						
						|  | print("\nVariance Inflation Factors:") | 
					
						
						|  | for feature, vif in zip(feature_cols, vif_data): | 
					
						
						|  | print(f"  {feature}: {vif:.3f}") | 
					
						
						|  |  | 
					
						
						|  | try: | 
					
						
						|  | _, p_value_het = het_breuschpagan(residuals, X_with_const) | 
					
						
						|  | print(f"Homoscedasticity test (p-value): {p_value_het:.4f}") | 
					
						
						|  | except: | 
					
						
						|  | print("Homoscedasticity test failed") | 
					
						
						|  |  | 
					
						
						|  | plt.figure(figsize=(12,4)) | 
					
						
						|  | plt.subplot(1,2,1) | 
					
						
						|  | plt.scatter(y_pred_train, residuals, alpha=0.5) | 
					
						
						|  | plt.axhline(0, color='red', linestyle='--') | 
					
						
						|  | plt.xlabel('Fitted Values') | 
					
						
						|  | plt.ylabel('Residuals') | 
					
						
						|  | plt.title('Residuals vs Fitted') | 
					
						
						|  | plt.subplot(1,2,2) | 
					
						
						|  | stats.probplot(residuals, dist="norm", plot=plt) | 
					
						
						|  | plt.title('Normal Q-Q') | 
					
						
						|  | plt.tight_layout() | 
					
						
						|  | plt.savefig('data/exports/regression_diagnostics.png', dpi=200) | 
					
						
						|  | plt.close() | 
					
						
						|  | print("\nStatistical modeling complete. Outputs saved to data/exports/.") | 
					
						
						|  |  | 
					
						
						|  | if __name__ == "__main__": | 
					
						
						|  | main() |