import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os

# Set style for better visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Read the CSV file
df = pd.read_csv('nitrous_comps_april_may_2026_ORDERS.csv')

# Data cleaning
# Remove rows where Order ID is NaN (if any)
df = df.dropna(subset=['Order ID'])

# Convert Date to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Clean Amount column (remove £ symbol and convert to float)
df['Amount (£)'] = df['Amount (£)'].astype(str).str.replace('£', '').astype(float)

# Create output directory for visualizations
os.makedirs('marketing_report_visuals', exist_ok=True)

# 1. Traffic Source Analysis
traffic_source_counts = df['Traffic Source'].value_counts()
plt.figure(figsize=(12, 8))
traffic_source_counts.head(10).plot(kind='bar')
plt.title('Top 10 Traffic Sources', fontsize=16, fontweight='bold')
plt.xlabel('Traffic Source', fontsize=12)
plt.ylabel('Number of Orders', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('marketing_report_visuals/traffic_sources.png', dpi=300, bbox_inches='tight')
plt.close()

# 2. Order Status Distribution
status_counts = df['Status'].value_counts()
plt.figure(figsize=(10, 8))
plt.pie(status_counts.values, labels=status_counts.index, autopct='%1.1f%%', startangle=90)
plt.title('Order Status Distribution', fontsize=16, fontweight='bold')
plt.axis('equal')
plt.tight_layout()
plt.savefig('marketing_report_visuals/order_status.png', dpi=300, bbox_inches='tight')
plt.close()

# 3. Payment Method Analysis
payment_counts = df['Payment Method'].value_counts()
plt.figure(figsize=(12, 8))
payment_counts.head(10).plot(kind='bar')
plt.title('Top 10 Payment Methods', fontsize=16, fontweight='bold')
plt.xlabel('Payment Method', fontsize=12)
plt.ylabel('Number of Orders', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('marketing_report_visuals/payment_methods.png', dpi=300, bbox_inches='tight')
plt.close()

# 4. Daily Revenue Trend
daily_revenue = df.groupby('Date')['Amount (£)'].sum().reset_index()
plt.figure(figsize=(14, 8))
plt.plot(daily_revenue['Date'], daily_revenue['Amount (£)'], marker='o', linewidth=2, markersize=4)
plt.title('Daily Revenue Trend', fontsize=16, fontweight='bold')
plt.xlabel('Date', fontsize=12)
plt.ylabel('Revenue (£)', fontsize=12)
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('marketing_report_visuals/daily_revenue.png', dpi=300, bbox_inches='tight')
plt.close()

# 5. Revenue by Traffic Source
revenue_by_source = df.groupby('Traffic Source')['Amount (£)'].sum().sort_values(ascending=False)
plt.figure(figsize=(12, 8))
revenue_by_source.head(10).plot(kind='bar')
plt.title('Top 10 Traffic Sources by Revenue', fontsize=16, fontweight='bold')
plt.xlabel('Traffic Source', fontsize=12)
plt.ylabel('Revenue (£)', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('marketing_report_visuals/revenue_by_source.png', dpi=300, bbox_inches='tight')
plt.close()

# 6. Completed Orders Analysis
completed_df = df[df['Status'] == 'Completed']
completed_revenue_by_source = completed_df.groupby('Traffic Source')['Amount (£)'].sum().sort_values(ascending=False)
plt.figure(figsize=(12, 8))
completed_revenue_by_source.head(10).plot(kind='bar', color='green')
plt.title('Top 10 Traffic Sources by Completed Order Revenue', fontsize=16, fontweight='bold')
plt.xlabel('Traffic Source', fontsize=12)
plt.ylabel('Revenue (£) from Completed Orders', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('marketing_report_visuals/completed_revenue_by_source.png', dpi=300, bbox_inches='tight')
plt.close()

# 7. Geographic Distribution - Top Billing Cities
top_billing_cities = df['Billing City'].value_counts().head(10)
plt.figure(figsize=(12, 8))
top_billing_cities.plot(kind='bar')
plt.title('Top 10 Billing Cities by Order Count', fontsize=16, fontweight='bold')
plt.xlabel('Billing City', fontsize=12)
plt.ylabel('Number of Orders', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('marketing_report_visuals/top_billing_cities.png', dpi=300, bbox_inches='tight')
plt.close()

# Generate summary statistics
total_orders = len(df)
completed_orders = len(df[df['Status'] == 'Completed'])
cancelled_orders = len(df[df['Status'] == 'Cancelled'])
draft_orders = len(df[df['Status'] == 'Checkout-Draft'])
total_revenue = df['Amount (£)'].sum()
completed_revenue = completed_df['Amount (£)'].sum()
avg_order_value = df['Amount (£)'].mean()
completed_avg_order_value = completed_df['Amount (£)'].mean()

# Create marketing report
report_content = f"""
# Marketing Report Analysis
## Nitrous Competitions - April-May 2026

### Executive Summary
- **Total Orders**: {total_orders:,}
- **Completed Orders**: {completed_orders:,} ({completed_orders/total_orders*100:.1f}%)
- **Cancelled Orders**: {cancelled_orders:,} ({cancelled_orders/total_orders*100:.1f}%)
- **Checkout Drafts**: {draft_orders:,} ({draft_orders/total_orders*100:.1f}%)
- **Total Revenue**: £{total_revenue:,.2f}
- **Completed Revenue**: £{completed_revenue:,.2f}
- **Average Order Value**: £{avg_order_value:.2f}
- **Average Completed Order Value**: £{completed_avg_order_value:.2f}

### Key Insights

#### Traffic Source Performance
1. **Top Traffic Sources by Volume**:
   {chr(10).join([f"   - {source}: {count:,} orders" for source, count in traffic_source_counts.head(5).items()])}

2. **Top Traffic Sources by Revenue**:
   {chr(10).join([f"   - {source}: £{revenue:,.2f}" for source, revenue in revenue_by_source.head(5).items()])}

#### Conversion Analysis
- **Overall Conversion Rate**: {completed_orders/total_orders*100:.1f}%
- **Revenue Conversion Efficiency**: {(completed_revenue/total_revenue)*100:.1f}% of total potential revenue realized

#### Payment Method Preferences
1. **Most Popular Payment Methods**:
   {chr(10).join([f"   - {method}: {count:,} orders" for method, count in payment_counts.head(5).items()])}

#### Geographic Insights
- **Top Billing Cities**: {', '.join(top_billing_cities.index[:5].tolist())}

### Recommendations

1. **Focus on High-Performing Channels**: Allocate more budget to top-performing traffic sources identified in the analysis
2. **Optimize Checkout Process**: Address the {draft_orders/total_orders*100:.1f}% of orders that remain in checkout draft status
3. **Payment Method Optimization**: Ensure seamless integration for top payment methods to reduce friction
4. **Geographic Targeting**: Consider targeted campaigns for top-performing regions
5. **A/B Testing**: Test different approaches for traffic sources with high volume but lower conversion rates

### Visualizations Generated
The following visualizations have been created in the 'marketing_report_visuals' directory:
1. Traffic Sources Distribution
2. Order Status Breakdown
3. Payment Method Preferences
4. Daily Revenue Trends
5. Revenue by Traffic Source
6. Completed Orders Revenue by Source
7. Top Billing Cities

### Data Quality Notes
- Data covers period: {df['Date'].min().strftime('%Y-%m-%d')} to {df['Date'].max().strftime('%Y-%m-%d')}
- Missing values handled appropriately
- Currency values cleaned and converted to numeric format
"""

# Save the report
with open('marketing_report.md', 'w') as f:
    f.write(report_content)

print("Marketing analysis complete!")
print(f"Report saved as 'marketing_report.md'")
print(f"Visualizations saved in 'marketing_report_visuals/' directory")