"""
SeqMaster Runtime - Statistics Service

Provides statistical analysis for step-level test data.
Calculates mean, std dev, Cp/Cpk, histograms, and Pareto analysis.
"""

import math
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Tuple
from collections import defaultdict


@dataclass
class DistributionBin:
    """One histogram bin: its value range and how many samples fell into it."""
    # Lower edge of the bin.
    bin_start: float
    # Upper edge of the bin. Equals bin_start for the single degenerate bin
    # produced when every input value is identical.
    bin_end: float
    # Number of input values assigned to this bin.
    count: int
    # count expressed as a percentage (0-100) of all values in the histogram.
    percentage: float


@dataclass
class StatisticsResult:
    """Complete statistics result for a step, as built by StatisticsCalculator.calculate."""
    # Name of the step the measurements belong to (passed through from the caller).
    step_name: str
    # Number of measurement values analysed.
    count: int
    # Arithmetic mean of the values.
    mean: float
    # Median of the values (average of the two middle values for an even count).
    median: float
    # Sample standard deviation (n - 1 denominator); 0.0 for fewer than two values.
    std_dev: float
    # Smallest and largest value, and their difference (max - min).
    min_value: float
    max_value: float
    range_value: float
    # Process capability indices computed against the current limits.
    # Either may be infinite when the standard deviation is zero.
    cp: float
    cpk: float
    # Text label derived from cpk ("Perfect", "Excellent", "Good",
    # "Acceptable", "Marginal" or "Poor").
    cpk_interpretation: str
    # Limits proposed from the data: mean -/+ SIGMA_MULTIPLIER * std_dev.
    suggested_lower_limit: float
    suggested_upper_limit: float
    # Spec limits supplied by the caller; None when a limit was not given.
    current_lower_limit: Optional[float]
    current_upper_limit: Optional[float]
    # Histogram of the values; defaults to an empty list.
    distribution: List[DistributionBin] = field(default_factory=list)


@dataclass
class ParetoItem:
    """A single row of a Pareto analysis, as built by ParetoCalculator.calculate."""
    # Step the failures are attributed to ('Unknown' when the input row had no name).
    step_name: str
    # Optional grouping label copied from the input row; None when absent.
    group_name: Optional[str]
    # Number of failures recorded for this row.
    failure_count: int
    # failure_count as a percentage (0-100) of the failures summed over ALL
    # input rows, not only the rows that made it into the returned top list.
    failure_percentage: float
    # Running sum of failure_percentage over this row and all rows ranked above it.
    cumulative_percentage: float
    # Optional tester identifier copied from the input row; None when absent.
    tester_id: Optional[str] = None


class StatisticsCalculator:
    """
    Calculate statistics, process capability, and histograms for step data.

    Based on industry-standard ±3σ methodology. Every method is a class or
    static method; no instance state is used.
    """

    SIGMA_MULTIPLIER = 3.0  # For ±3σ limits
    CPK_TARGET = 1.33  # Industry standard Cpk target

    @classmethod
    def calculate(
        cls,
        values: List[float],
        current_lower: Optional[float] = None,
        current_upper: Optional[float] = None,
        step_name: str = "",
        bin_count: int = 20
    ) -> Optional[StatisticsResult]:
        """
        Calculate complete statistics for a list of values.

        Args:
            values: List of numeric measurement values
            current_lower: Current lower spec limit (None if not defined)
            current_upper: Current upper spec limit (None if not defined)
            step_name: Name of the step, copied into the result
            bin_count: Number of histogram bins (values below 1 are treated as 1)

        Returns:
            StatisticsResult with all calculated metrics, or None if no data
        """
        if not values:
            return None

        n = len(values)

        # Basic statistics
        min_val = min(values)
        max_val = max(values)
        mean = sum(values) / n
        std_dev = cls._calculate_std_dev(values, mean)

        # Process capability is evaluated against the limits currently in
        # force, not against the suggested ones computed below.
        cpk = cls._calculate_cpk(current_lower, current_upper, mean, std_dev)

        # Suggested limits are mean ± SIGMA_MULTIPLIER·σ.
        spread = cls.SIGMA_MULTIPLIER * std_dev

        return StatisticsResult(
            step_name=step_name,
            count=n,
            mean=mean,
            median=cls._calculate_median(values),
            std_dev=std_dev,
            min_value=min_val,
            max_value=max_val,
            range_value=max_val - min_val,
            cp=cls._calculate_cp(current_lower, current_upper, std_dev),
            cpk=cpk,
            cpk_interpretation=cls.get_cpk_interpretation(cpk),
            suggested_lower_limit=mean - spread,
            suggested_upper_limit=mean + spread,
            current_lower_limit=current_lower,
            current_upper_limit=current_upper,
            distribution=cls._create_distribution(values, bin_count)
        )

    @staticmethod
    def _calculate_median(values: List[float]) -> float:
        """Calculate median of values (mean of the two middle values when the count is even)."""
        sorted_values = sorted(values)
        n = len(sorted_values)
        middle = n // 2

        if n % 2 == 0:
            return (sorted_values[middle - 1] + sorted_values[middle]) / 2.0
        return sorted_values[middle]

    @staticmethod
    def _calculate_std_dev(values: List[float], mean: float) -> float:
        """
        Calculate sample standard deviation (n - 1 denominator).

        Returns 0.0 when fewer than two values are given, because the sample
        deviation is undefined for a single observation.
        """
        if len(values) < 2:
            return 0.0

        sum_of_squares = sum((v - mean) ** 2 for v in values)
        return math.sqrt(sum_of_squares / (len(values) - 1))

    @staticmethod
    def _calculate_cp(lsl: Optional[float], usl: Optional[float], std_dev: float) -> float:
        """
        Calculate Cp (Process Capability).

        Cp = (USL - LSL) / (6σ)
        Measures how much of the specification range the process uses.

        Degenerate cases:
        - std_dev == 0: returns +inf (regardless of the limits)
        - either limit missing (and std_dev != 0): returns 0.0
        """
        if std_dev == 0:
            return float('inf')
        if lsl is None or usl is None:
            return 0.0

        return (usl - lsl) / (6 * std_dev)

    @staticmethod
    def _calculate_cpk(
        lsl: Optional[float],
        usl: Optional[float],
        mean: float,
        std_dev: float
    ) -> float:
        """
        Calculate Cpk (Process Capability Index).

        Cpk = min(CPU, CPL) where:
        - CPU = (USL - mean) / (3σ)
        - CPL = (mean - LSL) / (3σ)

        Only the limits that are provided take part in the minimum.

        Degenerate cases:
        - no limits and std_dev != 0: returns 0.0
        - no limits and std_dev == 0: returns +inf (legacy behaviour, kept
          for backward compatibility)
        - std_dev == 0 with limits: the sign of the tightest margin decides:
          +inf when the mean is strictly inside every given limit, -inf when
          it violates one, and 0.0 when it sits exactly on a limit. A
          zero-spread process that is out of spec must not be reported as
          perfectly capable.
        """
        # Distance from the mean to each provided limit; negative = out of spec.
        margins = []
        if lsl is not None:
            margins.append(mean - lsl)
        if usl is not None:
            margins.append(usl - mean)

        if std_dev == 0:
            if not margins:
                return float('inf')
            tightest = min(margins)
            if tightest > 0:
                return float('inf')
            if tightest < 0:
                return float('-inf')
            return 0.0

        if not margins:
            return 0.0

        return min(margin / (3 * std_dev) for margin in margins)

    @staticmethod
    def _create_distribution(values: List[float], bin_count: int) -> List[DistributionBin]:
        """
        Create histogram distribution bins.

        The range [min, max] is split into ``bin_count`` equal-width bins.
        Bins are half-open on the right, except the last one, which also
        takes the maximum value.

        Args:
            values: Values to bin
            bin_count: Number of bins; values below 1 are treated as 1 so
                that a bad request cannot cause a division by zero

        Returns:
            One DistributionBin per bin in ascending order; a single
            zero-width bin when all values are equal; [] when there are no
            values.
        """
        if not values:
            return []

        min_val = min(values)
        range_val = max(values) - min_val
        n = len(values)

        if range_val == 0:
            # All values are the same
            return [DistributionBin(
                bin_start=min_val,
                bin_end=min_val,
                count=n,
                percentage=100.0
            )]

        bin_count = max(1, bin_count)
        bin_width = range_val / bin_count
        last_bin = bin_count - 1
        counts = [0] * bin_count

        for value in values:
            # The maximum value computes to index == bin_count; clamp it
            # into the last bin.
            counts[min(int((value - min_val) / bin_width), last_bin)] += 1

        return [
            DistributionBin(
                bin_start=min_val + (i * bin_width),
                bin_end=min_val + ((i + 1) * bin_width),
                count=count,
                percentage=(count / n) * 100
            )
            for i, count in enumerate(counts)
        ]

    @staticmethod
    def get_cpk_interpretation(cpk: float) -> str:
        """
        Get human-readable interpretation of Cpk value.

        Returns "Perfect" for +inf, then "Excellent" (>= 2.0), "Good"
        (>= 1.33), "Acceptable" (>= 1.0), "Marginal" (>= 0.67), and "Poor"
        for everything else (including negative values, -inf and NaN).
        """
        if cpk == float('inf'):
            return "Perfect"
        elif cpk >= 2.0:
            return "Excellent"
        elif cpk >= 1.33:
            return "Good"
        elif cpk >= 1.0:
            return "Acceptable"
        elif cpk >= 0.67:
            return "Marginal"
        else:
            return "Poor"


class ParetoCalculator:
    """
    Pareto analysis over failure counts.

    Ranks failure rows by count and annotates each with its share of all
    failures and the running cumulative share.
    """

    @classmethod
    def calculate(
        cls,
        failure_data: List[Dict],
        top_count: int = 10
    ) -> List[ParetoItem]:
        """
        Build the ranked Pareto rows for the given failure data.

        Args:
            failure_data: Dicts carrying 'failure_count' (treated as 0 when
                missing) and 'step_name' ('Unknown' when missing), plus
                optional 'group_name' and 'tester_id'
            top_count: Maximum number of rows to return

        Returns:
            ParetoItems ordered by failure count, highest first. Percentages
            are relative to the failures of ALL input rows, so the cumulative
            value of the last returned row can stay below 100 when rows were
            cut off. Empty list when there is no data or no failures at all.
        """
        if not failure_data:
            return []

        def count_of(row):
            return row.get('failure_count', 0)

        # Highest count first; rows with equal counts keep their input order.
        ranked = list(failure_data)
        ranked.sort(key=count_of, reverse=True)

        grand_total = sum(count_of(row) for row in ranked)
        if grand_total == 0:
            return []

        pareto = []
        running = 0
        for row in ranked[:top_count]:
            hits = count_of(row)
            running += hits
            pareto.append(ParetoItem(
                step_name=row.get('step_name', 'Unknown'),
                group_name=row.get('group_name'),
                failure_count=hits,
                failure_percentage=(hits / grand_total) * 100,
                cumulative_percentage=(running / grand_total) * 100,
                tester_id=row.get('tester_id')
            ))

        return pareto

    @staticmethod
    def get_threshold_index(items: List[ParetoItem], threshold: int = 80) -> int:
        """
        Locate the row at which the cumulative percentage reaches a threshold.

        Args:
            items: ParetoItems in ranked order
            threshold: Percentage threshold (e.g., 80 for 80/20 rule)

        Returns:
            Index of the first item whose cumulative_percentage is at least
            ``threshold``. If no item reaches it, the index of the last item.
            -1 only when ``items`` is empty.
        """
        if not items:
            return -1

        crossing = (
            position
            for position, entry in enumerate(items)
            if entry.cumulative_percentage >= threshold
        )
        return next(crossing, len(items) - 1)
