Statistical Analysis
This page demonstrates using UniDist.jl for common statistical analysis tasks.
Hypothesis Testing
One-Sample Z-Test
using UniDist
# Test if sample mean differs from hypothesized value
# H₀: μ = 100, H₁: μ ≠ 100
sample_mean = 103.5
sample_size = 50
known_sd = 15
hypothesized_mean = 100
# Calculate z-statistic
se = known_sd / sqrt(sample_size)
z = (sample_mean - hypothesized_mean) / se
# P-value (two-tailed)
standard_normal = Normal(0, 1)
p_value = 2 * (1 - cdf(standard_normal, abs(z)))
println("Z-statistic: $(round(z, digits=3))")
println("P-value: $(round(p_value, digits=4))")
if p_value < 0.05
println("Reject H₀ at α = 0.05")
else
println("Fail to reject H₀ at α = 0.05")
endChi-Square Goodness of Fit
using UniDist
# Test if observed frequencies match expected
observed = [45, 35, 20] # Observed counts
expected = [40, 40, 20] # Expected counts
# Chi-square statistic
chi_sq = sum((o - e)^2 / e for (o, e) in zip(observed, expected))
# Degrees of freedom = categories - 1
df = length(observed) - 1
# P-value
chi_dist = ChiSquare(df)
p_value = 1 - cdf(chi_dist, chi_sq)
println("χ² statistic: $(round(chi_sq, digits=3))")
println("Degrees of freedom: $df")
println("P-value: $(round(p_value, digits=4))")F-Test for Variance Ratio
using UniDist
# Test if two populations have equal variances
var1, n1 = 25.0, 30 # Sample 1: variance, size
var2, n2 = 16.0, 25 # Sample 2: variance, size
# F-statistic (larger variance in numerator)
f_stat = var1 / var2
# Degrees of freedom
df1 = n1 - 1
df2 = n2 - 1
# P-value (two-tailed)
f_dist = F(df1, df2)
p_value = 2 * min(cdf(f_dist, f_stat), 1 - cdf(f_dist, f_stat))
println("F-statistic: $(round(f_stat, digits=3))")
println("P-value: $(round(p_value, digits=4))")Confidence Intervals
CI for Population Mean (Known Variance)
using UniDist
sample_mean = 75.3
known_sd = 10.0
n = 40
confidence = 0.95
# Z critical value
z_crit = quantile(Normal(0, 1), 1 - (1 - confidence) / 2)
# Margin of error
se = known_sd / sqrt(n)
margin = z_crit * se
# Confidence interval
ci_lower = sample_mean - margin
ci_upper = sample_mean + margin
println("$(Int(confidence*100))% CI: [$(round(ci_lower, digits=2)), $(round(ci_upper, digits=2))]")CI for Population Proportion
using UniDist
# Survey: 120 out of 400 prefer option A
successes = 120
n = 400
confidence = 0.95
# Sample proportion
p_hat = successes / n
# Standard error
se = sqrt(p_hat * (1 - p_hat) / n)
# Z critical value
z_crit = quantile(Normal(0, 1), 1 - (1 - confidence) / 2)
# Confidence interval
ci_lower = p_hat - z_crit * se
ci_upper = p_hat + z_crit * se
println("Sample proportion: $(round(p_hat, digits=3))")
println("$(Int(confidence*100))% CI: [$(round(ci_lower, digits=3)), $(round(ci_upper, digits=3))]")Power Analysis
Sample Size for Desired Power
using UniDist
# Detect effect size d = 0.5 with 80% power at α = 0.05
effect_size = 0.5
alpha = 0.05
power = 0.80
# Critical values
z_alpha = quantile(Normal(0, 1), 1 - alpha/2) # Two-tailed
z_beta = quantile(Normal(0, 1), power)
# Required sample size per group (two-sample t-test approximation)
n = 2 * ((z_alpha + z_beta) / effect_size)^2
println("Required sample size per group: $(ceil(Int, n))")Power for Given Sample Size
using UniDist
# Calculate power for n=50 per group, effect size=0.4, α=0.05
n = 50
effect_size = 0.4
alpha = 0.05
# Non-centrality parameter
ncp = effect_size * sqrt(n / 2)
# Critical value under null
z_crit = quantile(Normal(0, 1), 1 - alpha/2)
# Power = P(reject H₀ | H₁ true)
# Under alternative, test statistic ~ N(ncp, 1)
alt_dist = Normal(ncp, 1)
power = 1 - cdf(alt_dist, z_crit) + cdf(alt_dist, -z_crit)
println("Power: $(round(power * 100, digits=1))%")Distribution Fitting Assessment
Q-Q Plot Data
using UniDist
# Generate theoretical quantiles for Q-Q plot
data = [2.3, 3.1, 3.5, 4.2, 4.8, 5.1, 5.9, 6.3, 7.1, 8.2]
n = length(data)
# Sort data
sorted_data = sort(data)
# Theoretical quantiles (standard normal)
theoretical_dist = Normal(0, 1)
probabilities = [(i - 0.5) / n for i in 1:n]
theoretical_quantiles = quantile(theoretical_dist, probabilities)
println("Data Quantile\tTheoretical Quantile")
for (d, t) in zip(sorted_data, theoretical_quantiles)
println("$(round(d, digits=2))\t\t$(round(t, digits=2))")
endProbability Plot Correlation
using UniDist
# Check if data follows exponential distribution
data = [0.5, 0.8, 1.2, 1.5, 2.1, 2.8, 3.5, 4.2, 5.1, 7.3]
n = length(data)
# Fit exponential: estimate rate from data
mean_data = sum(data) / n
fitted_dist = Exponential(mean_data)
# Theoretical quantiles
sorted_data = sort(data)
probs = [(i - 0.5) / n for i in 1:n]
theoretical = quantile(fitted_dist, probs)
# Correlation coefficient
mean_sorted = sum(sorted_data) / n
mean_theoretical = sum(theoretical) / n
numerator = sum((sorted_data[i] - mean_sorted) * (theoretical[i] - mean_theoretical) for i in 1:n)
denom_sorted = sqrt(sum((x - mean_sorted)^2 for x in sorted_data))
denom_theoretical = sqrt(sum((x - mean_theoretical)^2 for x in theoretical))
correlation = numerator / (denom_sorted * denom_theoretical)
println("Probability plot correlation: $(round(correlation, digits=4))")
println("(Values close to 1 suggest good fit)")Risk Analysis
Value at Risk (VaR)
using UniDist
# Portfolio returns assumed to follow Normal distribution
# Mean daily return: 0.05%, Volatility: 2%
daily_return = Normal(0.0005, 0.02)
# VaR at different confidence levels
println("Value at Risk (daily):")
for conf in [0.90, 0.95, 0.99]
var = -quantile(daily_return, 1 - conf)
println("$(Int(conf*100))% VaR: $(round(var * 100, digits=2))%")
endExpected Shortfall (CVaR)
using UniDist
# For Normal distribution, ES has closed form
μ = 0.0005
σ = 0.02
returns = Normal(μ, σ)
alpha = 0.05 # 95% confidence
# VaR
var = -quantile(returns, alpha)
# Expected Shortfall for Normal: ES = μ + σ * φ(Φ⁻¹(α)) / α
# where φ is PDF and Φ⁻¹ is quantile of standard normal
z_alpha = quantile(Normal(0, 1), alpha)
es = -(μ - σ * pdf(Normal(0, 1), z_alpha) / alpha)
println("95% VaR: $(round(var * 100, digits=2))%")
println("95% ES (CVaR): $(round(es * 100, digits=2))%")Quality Control
Process Capability
using UniDist
# Process specifications
lsl = 95 # Lower specification limit
usl = 105 # Upper specification limit
# Process parameters (from sample)
process_mean = 100.2
process_sd = 1.5
process = Normal(process_mean, process_sd)
# Defect rates
p_below_lsl = cdf(process, lsl)
p_above_usl = 1 - cdf(process, usl)
total_defect_rate = p_below_lsl + p_above_usl
println("P(below LSL): $(round(p_below_lsl * 1e6, digits=1)) ppm")
println("P(above USL): $(round(p_above_usl * 1e6, digits=1)) ppm")
println("Total defect rate: $(round(total_defect_rate * 1e6, digits=1)) ppm")
# Process capability indices
cp = (usl - lsl) / (6 * process_sd)
cpk = min(usl - process_mean, process_mean - lsl) / (3 * process_sd)
println("\nCp: $(round(cp, digits=3))")
println("Cpk: $(round(cpk, digits=3))")Control Chart Limits
using UniDist
# X-bar chart for sample means
# Process: μ = 50, σ = 5, sample size n = 4
mu = 50
sigma = 5
n = 4
# Distribution of sample means
se = sigma / sqrt(n)
xbar_dist = Normal(mu, se)
# 3-sigma control limits
lcl = quantile(xbar_dist, 0.00135) # ≈ μ - 3σ/√n
ucl = quantile(xbar_dist, 0.99865) # ≈ μ + 3σ/√n
println("Center Line: $mu")
println("LCL: $(round(lcl, digits=2))")
println("UCL: $(round(ucl, digits=2))")
# Probability of false alarm (Type I error)
p_false_alarm = 2 * cdf(xbar_dist, lcl)
println("P(false alarm): $(round(p_false_alarm * 100, digits=3))%")Acceptance Sampling
Single Sampling Plan
using UniDist
# Lot size N = 1000, sample size n = 50, accept if defects ≤ c = 2
# What is the probability of accepting a lot with 5% defective?
n = 50
c = 2
p_defective = 0.05
# Number of defectives in sample follows Binomial(n, p)
defectives = Binomial(n, p_defective)
# Probability of acceptance
p_accept = cdf(defectives, c)
println("Sampling plan: n=$n, c=$c")
println("If lot is 5% defective:")
println("P(accept) = $(round(p_accept * 100, digits=1))%")
println("P(reject) = $(round((1 - p_accept) * 100, digits=1))%")Operating Characteristic Curve
using UniDist
# OC curve for sampling plan n=50, c=2
n = 50
c = 2
println("Lot defect rate\tP(accept)")
for p in [0.01, 0.02, 0.03, 0.05, 0.07, 0.10, 0.15]
defectives = Binomial(n, p)
p_accept = cdf(defectives, c)
println("$(Int(p*100))%\t\t$(round(p_accept * 100, digits=1))%")
end