Files
zephyr/statistics/primitives.go

164 lines
3.8 KiB
Go

package statistics
import (
"math"
"slices"
"strconv"
"github.com/ceticamarco/zephyr/types"
)
// Mean returns the arithmetic mean of temperatures.
// An empty (or nil) slice yields 0.
func Mean(temperatures []float64) float64 {
	count := len(temperatures)
	if count == 0 {
		return 0
	}

	// Add the values front to back, then divide by how many there are.
	total := 0.0
	for i := 0; i < count; i++ {
		total += temperatures[i]
	}

	return total / float64(count)
}
// StdDev returns the population standard deviation of temperatures: the
// square root of the average squared distance of each value from
// Mean(temperatures). An empty (or nil) slice yields 0.
func StdDev(temperatures []float64) float64 {
	n := len(temperatures)
	if n == 0 {
		return 0
	}

	avg := Mean(temperatures)

	// Accumulate the squared deviations; the divisor below is n, not n-1.
	sumSquares := 0.0
	for i := range temperatures {
		sumSquares += math.Pow(temperatures[i]-avg, 2)
	}

	return math.Sqrt(sumSquares / float64(n))
}
// Median returns the median of temperatures: the middle value of the sorted
// data for an odd count, or the average of the two middle values for an even
// count. An empty (or nil) slice yields 0.
//
// The input slice is left untouched. Sorting is done on a private copy so
// that callers which rely on the original element order (for example to map
// a position back to the record it came from) are not affected.
func Median(temperatures []float64) float64 {
	length := len(temperatures)
	if length == 0 {
		return 0
	}

	// slices.Sort works in place; sorting the argument directly would
	// reorder the caller's backing array.
	sorted := slices.Clone(temperatures)
	slices.Sort(sorted)

	mid := length / 2
	if length%2 == 0 {
		return (sorted[mid-1] + sorted[mid]) / 2
	}
	return sorted[mid]
}
// Mode returns the most frequent value in temperatures. On a multi-modal
// dataset (several values sharing the highest frequency) it always returns
// the largest of those values. An empty (or nil) slice yields 0.
//
// The input slice is not modified.
func Mode(temperatures []float64) float64 {
	if len(temperatures) == 0 {
		return 0
	}

	// Count how many times each distinct value occurs.
	frequencies := make(map[float64]int, len(temperatures))
	for _, val := range temperatures {
		frequencies[val]++
	}

	var (
		mode    float64
		maxFreq int
	)
	// Map iteration order is unspecified, so ties are resolved explicitly in
	// favour of the larger value to keep the result deterministic.
	for val, freq := range frequencies {
		if freq > maxFreq || (freq == maxFreq && val > mode) {
			mode, maxFreq = val, freq
		}
	}
	return mode
}
// RobustZScore detects statistical anomalies using the Robust Z-Score
// algorithm.
//
// The method is based on the median and the Median Absolute Deviation (MAD),
// making it more robust to anomalies than the standard z-score, which uses
// the arithmetic mean and the standard deviation.
//
// A value is considered an anomaly if its modified z-score exceeds a fixed
// threshold (4.5) and its absolute deviation from the median is at least
// another fixed parameter (8 degrees). These constants have been fine-tuned
// to work well with the weather data of a wide range of climates and to
// ignore daily temperature fluctuations while still detecting anomalies.
//
// The scaling constant Φ⁻¹(0.75) ≈ 0.6745 adjusts the MAD to be comparable to
// the standard deviation under the assumption of a normal distribution
// (i.e. 50% of values lie within ~0.6745 standard deviations of the median).
//
// Daily temperatures collected over a short time window (1/2 month) *should*
// be normally distributed. This algorithm only works under this assumption.
//
// Each returned element holds the position (Idx) of the anomalous value in
// temperatures, as ordered by the caller, together with the value itself.
// The result is nil when the input is empty or when the MAD is effectively
// zero. The input slice is not modified.
func RobustZScore(temperatures []float64) []struct {
	Idx   int
	Value float64
} {
	const (
		threshold    = 4.5    // threshold for MAD ZScore algorithms
		scale        = 0.6745 // Φ⁻¹(3/4) ≈ 0.6745
		minDeviation = 8.0    // outliers must deviate at least 8°C from the median
		epsilon      = 1e-10  // a MAD below this is treated as zero
	)

	// Median may sort its argument in place. Hand it a copy so that the
	// indices reported below refer to the caller's ordering and the caller's
	// slice is never reordered.
	med := Median(slices.Clone(temperatures))

	absDevs := make([]float64, len(temperatures))
	for idx, val := range temperatures {
		absDevs[idx] = math.Abs(val - med)
	}

	// absDevs is scratch data that is not read again, so it is fine if
	// Median reorders it.
	madAbsDev := Median(absDevs)
	if madAbsDev < epsilon {
		// A (near) zero MAD would blow up the division below.
		return nil
	}

	var anomalies []struct {
		Idx   int
		Value float64
	}
	for idx, val := range temperatures {
		dev := val - med
		z := scale * dev / madAbsDev
		if math.Abs(z) > threshold && math.Abs(dev) >= minDeviation {
			anomalies = append(anomalies, struct {
				Idx   int
				Value float64
			}{
				Idx:   idx,
				Value: val,
			})
		}
	}
	return anomalies
}
// DetectAnomalies runs RobustZScore over the temperatures found in weatherArr
// and returns one types.WeatherAnomaly, carrying the record's Date and the
// anomalous temperature, for every value reported as an anomaly.
//
// Each record's Temperature string is parsed with strconv.ParseFloat. Records
// whose temperature cannot be parsed are left out of the analysis instead of
// being counted as a reading of 0, which would distort the median and could
// itself be reported as an anomaly.
//
// The returned slice is never nil; it is empty when nothing is detected.
func DetectAnomalies(weatherArr []types.Weather) []types.WeatherAnomaly {
	temps := make([]float64, 0, len(weatherArr))
	// positions[i] is the index in weatherArr of the record behind temps[i],
	// needed because skipped records make the two slices diverge.
	positions := make([]int, 0, len(weatherArr))
	for idx, weather := range weatherArr {
		temp, err := strconv.ParseFloat(weather.Temperature, 64)
		if err != nil {
			// Unparsable reading: exclude it rather than treat it as 0.
			continue
		}
		temps = append(temps, temp)
		positions = append(positions, idx)
	}

	// Apply the Robust/MAD Z-Score anomaly detection algorithm
	anomalies := RobustZScore(temps)

	result := make([]types.WeatherAnomaly, 0, len(anomalies))
	for _, anomaly := range anomalies {
		// anomaly.Idx is a position in temps; translate it back to weatherArr.
		result = append(result, types.WeatherAnomaly{
			Date: weatherArr[positions[anomaly.Idx]].Date,
			Temp: anomaly.Value,
		})
	}
	return result
}