Deeper statistical analysis of results. | (ns metabase.analyze.fingerprint.insights (:require [java-time.api :as t] [kixi.stats.core :as stats] [kixi.stats.math :as math] [medley.core :as m] [metabase.analyze.fingerprint.fingerprinters :as fingerprinters] [metabase.legacy-mbql.util :as mbql.u] [metabase.models.interface :as mi] [metabase.sync.util :as sync-util] [metabase.util :as u] [metabase.util.date-2 :as u.date] [redux.core :as redux]) (:import (java.time Instant LocalDate LocalDateTime LocalTime OffsetDateTime OffsetTime ZonedDateTime) (java.util Random))) |
;; Ask the compiler to warn whenever a Java interop call below cannot be resolved statically and would fall
;; back to reflection.
(set! *warn-on-reflection* true)
(defn- last-2
  "Returns a reducing function that remembers the two most recent inputs.

  The accumulator is a two-slot object array pre-filled with a private sentinel. Completion returns
  `[second-to-last last]`, with nil standing in for any slot that was never filled."
  []
  (let [unset        (Object.)
        ;; Translate the private sentinel into nil; real values (including nil) pass through untouched.
        value-or-nil (fn [v] (when-not (identical? v unset) v))]
    (fn
      ([]
       (object-array [unset unset]))
      ([^objects window]
       [(value-or-nil (aget window 0))
        (value-or-nil (aget window 1))])
      ([^objects window item]
       ;; Shift the newest value into the "previous" slot, then store the incoming one.
       (aset window 0 (aget window 1))
       (aset window 1 item)
       window))))
(defn change
  "Relative difference between `x1` (the earlier value) and `x2` (the later value).

  Returns nil when either argument is nil or when `x1` is zero. Otherwise both values are coerced to doubles
  and the result is a double; the sign conventions for negative inputs are spelled out per branch below."
  [x2 x1]
  (when (and x1 x2 (not (zero? x1)))
    (let [current  (double x2)
          previous (double x1)
          ;; Plain relative difference of `to` with respect to `from`.
          relative (fn [to from] (/ (- to from) from))]
      (cond
        ;; Both negative: flip the signs and swap the roles of the two values.
        (and (neg? previous) (neg? current))
        (relative (- previous) (- current))

        ;; Negative -> positive: measured against the new (positive) value.
        (and (neg? previous) (pos? current))
        (- (relative previous current))

        ;; Negative -> anything that is neither negative nor positive (e.g. zero).
        (neg? previous)
        (- (relative current (- previous)))

        :else
        (relative current previous)))))
(defn- reservoir-sample
  "Returns a reducing function that keeps a random sample of at most `n` inputs, storing `(f x)` for every
  sampled input `x` (reservoir sampling, https://en.wikipedia.org/wiki/Reservoir_sampling).

  The random number generator is seeded with `n`, so reducing the same inputs in the same order always
  yields the same sample. Completion returns the sample as a vector; when fewer than `n` inputs were seen the
  vector holds just those inputs, in arrival order.

  All mutable state (RNG, counter, reservoir) is created by the init arity, so the returned function can be
  used for any number of independent reductions. Keeping that state in the enclosing closure instead would
  let a second reduction continue from the first one's counter and RNG position and overwrite the array
  backing the vector the first reduction returned (`vec` may alias small Java arrays)."
  [n f]
  (let [n (int n)]
    (fn
      ([]
       {:rng       (Random. n)
        :counter   (int-array 1) ;; A box for a mutable primitive int.
        :reservoir (object-array n)})
      ([state]
       (let [^ints    counter   (:counter state)
             ^objects reservoir (:reservoir state)
             seen               (aget counter 0)]
         (vec (if (< seen n)
                ;; Reservoir is only partially filled: drop the unused tail.
                (java.util.Arrays/copyOfRange reservoir 0 seen)
                reservoir))))
      ([state x]
       (let [^Random  rng       (:rng state)
             ^ints    counter   (:counter state)
             ^objects reservoir (:reservoir state)
             c                  (aget counter 0)
             c+1                (inc c)
             ;; Drawn on every step, even while the reservoir is still filling up, so the RNG sequence only
             ;; depends on how many inputs have been seen.
             idx                (.nextInt rng c+1)]
         (aset counter 0 c+1)
         (cond
           ;; Still filling: take the input unconditionally.
           (< c n)   (aset reservoir c (f x))
           ;; Full: replace a random slot with probability n / (c + 1).
           (< idx n) (aset reservoir idx (f x)))
         state)))))
(defn- simple-linear-regression
  "Returns a single-pass reducing function that fits the straight line `y = offset + slope * x` by least
  squares.

  `fx` and `fy` extract x and y from each input; inputs for which either is nil are skipped. When `x-scale`
  or `y-scale` is `:log`, the corresponding value is replaced by its natural logarithm before fitting; any
  other scale leaves the value untouched.

  The accumulator is a `double-array` with six slots:
    0 count, 1 mean of x, 2 mean of y, 3 sum of squares of x, 4 sum of squares of y, 5 sum of co-products.
  Completion returns `[offset slope]`, or nil when the sum of squares of x is zero. Slot 4 is maintained but
  not read by the completion step."
  [fx fy x-scale y-scale]
  (let [log-x? (identical? x-scale :log)
        log-y? (identical? y-scale :log)]
    (fn
      ([]
       (double-array 6))
      ([^doubles state]
       (let [mean-x (aget state 1)
             mean-y (aget state 2)
             ss-x   (aget state 3)
             ss-xy  (aget state 5)]
         (when-not (zero? ss-x)
           (let [slope (/ ss-xy ss-x)]
             [(- mean-y (* mean-x slope)) slope]))))
      ([^doubles state row]
       (let [raw-x (fx row)
             raw-y (fy row)]
         (if (and (some? raw-x) (some? raw-y))
           (let [x       (if log-x? (Math/log (double raw-x)) (double raw-x))
                 y       (if log-y? (Math/log (double raw-y)) (double raw-y))
                 n       (aget state 0)
                 mean-x  (aget state 1)
                 mean-y  (aget state 2)
                 n'      (inc n)
                 ;; Incremental mean update; the sums below use both the old and the new mean.
                 mean-x' (+ mean-x (/ (- x mean-x) n'))
                 mean-y' (+ mean-y (/ (- y mean-y) n'))]
             (aset state 0 n')
             (aset state 1 mean-x')
             (aset state 2 mean-y')
             (aset state 3 (+ (aget state 3) (* (- x mean-x') (- x mean-x))))
             (aset state 4 (+ (aget state 4) (* (- y mean-y') (- y mean-y))))
             (aset state 5 (+ (aget state 5) (* (- x mean-x') (- y mean-y))))
             state)
           state))))))
(defn mae
  "Given two functions, `(fŷ input)` and `(fy input)`, returning the predicted and actual values of y
  respectively, builds a reducing function that calculates the mean absolute error of the estimate.
  https://en.wikipedia.org/wiki/Mean_absolute_error

  Falsy inputs contribute nil instead of an error term."
  [fy-hat fy]
  (let [absolute-error (fn [x]
                         (when x
                           (math/abs (- (fy x) (fy-hat x)))))
        xform          (map absolute-error)]
    (xform stats/mean)))
(def ^:private trendline-function-families
  "Curve families that `best-fit` tries. Each entry holds:

    :x-scale / :y-scale  `:linear` or `:log` -- the transform `simple-linear-regression` applies to x / y
                         before fitting a straight line;
    :model               `(fn [offset slope])` returning a function that predicts y for a given x;
    :formula             `(fn [offset slope])` returning the same curve as an expression vector over `:x`."
  ;; http://mathworld.wolfram.com/LeastSquaresFitting.html
  [{:x-scale :linear
    :y-scale :linear
    :model   (fn [offset slope]
               (fn [x]
                 (+ offset (* slope x))))
    :formula (fn [offset slope]
               [:+ offset [:* slope :x]])}
   ;; http://mathworld.wolfram.com/LeastSquaresFittingExponential.html
   {:x-scale :linear
    :y-scale :log
    :model   (fn [offset slope]
               (fn [x]
                 (* (math/exp offset) (math/exp (* slope x)))))
    :formula (fn [offset slope]
               [:* (math/exp offset) [:exp [:* slope :x]]])}
   ;; http://mathworld.wolfram.com/LeastSquaresFittingLogarithmic.html
   {:x-scale :log
    ;; `:linear`, like every other untransformed axis in this table (it used to say `:identity`, which
    ;; `simple-linear-regression` treated the same way since it only special-cases `:log`).
    :y-scale :linear
    :model   (fn [offset slope]
               (fn [x]
                 (+ offset (* slope (Math/log x)))))
    :formula (fn [offset slope]
               [:+ offset [:* slope [:log :x]]])}
   ;; http://mathworld.wolfram.com/LeastSquaresFittingPowerLaw.html
   {:x-scale :log
    :y-scale :log
    :model   (fn [offset slope]
               (fn [x]
                 (* (math/exp offset) (math/pow x slope))))
    :formula (fn [offset slope]
               [:* (math/exp offset) [:pow :x slope]])}])
(def ^:private ^:const ^Long validation-set-size
  "Maximum number of `[x y]` pairs `best-fit` samples for scoring the candidate curves."
  20)
(defn- best-fit
  "Fit curves from `trendline-function-families` and return the `:formula` of the one with the smallest mean
  absolute error, or nil when no family yields a usable fit.

  To stay single-pass, a validation set of up to `validation-set-size` `[x y]` pairs is collected with
  `reservoir-sample` while the regressions run; each fitted model is then scored against that set with
  `mae`. `fx` and `fy` extract x and y from a row."
  [fx fy]
  (let [;; One regression per curve family; completes to {:model ... :formula ...} or nil.
        family-rf     (fn [{:keys [x-scale y-scale formula model]}]
                        (redux/post-complete
                         (simple-linear-regression fx fy x-scale y-scale)
                         (fn [[offset slope]]
                           (when (and (u/real-number? offset)
                                      (u/real-number? slope))
                             {:model   (model offset slope)
                              :formula (formula offset slope)}))))
        complete-row? (fn [row] (and (fx row) (fy row)))
        row->point    (fn [row] [(fx row) (fy row)])]
    (redux/post-complete
     (fingerprinters/robust-fuse
      {:fits           (redux/juxt* (map family-rf trendline-function-families))
       :validation-set ((filter complete-row?)
                        (reservoir-sample validation-set-size row->point))})
     (fn [{:keys [validation-set fits]}]
       (when (some? fits)
         (let [;; Score a fit on the sampled points: prediction from x (first) vs. actual y (second).
               score      (fn [fit]
                            (assoc fit :mae (transduce identity
                                                       (mae (comp (:model fit) first) second)
                                                       validation-set)))
               candidates (->> fits
                               (remove nil?)
                               (map score)
                               (filter (comp u/real-number? :mae)))]
           (when (seq candidates)
             (:formula (apply min-key :mae candidates)))))))))
(defn- timeseries?
  "True when the grouped columns contain at least one number column and exactly one datetime column."
  [{:keys [numbers datetimes]}]
  (if (zero? (count numbers))
    false
    (= 1 (count datetimes))))
;; We downsize UNIX timestamps to lessen the chance of overflows and numerical instabilities.
(def ^:private ^:const ^Double ms-in-a-day
  "Number of milliseconds in one day, as a double."
  (* 24 60 60 1000.0))

(defn- ms->day
  "Converts a duration or timestamp `dt` in milliseconds into (fractional) days."
  [dt]
  (/ dt ms-in-a-day))
(defn- about=
  "True when `a` is within 10% of `b`, i.e. the ratio `a / b` lies strictly between 0.9 and 1.1."
  [a b]
  (let [ratio (/ a b)]
    (and (< 0.9 ratio)
         (< ratio 1.1))))
(def ^:private unit->duration
  "Approximate length of each supported datetime unit, in days.

  Built with `array-map` to make the shortest-to-longest key order explicit: `infer-unit` walks `keys` of
  this map and returns the first unit that matches."
  (array-map
   :minute  1/1440
   :hour    1/24
   :day     1
   :week    7
   :month   30.5
   :quarter (* 30.4 3)
   :year    365.1))
(defn- valid-period?
  "True when the distance between `from` and `to` (both in days) is within about 10% of the length of one
  `unit`, as listed in `unit->duration`.

  Returns nil when any argument is nil or when `unit` has no entry in `unit->duration` (for example
  `:second`); without that guard the comparison would divide by nil and throw."
  [from to unit]
  (when (and from to unit)
    (when-let [duration (get unit->duration unit)]
      ;; Make sure we work for both ascending and descending time series
      (let [[from to] (sort [from to])]
        (about= (- to from) duration)))))
(defn- infer-unit
  "Returns the first unit in `unit->duration` whose length matches the distance between `from` and `to`
  (see `valid-period?`), or nil when none does."
  [from to]
  (some (fn [unit]
          (when (valid-period? from to unit)
            unit))
        (keys unit->duration)))
(defn- ->millis-from-epoch
  "Converts a java.time value `t` to milliseconds since the epoch.

  `Instant`, `OffsetDateTime` and `ZonedDateTime` are converted directly. The remaining supported types are
  first widened to an offset date-time and then converted:
    - `LocalDate`     is paired with `(t/local-time 0)` at zone offset 0;
    - `LocalDateTime` is placed at zone offset 0;
    - `LocalTime`     is placed on 1970-01-01 at zone offset 0;
    - `OffsetTime`    is placed on 1970-01-01, keeping its own offset.
  Any other type throws an `ex-info`."
  [t]
  (condp instance? t
    Instant        (.toEpochMilli ^Instant t)
    OffsetDateTime (.toEpochMilli (.toInstant ^OffsetDateTime t))
    ZonedDateTime  (.toEpochMilli (.toInstant ^ZonedDateTime t))
    LocalDate      (recur (t/offset-date-time t (t/local-time 0) (t/zone-offset 0)))
    LocalDateTime  (recur (t/offset-date-time t (t/zone-offset 0)))
    LocalTime      (recur (t/offset-date-time (t/local-date "1970-01-01") t (t/zone-offset 0)))
    OffsetTime     (recur (t/offset-date-time (t/local-date "1970-01-01") t (t/zone-offset t)))
    (throw (ex-info (str "->millis-from-epoch: unsupported type " (class t)) {}))))
(defn- timeseries-insight
  "Builds the reducing function that computes insights for a timeseries: one datetime column (the x axis)
  and one or more number columns.

  Every row is first turned into a vector whose datetime cell is replaced by days since the epoch. For each
  number column, rows whose value is not a real number are dropped, and the remaining rows feed four
  reductions side by side: the last two y values, the last two x values, a linear regression, and
  `best-fit`. The result holds one map per number column with the keys `:last-value`, `:previous-value`,
  `:last-change`, `:slope`, `:offset`, `:best-fit`, `:col` and `:unit`. `:previous-value` and `:last-change`
  are only filled in when the last two x values are about one `:unit` apart."
  [{:keys [numbers datetimes]}]
  (let [datetime      (first datetimes)
        x-position    (:position datetime)
        xfn           (fn [row] (nth row x-position))
        ;; Convert string datetimes or Instants into days-from-epoch early.
        ->days        (fn [v]
                        (some-> v
                                fingerprinters/->temporal
                                ->millis-from-epoch
                                ms->day))
        normalize-row (fn [row]
                        (update (vec row) x-position ->days))
        ;; Use the column's own unit when it has a specific one, otherwise guess from the last two x values.
        resolve-unit  (fn [x-previous x-current]
                        (let [unit (some-> datetime :unit mbql.u/normalize-token)]
                          (if (or (nil? unit)
                                  (= unit :default))
                            (infer-unit x-previous x-current)
                            unit)))
        column-rf     (fn [number-col]
                        (let [y-position (:position number-col)
                              yfn        (fn [row] (nth row y-position))]
                          (redux/post-complete
                           ((filter (comp u/real-number? yfn))
                            (redux/juxt ((map yfn) (last-2))
                                        ((map xfn) (last-2))
                                        (simple-linear-regression xfn yfn :linear :linear)
                                        (best-fit xfn yfn)))
                           (fn [[[y-previous y-current]
                                 [x-previous x-current]
                                 [offset slope]
                                 best-fit-equation]]
                             (let [unit         (resolve-unit x-previous x-current)
                                   show-change? (valid-period? x-previous x-current unit)]
                               (fingerprinters/robust-map
                                :last-value     y-current
                                :previous-value (when show-change?
                                                  y-previous)
                                :last-change    (when show-change?
                                                  (change y-current y-previous))
                                :slope          slope
                                :offset         offset
                                :best-fit       best-fit-equation
                                :col            (:name number-col)
                                :unit           unit))))))]
    (fingerprinters/with-error-handling
      ((map normalize-row)
       (redux/juxt* (map column-rf numbers)))
      (format "Error generating timeseries insight keyed by: %s"
              (sync-util/name-for-logging (mi/instance :model/Field datetime))))))
(defn insights
  "Based on the shape of returned data construct a transducer to statistically analyze data.

  Each column in `cols` is tagged with its index as `:position` and sorted into `:datetimes`, `:numbers` or
  `:others`. When the result looks like a timeseries (see `timeseries?`) the timeseries insight is returned;
  otherwise a fingerprinter that always yields nil."
  [cols]
  (let [classify     (fn [{base-type      :base_type
                           effective-type :effective_type
                           semantic-type  :semantic_type
                           unit           :unit}]
                       (cond
                         (isa? semantic-type :Relation/*)                    :others
                         (= unit :year)                                      :datetimes
                         (u.date/extract-units unit)                         :numbers
                         (isa? (or effective-type base-type) :type/Temporal) :datetimes
                         (isa? base-type :type/Number)                       :numbers
                         :else                                               :others))
        positioned   (map-indexed (fn [idx col] (assoc col :position idx)) cols)
        cols-by-type (group-by classify positioned)]
    (if (timeseries? cols-by-type)
      (timeseries-insight cols-by-type)
      (fingerprinters/constant-fingerprinter nil))))