Middleware for automatically bucketing unbucketed date and datetime Fields (and expressions) in breakouts and filters by `:day`. | (ns metabase.query-processor.middleware.auto-bucket-datetimes (:require [medley.core :as m] [metabase.legacy-mbql.util :as mbql.u] [metabase.lib.core :as lib] [metabase.lib.metadata :as lib.metadata] [metabase.lib.schema :as lib.schema] [metabase.lib.schema.common :as lib.schema.common] [metabase.lib.schema.id :as lib.schema.id] [metabase.lib.util.match :as lib.util.match] [metabase.lib.walk :as lib.walk] [metabase.util.log :as log] [metabase.util.malli :as mu] [metabase.util.malli.registry :as mr])) |
;; Schema for the type information tracked per column while auto-bucketing: a map with required `:base-type` and
;; `:effective-type` keys and an optional `:semantic-type` key. Every value is allowed to be nil.
(mr/def ::column-type-info
[:map
[:base-type [:maybe ::lib.schema.common/base-type]]
[:effective-type [:maybe ::lib.schema.common/base-type]]
[:semantic-type {:optional true} [:maybe ::lib.schema.common/semantic-or-relation-type]]]) | |
;; Schema for a lookup map of column identifier -> `::column-type-info`. A key is either a non-blank string (a column
;; name) or a Field ID; a value may be nil.
(mr/def ::column-id-or-name->type-info [:map-of [:or ::lib.schema.common/non-blank-string ::lib.schema.id/field] [:maybe ::column-type-info]]) | |
;; Unfortunately these Fields won't be in the Store yet, since Field resolution can't happen before we add the
;; implicit clauses. (NOTE(review): the original sentence is cut off after "implicit" -- presumably it refers to the
;; implicit `:fields` clause; confirm.)
;;
;; TODO - What we could do though is fetch all the stuff we need for the Store and then save these Fields in the
;; Store, which would save a bit of time when we do resolve them.
(mu/defn- unbucketed-fields->field-id->type-info :- [:maybe ::column-id-or-name->type-info]
  "Build a map of Field ID (or field name) -> type information for the `:field` clauses in `unbucketed-fields`.

  * Clauses that refer to a column by name (a string) get their type info from the clause options.
  * Clauses that refer to a column by integer ID get their type info from column metadata fetched in bulk via
    `metadata-providerable`. If that fetch throws, the error is logged and those Fields are simply left out.

  Returns an empty map when `unbucketed-fields` is empty."
  [metadata-providerable unbucketed-fields :- [:maybe [:sequential :mbql.clause/field]]]
  (let [;; field name -> type info taken straight from the clause options
        name->type-info (into {}
                              (keep (fn [[_tag opts id-or-name]]
                                      (when (string? id-or-name)
                                        [id-or-name {:base-type      (:base-type opts)
                                                     :effective-type (or (:effective-type opts)
                                                                         (:base-type opts))}])))
                              unbucketed-fields)
        ;; the last element of each clause is the ID or name; keep only the integer IDs
        field-ids       (not-empty (into #{} (filter integer?) (map peek unbucketed-fields)))
        ;; field ID -> type info taken from the column metadata
        id->type-info   (when field-ids
                          (let [columns (try
                                          (lib.metadata/bulk-metadata-or-throw metadata-providerable
                                                                               :metadata/column
                                                                               field-ids)
                                          ;; don't fail if some of the Fields are invalid.
                                          (catch Throwable e
                                            (log/errorf e "Error fetching Fields: %s" (ex-message e))
                                            nil))]
                            (into {}
                                  (map (fn [{id :id, :as column}]
                                         [id (select-keys column [:base-type :effective-type :semantic-type])]))
                                  columns)))]
    (merge name->type-info id->type-info)))
(defn- yyyy-MM-dd-date-string?
  "Is `x` a string made up of exactly four digits, a dash, two digits, a dash, and two digits (e.g. `2024-01-31`)?
  Returns the matched string when it is, `nil` for any other string, and `false` when `x` is not a string at all."
  [x]
  (if (string? x)
    (re-matches #"^\d{4}-\d{2}-\d{2}$" x)
    false))
(defn- auto-bucketable-value?
  "Is `v` a value that a field may be auto-bucketed against, i.e. either a `yyyy-MM-dd` date string or a
  `:relative-datetime` clause?"
  [v]
  (if-let [date-string (yyyy-MM-dd-date-string? v)]
    date-string
    (mbql.u/is-clause? :relative-datetime v)))
(mu/defn- filter-clause?
  "Is `x` an MBQL clause whose calculated expression type derives from `:type/Boolean`? The type is calculated with
  `lib/type-of` in the context of the stage at `stage-path`; if that calculation throws, the error is logged and `x`
  is not considered a filter clause."
  [query      :- ::lib.schema/query
   stage-path :- ::lib.walk/stage-path
   x]
  (letfn [(expression-type-or-nil []
            (try
              (lib.walk/apply-f-for-stage-at-path lib/type-of query stage-path x)
              (catch Throwable e
                (log/errorf e "Error calculating expression type: %s" (ex-message e))
                nil)))]
    ;; only bother calculating the type for things that are actually MBQL clauses
    (and (mbql.u/mbql-clause? x)
         (when-let [computed-type (expression-type-or-nil)]
           (isa? computed-type :type/Boolean)))))
(mu/defn- simple-filter-clause?
  "Is `x` a filter clause (see [[filter-clause?]]) that is not one of the compound clauses `:and`, `:or`, or `:not`?"
  [query      :- ::lib.schema/query
   stage-path :- ::lib.walk/stage-path
   x]
  (let [filter? (filter-clause? query stage-path x)]
    (if filter?
      (not (mbql.u/is-clause? #{:and :or :not} x))
      filter?)))
;; Schema for the value returned by `should-not-be-autobucketed?`: a qualified keyword whose namespace is
;; `do-not-bucket-reason`, e.g. `:do-not-bucket-reason/field-with-bucketing-or-binning`.
(mr/def ::do-not-bucket-reason
  [:and
   qualified-keyword?
   [:fn
    {:error/message "do-not-bucket-reason keyword"}
    (fn [reason]
      (= "do-not-bucket-reason" (namespace reason)))]])
;; This returns a keyword corresponding to why we're not autobucketing, for debugging/testing purposes.
(mu/defn- should-not-be-autobucketed? :- [:maybe ::do-not-bucket-reason]
  "Decide whether `x` is a clause (or a clause that contains a clause) that auto-bucketing must leave alone. Returns a
  `:do-not-bucket-reason/...` keyword naming the reason when `x` must be skipped, otherwise nil."
  [query      :- ::lib.schema/query
   stage-path :- ::lib.walk/stage-path
   x]
  (letfn [(simple-filter-reason []
            ;; only called for non-compound filter clauses; do not autobucket inside one that either:
            (cond
              ;; * is not an equality or comparison filter. e.g. it wouldn't make sense to bucket a field and then
              ;;   check whether it is `NOT NULL`
              (not (mbql.u/is-clause? #{:= :!= :< :> :<= :>= :between} x))
              :do-not-bucket-reason/not-equality-or-comparison-filter

              ;; * has arguments (everything after the tag, options, and first arg) that aren't all auto-bucketable
              ;;   values. The only reason we auto-bucket datetime clauses in the first place is for legacy reasons;
              ;;   if someone is specifying additional info like hour/minute then we shouldn't assume they want to
              ;;   bucket by day
              (not-every? auto-bucketable-value? (drop 3 x))
              :do-not-bucket-reason/not-all-values-are-auto-bucketable))]
    (if (simple-filter-clause? query stage-path x)
      ;; a simple filter that passes both checks above yields nil here; the remaining checks are NOT tried for it
      (simple-filter-reason)
      (cond
        ;; do not autobucket clauses that are updating the time interval, i.e. adding/subtracting a
        ;; minute/hour/second interval to/from a field or expression
        (lib.util.match/match-one x
          [(_tag :guard #{:+ :-})
           _
           [(_ :guard #{:expression :field}) _ _]
           [:interval _ _n (unit :guard #{:minute :hour :second})]])
        :do-not-bucket-reason/bucket-between-relative-starting-from

        ;; do not auto-bucket clauses inside a `:time-interval` filter: it already supplies its own unit
        ;; do not auto-bucket clauses inside a `:datetime-diff` clause: the precise timestamp is needed for the
        ;; difference
        (mbql.u/is-clause? #{:time-interval :datetime-diff} x)
        :do-not-bucket-reason/bucketed-or-precise-operation

        ;; do not autobucket field/expression refs that already have a temporal unit, or have a binning strategy
        (and (mbql.u/is-clause? #{:expression :field} x)
             (let [clause-opts (second x)]
               (or (:temporal-unit clause-opts)
                   (:binning clause-opts))))
        :do-not-bucket-reason/field-with-bucketing-or-binning

        ;; do not autobucket anything whose options carry an `:inherited-temporal-unit`
        (and (vector? x)
             (:inherited-temporal-unit (get x 1)))
        :do-not-bucket-reason/bucketed-in-previous-stages))))
(mu/defn- date-or-datetime-clause?
  "Does either the base type or the effective type in this type info derive from `:type/Date` or `:type/DateTime`?
  Returns `true` if so, otherwise nil."
  [{:keys [base-type effective-type]} :- ::column-type-info]
  (some true?
        (for [column-type   [base-type effective-type]
              temporal-type [:type/Date :type/DateTime]]
          (isa? column-type temporal-type))))
(mu/defn- wrap-unbucketed-clauses :- ::lib.schema/stage
  "Add `:temporal-unit` `:day` to `:field`s and `:expression`s in the `:breakout` and `:filters` of `stage` if
  appropriate. For fields, look at the corresponding type information in `field-id->type-info` to see if we should do
  so; for expressions, examine the type information in the clause options.

  Clauses for which [[should-not-be-autobucketed?]] returns a reason are left untouched. Returns the updated stage."
  [query               :- ::lib.schema/query
   stage-path          :- ::lib.walk/stage-path
   stage               :- ::lib.schema/stage
   field-id->type-info :- [:maybe ::column-id-or-name->type-info]]
  (letfn [(datetime-but-not-time? [field-id]
            ;; look the Field up with `get` rather than invoking the map: the schema allows `field-id->type-info` to
            ;; be nil, and calling nil as a function would throw.
            (some-> (get field-id->type-info field-id) date-or-datetime-clause?))
          ;; Builds a `::column-type-info`-shaped map from expression ref options, mirroring the type extraction
          ;; logic in [[unbucketed-fields->field-id->type-info]] so the result can be passed to
          ;; [[date-or-datetime-clause?]]. (`letfn` has no schema-annotation syntax, so none is attached here.)
          (expression-opts->type-info [{:keys [base-type effective-type]}]
            {:base-type      base-type
             :effective-type (or effective-type base-type)})
          (wrap-clauses [x]
            (lib.util.match/replace x
              ;; don't replace anything that's already bucketed or otherwise is not subject to autobucketing
              (_ :guard (partial should-not-be-autobucketed? query stage-path))
              &match

              ;; if it's a `:field` clause and `field-id->type-info` tells us it's a date or datetime, then go ahead
              ;; and bucket it by day
              [:field opts (id-or-name :guard datetime-but-not-time?)]
              [:field (assoc opts :temporal-unit :day) id-or-name]

              ;; likewise for an `:expression` ref whose options say it's a date or datetime
              [:expression (opts :guard (comp date-or-datetime-clause? expression-opts->type-info)) name']
              [:expression (assoc opts :temporal-unit :day) name']))
          ;; we only want to wrap clauses in `:breakout` and `:filters`, so this helper rewrites one top-level key of
          ;; the stage (if present) and is called once for each
          (rewrite-clause [stage clause-to-rewrite]
            (m/update-existing stage clause-to-rewrite wrap-clauses))]
    (-> stage
        (rewrite-clause :breakout)
        (rewrite-clause :filters))))
(mu/defn- auto-bucket-datetimes-this-stage :- ::lib.schema/stage
  "Auto-bucket the breakouts and filters of a single `stage`: collect the `:field` and `:expression` clauses that are
  candidates for bucketing, look up type info for the `:field` ones, and wrap whichever clauses qualify. Returns
  `stage` unchanged when there are no candidates."
  [query      :- ::lib.schema/query
   stage-path :- ::lib.walk/stage-path
   {breakouts :breakout, :keys [filters], :as stage} :- ::lib.schema/stage]
  (let [;; find any `:field` or `:expression` clauses in the breakouts or filters, skipping anything that
        ;; `should-not-be-autobucketed?` rules out
        candidates (lib.util.match/match (cons filters breakouts)
                     (_clause :guard (partial should-not-be-autobucketed? query stage-path)) nil
                     :expression &match
                     :field &match)]
    (if-not candidates
      ;; nothing to bucket: return the stage as-is
      stage
      ;; otherwise fetch type info for the Fields referred to by the candidate `:field` clauses...
      (let [field-clauses (filter #(= :field (first %)) candidates)
            type-info     (unbucketed-fields->field-id->type-info query field-clauses)]
        ;; ...and then update each breakout/filter by wrapping it if appropriate
        (wrap-unbucketed-clauses query stage-path stage type-info)))))
(mu/defn auto-bucket-datetimes :- ::lib.schema/query
  "Middleware that automatically adds `:temporal-unit` `:day` to breakout and filter `:field` clauses if the Field
  they refer to has a type that derives from `:type/Temporal` (but not `:type/Time`). (This exists for historic
  reasons: before datetime bucketing was added to MBQL, datetime Fields defaulted to breaking out by day. We might
  want to revisit this behavior in the future.)

  Applies to any unbucketed Field in a breakout, or Fields in a filter clause being compared against `yyyy-MM-dd`
  format datetime strings. Stages with neither filters nor breakouts are skipped."
  [query :- ::lib.schema/query]
  (letfn [(bucket-stage [query' stage-path stage]
            ;; returns nil for stages that have nothing to bucket
            (when (some seq [(:filters stage) (:breakout stage)])
              (auto-bucket-datetimes-this-stage query' stage-path stage)))]
    (lib.walk/walk-stages query bucket-stage)))