(ns metabase.search.appdb.specialization.postgres (:require [clojure.string :as str] [metabase.search.appdb.specialization.api :as specialization] [metabase.util :as u] [toucan2.core :as t2])) | |
(def ^:private tsv-language "english") | |
(defmethod specialization/table-schema :postgres [base-schema] (into [[:id :bigint [:primary-key] [:raw "GENERATED BY DEFAULT AS IDENTITY"]] [:search_vector :tsvector :not-null] [:with_native_query_vector :tsvector :not-null]] base-schema)) | |
TODO I strongly suspect that there are more indexes that would help performance, we should examine EXPLAIN. Another idea to try, is using a tsvector for all the non-range filter fields. | (defmethod specialization/post-create-statements :postgres [prefix table-name] (mapv (fn [template] (format template prefix table-name)) ["CREATE UNIQUE INDEX IF NOT EXISTS %s_identity_idx ON %s (model, model_id)" "CREATE INDEX IF NOT EXISTS %s_tsvector_idx ON %s USING gin (search_vector)" "CREATE INDEX IF NOT EXISTS %s_native_tsvector_idx ON %s USING gin (with_native_query_vector)" ;; Spam all the indexes for now, let's see if they get used on Stats / Ephemeral envs. "CREATE INDEX IF NOT EXISTS %s_model_archived_idx ON %s (model, archived)" "CREATE INDEX IF NOT EXISTS %s_archived_idx ON %s (archived)"])) |
(defmethod specialization/batch-upsert! :postgres [table entries] (when (seq entries) (t2/query ;; The cost of dynamically calculating these keys should be small compared to the IO cost, so unoptimized. (let [update-keys (vec (disj (set (keys (first entries))) :id :model :model_id)) excluded-kw (fn [column] (keyword (str "excluded." (name column))))] {:insert-into table :values entries :on-conflict [:model :model_id] :do-update-set (zipmap update-keys (map excluded-kw update-keys))})))) | |
(defn- quote* [s] (str "'" (str/replace s "'" "''") "'")) | |
(defn- process-phrase [word-or-phrase] ;; a phrase is quoted even if the closing quotation mark has not been typed yet (cond ;; trailing quotation mark (= word-or-phrase "\) nil ;; quoted phrases must be matched sequentially (str/starts-with? word-or-phrase "\) (as-> word-or-phrase <> ;; remove the quote mark(s) (str/replace <> #"^\"|\"$" ) (str/trim <>) (str/split <> #"\s+") (map quote* <>) (str/join " <-> " <>)) ;; negation (str/starts-with? word-or-phrase "-") (str "!" (quote* (subs word-or-phrase 1))) ;; just a regular word :else (quote* word-or-phrase))) | |
Break up the words in the search input, preserving quoted and partially quoted segments. | (defn- split-preserving-quotes [s] (re-seq #"\"[^\"]*(?:\"|$)|[^\s\"]+|\s+" (u/lower-case-en s))) |
(defn- process-clause [words-and-phrases] (->> words-and-phrases (remove #{"and"}) (map process-phrase) (remove str/blank?) (str/join " & "))) | |
Add wildcards at the end of the final word, so that we match ts completions. | (defn- complete-last-word [expression] (str/replace expression #"(\S+)(?=\s*$)" "$1:*")) |
Given the user input, construct a query in the Postgres tsvector query language. | (defn- to-tsquery-expr [input] (str (when input (let [trimmed (str/trim input) complete? (not (str/ends-with? trimmed "\"")) ;; TODO also only complete if the :context is appropriate maybe-complete (if complete? complete-last-word identity)] (->> (split-preserving-quotes trimmed) (remove str/blank?) (partition-by #{"or"}) (remove #(= (first %) "or")) (map process-clause) (str/join " | ") maybe-complete))))) |
(defmethod specialization/base-query :postgres [active-table search-term search-ctx select-items] {:select select-items :from [[active-table :search_index]] ;; Using a join allows us to share the query expression between our SELECT and WHERE clauses. :join [[[:raw "to_tsquery('" tsv-language "', " [:lift (to-tsquery-expr search-term)] ")"] :query] [:= 1 1]] :where (if (str/blank? search-term) [:= [:inline 1] [:inline 1]] [:raw (str (if (:search-native-query search-ctx) "with_native_query_vector" "search_vector") " @@ query")])}) | |
(defn- weighted-tsvector [weight text] [:setweight [:to_tsvector [:inline tsv-language] [:cast text :text]] [:inline weight]]) | |
(defmethod specialization/extra-entry-fields :postgres [entity] {:search_vector [:|| (weighted-tsvector "A" (:name entity)) (weighted-tsvector "B" (:searchable_text entity ""))] :with_native_query_vector [:|| (weighted-tsvector "A" (:name entity)) (weighted-tsvector "B" (str/join " " (keep entity [:searchable_text :native_query])))]}) | |
See https://www.postgresql.org/docs/current/textsearch-controls.html#TEXTSEARCH-RANKING 0 (the default) ignores the document length 1 divides the rank by 1 + the logarithm of the document length 2 divides the rank by the document length 4 divides the rank by the mean harmonic distance between extents (this is implemented only by tsrankcd) 8 divides the rank by the number of unique words in document 16 divides the rank by 1 + the logarithm of the number of unique words in document 32 divides the rank by itself + 1 | (def ^:private ts-rank-normalization 0) |
(defmethod specialization/text-score :postgres [] [:ts_rank :search_vector :query [:inline ts-rank-normalization]]) | |
(defmethod specialization/view-count-percentile-query :postgres [index-table p-value] (let [expr [:raw "percentile_cont(" [:lift p-value] ") WITHIN GROUP (ORDER BY view_count)"]] {:select [:search_index.model [expr :vcp]] :from [[index-table :search_index]] :group-by [:search_index.model] :having [:is-not expr nil]})) | |