(ns metabase.search.appdb.specialization.postgres
  (:require
   [clojure.string :as str]
   [metabase.search.appdb.specialization.api :as specialization]
   [metabase.util :as u]
   [toucan2.core :as t2]))
(def ^:private tsv-language "english")
(defmethod specialization/table-schema :postgres [base-schema]
  (into [[:id :bigint [:primary-key] [:raw "GENERATED BY DEFAULT AS IDENTITY"]]
         [:search_vector :tsvector :not-null]
         [:with_native_query_vector :tsvector :not-null]]
        base-schema))

TODO I strongly suspect that there are more indexes that would help performance, we should examine EXPLAIN. Another idea to try, is using a tsvector for all the non-range filter fields.

(defmethod specialization/post-create-statements :postgres [prefix table-name]
  (mapv
   (fn [template] (format template prefix table-name))
   ["CREATE UNIQUE INDEX IF NOT EXISTS %s_identity_idx ON %s (model, model_id)"
    "CREATE INDEX IF NOT EXISTS %s_tsvector_idx ON %s USING gin (search_vector)"
    "CREATE INDEX IF NOT EXISTS %s_native_tsvector_idx ON %s USING gin (with_native_query_vector)"
    ;; Spam all the indexes for now, let's see if they get used on Stats / Ephemeral envs.
    "CREATE INDEX IF NOT EXISTS %s_model_archived_idx ON %s (model, archived)"
    "CREATE INDEX IF NOT EXISTS %s_archived_idx ON %s (archived)"]))
(defmethod specialization/batch-upsert! :postgres [table entries]
  (when (seq entries)
    (t2/query
     ;; The cost of dynamically calculating these keys should be small compared to the IO cost, so unoptimized.
     (let [update-keys (vec (disj (set (keys (first entries))) :id :model :model_id))
           excluded-kw (fn [column] (keyword (str "excluded." (name column))))]
       {:insert-into   table
        :values        entries
        :on-conflict   [:model :model_id]
        :do-update-set (zipmap update-keys (map excluded-kw update-keys))}))))
(defn- quote* [s]
  (str "'" (str/replace s "'" "''") "'"))
(defn- process-phrase [word-or-phrase]
  ;; a phrase is quoted even if the closing quotation mark has not been typed yet
  (cond
    ;; trailing quotation mark
    (= word-or-phrase "\) nil
    ;; quoted phrases must be matched sequentially
    (str/starts-with? word-or-phrase "\)
    (as-> word-or-phrase <>
      ;; remove the quote mark(s)
      (str/replace <> #"^\"|\"$" )
      (str/trim <>)
      (str/split <> #"\s+")
      (map quote* <>)
      (str/join " <-> " <>))
    ;; negation
    (str/starts-with? word-or-phrase "-")
    (str "!" (quote* (subs word-or-phrase 1)))
    ;; just a regular word
    :else
    (quote* word-or-phrase)))

Break up the words in the search input, preserving quoted and partially quoted segments.

(defn- split-preserving-quotes
  [s]
  (re-seq #"\"[^\"]*(?:\"|$)|[^\s\"]+|\s+" (u/lower-case-en s)))
(defn- process-clause [words-and-phrases]
  (->> words-and-phrases
       (remove #{"and"})
       (map process-phrase)
       (remove str/blank?)
       (str/join " & ")))

Add wildcards at the end of the final word, so that we match ts completions.

(defn- complete-last-word
  [expression]
  (str/replace expression #"(\S+)(?=\s*$)" "$1:*"))

Given the user input, construct a query in the Postgres tsvector query language.

(defn- to-tsquery-expr
  [input]
  (str
   (when input
     (let [trimmed        (str/trim input)
           complete?      (not (str/ends-with? trimmed "\""))
           ;; TODO also only complete if the :context is appropriate
           maybe-complete (if complete? complete-last-word identity)]
       (->> (split-preserving-quotes trimmed)
            (remove str/blank?)
            (partition-by #{"or"})
            (remove #(= (first %) "or"))
            (map process-clause)
            (str/join " | ")
            maybe-complete)))))
(defmethod specialization/base-query :postgres
  [active-table search-term search-ctx select-items]
  {:select select-items
   :from   [[active-table :search_index]]
   ;; Using a join allows us to share the query expression between our SELECT and WHERE clauses.
   :join   [[[:raw "to_tsquery('"
              tsv-language "', "
              [:lift (to-tsquery-expr search-term)] ")"]
             :query] [:= 1 1]]
   :where  (if (str/blank? search-term)
             [:= [:inline 1] [:inline 1]]
             [:raw
              (str
               (if (:search-native-query search-ctx)
                 "with_native_query_vector"
                 "search_vector")
               " @@ query")])})
(defn- weighted-tsvector [weight text]
  [:setweight [:to_tsvector [:inline tsv-language] [:cast text :text]] [:inline weight]])
(defmethod specialization/extra-entry-fields :postgres [entity]
  {:search_vector
   [:||
    (weighted-tsvector "A" (:name entity))
    (weighted-tsvector "B" (:searchable_text entity ""))]

   :with_native_query_vector
   [:||
    (weighted-tsvector "A" (:name entity))
    (weighted-tsvector "B" (str/join " " (keep entity [:searchable_text :native_query])))]})

See https://www.postgresql.org/docs/current/textsearch-controls.html#TEXTSEARCH-RANKING 0 (the default) ignores the document length 1 divides the rank by 1 + the logarithm of the document length 2 divides the rank by the document length 4 divides the rank by the mean harmonic distance between extents (this is implemented only by tsrankcd) 8 divides the rank by the number of unique words in document 16 divides the rank by 1 + the logarithm of the number of unique words in document 32 divides the rank by itself + 1

(def ^:private ts-rank-normalization 0)
(defmethod specialization/text-score :postgres
  []
  [:ts_rank :search_vector :query [:inline ts-rank-normalization]])
(defmethod specialization/view-count-percentile-query :postgres
  [index-table p-value]
  (let [expr [:raw "percentile_cont(" [:lift p-value] ") WITHIN GROUP (ORDER BY view_count)"]]
    {:select   [:search_index.model [expr :vcp]]
     :from     [[index-table :search_index]]
     :group-by [:search_index.model]
     :having   [:is-not expr nil]}))