Macaw

0.0.1-SNAPSHOT


A Clojure wrapper for JSqlParser 🦜




(this space intentionally left almost blank)
 
(ns macaw.core
  (:require
   [macaw.rewrite :as rewrite]
   [macaw.util :as u]
   [macaw.walk :as mw])
  (:import
   (com.metabase.macaw AstWalker$QueueItem)
   (net.sf.jsqlparser.expression Alias)
   (net.sf.jsqlparser.parser CCJSqlParserUtil)
   (net.sf.jsqlparser.schema Column Table)
   (net.sf.jsqlparser.statement Statement)
   (net.sf.jsqlparser.statement.select AllTableColumns)))
(set! *warn-on-reflection* true)
(defn- conj-to
  ([key-name]
   (conj-to key-name identity))
  ([key-name xf]
   (fn item-conjer [results component context]
     (update results key-name conj {:component (xf component)
                                    :context   (mapv
                                                 (fn [^AstWalker$QueueItem x]
                                                   [(keyword (.getKey x)) (.getValue x)])
                                                 context)}))))
(defn- query->raw-components
  [^Statement parsed-query]
  (mw/fold-query parsed-query
                 {:column         (conj-to :columns)
                  :mutation       (conj-to :mutation-commands)
                  :wildcard       (conj-to :has-wildcard? (constantly true))
                  :table          (conj-to :tables)
                  :table-wildcard (conj-to :table-wildcards)}
                 {:columns           #{}
                  :has-wildcard?     #{}
                  :mutation-commands #{}
                  :tables            #{}
                  :table-wildcards   #{}}))

tables

(defn- make-table [^Table t _ctx]
  (merge
    {:table (.getName t)}
    (when-let [s (.getSchemaName t)]
      {:schema s})))
(defn- alias-mapping
  [^Table table ctx]
  (when-let [^Alias table-alias (.getAlias table)]
    [(.getName table-alias) (make-table table ctx)]))

JSQLParser can't tell whether the f in select f.* refers to a real table or an alias. Therefore, we have to disambiguate them based on our own map of aliases->table names. So this function will return the real name of the table referenced in a table-wildcard (as far as can be determined from the query).

(defn- resolve-table-name
  [{:keys [alias->table name->table]} ^AllTableColumns atc _ctx]
  (let [table-name (-> atc .getTable .getName)]
    (or (alias->table table-name)
        (name->table table-name))))

columns

(defn- maybe-column-alias [[maybe-alias :as _ctx]]
  (when (= (first maybe-alias) :alias)
    {:alias (second maybe-alias)}))
(defn- maybe-column-table [{:keys [alias->table name->table]} ^Column c]
  (if-let [t (.getTable c)]
    (or
      (get alias->table (.getName t))
      (:component (get name->table (.getName t))))
    ;; if we see only a single table, we can safely say it's the table of that column
    (when (= (count name->table) 1)
      (:component (val (first name->table))))))
(defn- make-column [data ^Column c ctx]
  (merge
    {:column (.getColumnName c)}
    (maybe-column-alias ctx)
    (maybe-column-table data c)))

get them together

(defn- only-query-context [ctx]
  (into [] (comp (filter #(= (first %) :query))
                 (map second))
    ctx))
(defn- update-components
  [f components]
  (map #(-> %
            (update :component f (:context %))
            (update :context only-query-context))
    components))

Given a parsed query (i.e., a [subclass of] Statement) return a map with the elements found within it.

(Specifically, it returns their fully-qualified names as strings, where 'fully-qualified' means 'as referred to in the query'; this function doesn't do additional inference work to find out a table's schema.)

(defn query->components
  [^Statement parsed-query]
  (let [{:keys [columns
                has-wildcard?
                mutation-commands
                tables
                table-wildcards]} (query->raw-components parsed-query)
        alias-map                 (into {} (map #(-> % :component (alias-mapping (:context %))) tables))
        table-map                 (->> (update-components make-table tables)
                                              (u/group-with #(-> % :component :table)
                                                (fn [a b] (if (:schema a) a b))))
        data                      {:alias->table alias-map
                                   :name->table  table-map}]
    {:columns           (into #{} (update-components (partial make-column data) columns))
     :has-wildcard?     (into #{} (update-components (fn [x & _args] x) has-wildcard?))
     :mutation-commands (into #{} mutation-commands)
     :tables            (into #{} (vals table-map))
     :table-wildcards   (into #{} (update-components (partial resolve-table-name data) table-wildcards))}))

Main entry point: takes a string query and returns a Statement object that can be handled by the other functions.

(defn parsed-query
  [^String query]
  (CCJSqlParserUtil/parse query))

Given a SQL query, apply the given table, column, and schema renames.

(defn replace-names
  [sql renames]
  (rewrite/replace-names sql (parsed-query sql) renames))
 
(ns macaw.rewrite
  (:require
   [macaw.walk :as mw])
  (:import
   (net.sf.jsqlparser.parser ASTNodeAccess SimpleNode)
   (net.sf.jsqlparser.schema Column Table)))
(set! *warn-on-reflection* true)
(defn- index-of-nth [^String haystack ^String needle n]
  (assert (not (neg? n)))
  (if (zero? n)
    -1
    (loop [n   n
           idx 0]
      (let [next-id (.indexOf haystack needle idx)]
        (cond
          (= 1 n) next-id
          (neg? next-id) next-id
          :else (recur (dec n) (inc next-id)))))))
(defn- ->idx [^String sql line col]
  (+ col (index-of-nth sql "\n" (dec line))))

Find the start and end index of the underlying tokens for a given AST node from a given SQL string.

(defn- node->idx-range
  [^SimpleNode node sql]
  (let [first-token (.jjtGetFirstToken node)
        last-token  (.jjtGetLastToken node)
        first-idx   (->idx sql
                           (.-beginLine first-token)
                           (.-beginColumn first-token))
        last-idx    (->idx sql
                           (.-endLine last-token)
                           (.-endColumn last-token))]
    [first-idx last-idx]))
(defn- splice-replacements [^String sql replacements]
  (let [sb     (StringBuilder.)
        append #(.append sb %)]
    (loop [start 0
           [[[first-idx last-idx] value] & rst] replacements]
      (if (nil? last-idx)
        (when (< start (count sql))
          (append (.substring sql start)))
        (do (append (.substring sql start first-idx))
            (append value)
            (recur (inc ^long last-idx) rst))))
    (str sb)))

Emit a SQL string for an updated AST, preserving the comments and whitespace from the original SQL.

(defn- update-query
  [updated-ast updated-node? sql]
  (let [replacement  (fn [->text visitable]
                       (let [ast-node  (.getASTNode ^ASTNodeAccess visitable)
                             idx-range (node->idx-range ast-node sql)
                             node-text (->text visitable)]
                         [idx-range node-text]))
        replace-name (fn [->text]
                       (fn [acc visitable _ctx]
                         (cond-> acc
                           (updated-node? visitable)
                           (conj (replacement ->text visitable)))))]
    (splice-replacements
     sql
     (mw/fold-query
      updated-ast
      {:table  (replace-name #(.getFullyQualifiedName ^Table %))
       :column (replace-name #(.getFullyQualifiedName ^Column %))}
      []))))
(defn- rename-table
  [updated-nodes table-renames schema-renames ^Table table _ctx]
  (when-let [name' (get table-renames (.getName table))]
    (vswap! updated-nodes conj table)
    (.setName table name'))
  (when-let [new-schema-name (get schema-renames (.getSchemaName table))]
    (.setSchemaName table new-schema-name)))
(defn- rename-column
  [updated-nodes column-renames ^Column column _ctx]
  (when-let [name' (get column-renames (.getColumnName column))]
    (vswap! updated-nodes conj column)
    (.setColumnName column name')))

Given a SQL query and its corresponding (untransformed) AST, apply the given table and column renames.

(defn replace-names
  [sql parsed-ast {schema-renames :schemas
                   table-renames  :tables
                   column-renames :columns}]
  (let [updated-nodes (volatile! #{})]
    (-> parsed-ast
        (mw/walk-query
         {:table            (partial rename-table updated-nodes table-renames schema-renames)
          :column-qualifier (partial rename-table updated-nodes table-renames schema-renames)
          :column           (partial rename-column updated-nodes column-renames)})
        (update-query @updated-nodes sql))))
 
(ns macaw.util)

Generalized group-by, where you can supply your own reducing function (instead of usual conj).

https://ask.clojure.org/index.php/12319/can-group-by-be-generalized

(defn group-with
  [kf rf coll]
  (persistent!
    (reduce
      (fn [ret x]
        (let [k (kf x)]
          (assoc! ret k (rf (get ret k) x))))
      (transient {})
      coll)))
 
(ns macaw.walk
  (:import
   (com.metabase.macaw AstWalker AstWalker$CallbackKey)))
(set! *warn-on-reflection* true)

keyword->key map for the AST-folding callbacks.

(def ->callback-key
  ;; TODO: Move this to a Malli schema to simplify the indirection
  {:column           AstWalker$CallbackKey/COLUMN
   :column-qualifier AstWalker$CallbackKey/COLUMN_QUALIFIER
   :mutation         AstWalker$CallbackKey/MUTATION_COMMAND
   :table            AstWalker$CallbackKey/TABLE
   :table-wildcard   AstWalker$CallbackKey/ALL_TABLE_COLUMNS
   :wildcard         AstWalker$CallbackKey/ALL_COLUMNS})

Lift a side effecting callback so that it preserves the accumulator.

(defn- preserve
  [f]
  (fn [acc & args]
    (apply f args)
    acc))

work around ast walker repeatedly visiting the same expressions (bug ?!)

(defn- deduplicate-visits [f]
  (let [seen (volatile! #{})]
    (fn [& [acc visitable & _ :as args]]
      (if (contains? @seen visitable)
        acc
        (do (vswap! seen conj visitable)
            (apply f args))))))
(defn- update-keys-vals [m key-f val-f]
  (into {} (map (fn [[k v]]
                  [(key-f k) (val-f v)]))
        m))

Walk over the query's AST, using the callbacks for their side-effects, for example to mutate the AST itself.

(defn walk-query
  [parsed-query callbacks]
  (let [callbacks (update-keys-vals callbacks ->callback-key (comp deduplicate-visits preserve))]
    (.walk (AstWalker. callbacks ::ignored) parsed-query)))

Fold over the query's AST, using the callbacks to update the accumulator.

(defn fold-query
  [parsed-query callbacks init-val]
  (let [callbacks (update-keys-vals callbacks ->callback-key deduplicate-visits)]
    (.fold (AstWalker. callbacks init-val) parsed-query)))
 
(ns ^:parallel macaw.core-test
  (:require
   [clojure.test :refer [deftest testing is]]
   [macaw.core :as m]
   [macaw.walk :as mw])
  (:import
   (net.sf.jsqlparser.schema Table)))
(set! *warn-on-reflection* true)
(defn- and*
  [x y]
  (and x y))
(def components     (comp m/query->components m/parsed-query))
(def raw-components (comp (partial into #{}) (partial map :component)))
(def columns        (comp raw-components :columns components))
(def has-wildcard?  (comp (partial reduce and*) raw-components :has-wildcard? components))
(def mutations      (comp raw-components :mutation-commands components))
(def tables         (comp raw-components :tables components))
(def table-wcs      (comp raw-components :table-wildcards components))
(defn column-qualifiers
  [query]
  (mw/fold-query (m/parsed-query query)
                 {:column-qualifier (fn [acc tbl _ctx] (conj acc (.getName ^Table tbl)))}
                 #{}))
(deftest query->tables-test
  (testing "Simple queries"
    (is (= #{{:table "core_user"}}
           (tables "SELECT * FROM core_user;")))
    (is (= #{{:table "core_user"}}
           (tables "SELECT id, email FROM core_user;"))))
  (testing "With a schema (Postgres)" ;; TODO: only run this against supported DBs
    (is (= #{{:table "core_user" :schema "the_schema_name"}}
           (tables "SELECT * FROM the_schema_name.core_user;"))))
  (testing "Sub-selects"
    (is (= #{{:table "core_user"}}
           (tables "SELECT * FROM (SELECT DISTINCT email FROM core_user) q;")))))
(deftest tables-with-complex-aliases-issue-14-test
  (testing "With an alias that is also a table name"
    (is (= #{{:table "user"}
             {:table "user2_final"}}
           (tables
            "SELECT legacy_user.id AS old_id,
                    user.id AS new_id
             FROM user AS legacy_user
             OUTER JOIN user2_final AS user
             ON legacy_user.email = user2_final.email;")))))
(deftest column-qualifier-test
  (testing "column-qualifiers works with tables and aliases"
    (is (= #{"user" "legacy_user"}
           (column-qualifiers "SELECT
                                 user.id AS user_id,
                                 legacy_user.id AS old_id
                               FROM user
                               OUTER JOIN user as legacy_user
                               ON user.email = user.electronic_mail_address
                               JOIN unrelated_table on foo = user.unrelated_id;")))))
(deftest query->columns-test
  (testing "Simple queries"
    (is (= #{{:column "foo"}
             {:column "bar"}
             {:column "id" :table "quux"}
             {:column "quux_id" :table "baz"}}
           (columns "SELECT foo, bar FROM baz INNER JOIN quux ON quux.id = baz.quux_id"))))
  (testing "'group by' columns present"
    (is (= #{{:column "id" :table "orders"}
             {:column "user_id" :table "orders"}}
           (columns "SELECT id FROM orders GROUP BY user_id"))))
  (testing "table alias present"
    (is (= #{{:column "id" :table "orders" :schema "public"}}
           (columns "SELECT o.id FROM public.orders o")))))
(deftest infer-test
  (testing "We can first column through a few hoops"
    (is (= #{{:column "amount" :table "orders"}}
           (columns "SELECT amount FROM (SELECT amount FROM orders)")))
    (is (= #{{:column "amount" :alias "cost" :table "orders"}
             ;; FIXME: we need to figure out that `cost` is an alias from subquery
             {:column "cost", :table "orders"}}
           (columns "SELECT cost FROM (SELECT amount AS cost FROM orders)")))))
(deftest mutation-test
  (is (= #{"alter-sequence"}
         (mutations "ALTER SEQUENCE serial RESTART WITH 42")))
  (is (= #{"alter-session"}
         (mutations "ALTER SESSION SET foo = 'bar'")))
  (is (= #{"alter-system"}
         (mutations "ALTER SYSTEM RESET ALL")))
  (is (= #{"alter-table"}
         (mutations "ALTER TABLE orders ADD COLUMN email text")))
  (is (= #{"alter-view"}
         (mutations "ALTER VIEW foo AS SELECT bar;")))
  (is (= #{"create-function"}           ; Postgres syntax
         (mutations "CREATE FUNCTION multiply(integer, integer) RETURNS integer AS 'SELECT $1 * $2;' LANGUAGE SQL
         IMMUTABLE RETURNS NULL ON NULL INPUT;")))
  (is (= #{"create-function"}           ; Conventional syntax
         (mutations "CREATE FUNCTION multiply(a integer, b integer) RETURNS integer LANGUAGE SQL IMMUTABLE RETURNS
         NULL ON NULL INPUT RETURN a + b;")))
  (is (= #{"create-index"}
         (mutations "CREATE INDEX idx_user_id ON orders(user_id);")))
  (is (= #{"create-schema"}
         (mutations "CREATE SCHEMA perthshire")))
  (is (= #{"create-sequence"}
         (mutations "CREATE SEQUENCE users_seq START WITH 42 INCREMENT BY 17")))
  (is (= #{"create-synonym"}
         (mutations "CREATE SYNONYM folk FOR people")))
  (is (= #{"create-table"}
         (mutations "CREATE TABLE poets (name text, id integer)")))
  (is (= #{"create-view"}
         (mutations "CREATE VIEW folk AS SELECT * FROM people WHERE id > 10")))
  (is (= #{"delete"}
         (mutations "DELETE FROM people")))
  (is (= #{"drop"}
         (mutations "DROP TABLE people")))
  (is (= #{"grant"}
         (mutations "GRANT SELECT, UPDATE, INSERT ON people TO myself")))
  (is (= #{"insert"}
         (mutations "INSERT INTO people(name, source) VALUES ('Robert Fergusson', 'Twitter'), ('Robert Burns',
         'Facebook')")))
  (is (= #{"purge"}
         (mutations "PURGE TABLE people")))
  (is (= #{"rename-table"}
         (mutations "RENAME TABLE people TO folk")))
  (is (= #{"truncate"}
         (mutations "TRUNCATE TABLE people")))
  (is (= #{"update"}
         (mutations "UPDATE people SET name = 'Robert Fergusson' WHERE id = 23"))))
(deftest complicated-mutations-test
  ;; https://github.com/metabase/macaw/issues/18
  #_  (is (= #{"delete" "insert"}
             (mutations "WITH outdated_orders AS (
                       DELETE FROM orders
                       WHERE
                         date <= '2018-01-01'
                       RETURNING *)
                     INSERT INTO order_log
                     SELECT * from outdated_orders;")))
    (is (= #{ "insert"}
         (mutations "WITH outdated_orders AS (
                       SELECT * from orders)
                     INSERT INTO order_log
                     SELECT * from outdated_orders;"))))
(deftest alias-inclusion-test
  (testing "Aliases are not included"
    (is (= #{{:table "orders"} {:table "foo"}}
           (tables "SELECT id, o.id FROM orders o JOIN foo ON orders.id = foo.order_id")))))
(deftest select-*-test
  (is (true? (has-wildcard? "SELECT * FROM orders")))
  (is (true? (has-wildcard? "SELECT id, * FROM orders JOIN foo ON orders.id = foo.order_id"))))
(deftest table-wildcard-test-without-aliases
  (is (= #{{:component {:table "orders"} :context ["FROM" "SELECT"]}}
         (table-wcs "SELECT orders.* FROM orders JOIN foo ON orders.id = foo.order_id")))
  (is (= #{{:component {:table "foo" :schema "public"} :context ["FROM" "JOIN" "SELECT"]}}
         (table-wcs "SELECT foo.* FROM orders JOIN public.foo f ON orders.id = foo.order_id"))))
(deftest table-star-test-with-aliases
  (is (= #{{:table "orders"}}
         (table-wcs "SELECT o.* FROM orders o JOIN foo ON orders.id = foo.order_id")))
  (is (= #{{:table "foo"}}
         (table-wcs "SELECT f.* FROM orders o JOIN foo f ON orders.id = foo.order_id"))))
(deftest context-test
  (testing "Sub-select with outer wildcard"
    (is (= {:columns
            #{{:component {:column "total" :table "orders"}, :context ["SELECT" "SUB_SELECT" "FROM" "SELECT"]}
              {:component {:column "id"    :table "orders"}, :context ["SELECT" "SUB_SELECT" "FROM" "SELECT"]}
              {:component {:column "total" :table "orders"}, :context ["WHERE" "JOIN" "FROM" "SELECT"]}},
            :has-wildcard?     #{{:component true, :context ["SELECT"]}},
            :mutation-commands #{},
            :tables            #{{:component {:table "orders"}, :context ["FROM" "SELECT" "SUB_SELECT" "FROM" "SELECT"]}},
            :table-wildcards   #{}}
           (components "SELECT * FROM (SELECT id, total FROM orders) WHERE total > 10"))))
  (testing "Sub-select with inner wildcard"
    (is (= {:columns
            #{{:component {:column "id"    :table "orders"}, :context ["SELECT"]}
              {:component {:column "total" :table "orders"}, :context ["SELECT"]}
              {:component {:column "total" :table "orders"}, :context ["WHERE" "JOIN" "FROM" "SELECT"]}},
            :has-wildcard?     #{{:component true, :context ["SELECT" "SUB_SELECT" "FROM" "SELECT"]}},
            :mutation-commands #{},
            :tables            #{{:component {:table "orders"}, :context ["FROM" "SELECT" "SUB_SELECT" "FROM" "SELECT"]}},
            :table-wildcards   #{}}
           (components "SELECT id, total FROM (SELECT * FROM orders) WHERE total > 10"))))
  (testing "Sub-select with dual wildcards"
    (is (= {:columns           #{{:component {:column "total" :table "orders"}, :context ["WHERE" "JOIN" "FROM" "SELECT"]}},
            :has-wildcard?
            #{{:component true, :context ["SELECT" "SUB_SELECT" "FROM" "SELECT"]}
              {:component true, :context ["SELECT"]}},
            :mutation-commands #{},
            :tables            #{{:component {:table "orders"}, :context ["FROM" "SELECT" "SUB_SELECT" "FROM" "SELECT"]}},
            :table-wildcards   #{}}
           (components "SELECT * FROM (SELECT * FROM orders) WHERE total > 10"))))
  (testing "Join; table wildcard"
    (is (= {:columns           #{{:component {:column "order_id" :table "foo"}, :context ["JOIN" "SELECT"]}
                                 {:component {:column "id" :table "orders"}, :context ["JOIN" "SELECT"]}},
            :has-wildcard?     #{},
            :mutation-commands #{},
            :tables            #{{:component {:table "foo"}, :context ["FROM" "JOIN" "SELECT"]}
                                 {:component {:table "orders"}, :context ["FROM" "SELECT"]}},
            :table-wildcards   #{{:component {:table "orders"}, :context ["SELECT"]}}}
         (components "SELECT o.* FROM orders o JOIN foo ON orders.id = foo.order_id")))))
(defn test-replacement [before replacements after]
  (is (= after (m/replace-names before replacements))))
(deftest replace-names-test
  (test-replacement "SELECT a.x, b.y FROM a, b;"
                    {:tables {"a" "aa"}
                     :columns  {"x" "xx"}}
                    "SELECT aa.xx, b.y FROM aa, b;")
  (test-replacement
   "SELECT *, boink
  , yoink AS oink
 FROM /* /* lore */
    core_user,
  bore_user,  /* more */ snore_user ;"
   {:tables  {"core_user"  "floor_muser"
              "bore_user"  "user"
              "snore_user" "vigilant_user"
              "cruft"      "tuft"}
    :columns {"boink" "sturmunddrang"
              "yoink" "oink"
              "hoi"   "polloi"}}
   "SELECT *, sturmunddrang
  , oink AS oink
 FROM /* /* lore */
    floor_muser,
  user,  /* more */ vigilant_user ;"))
(deftest replace-schema-test
  (test-replacement "SELECT public.orders.x FROM public.orders"
                    {:schemas {"public" "totally_private"}
                     :tables  {"orders" "purchases"}
                     :columns {"x" "xx"}}
                    "SELECT totally_private.purchases.xx FROM totally_private.purchases"))