This namespace contains functionality that is not compatible with js, hence can not be stored in correspoding cljc ns, ie. [[metabase.legacy-mbql.util]].

(ns metabase.legacy-mbql.jvm-util
  (:require
   [metabase.lib.util.match :as lib.util.match]))

Following regex definitions are incompatible with Safari browser. Code is unused on FE.

(def ^:private host-regex
  ;; Extracts the "host" from a URL or an email.
  ;; By host we mean the main domain name and the TLD, eg. metabase.com, amazon.co.jp, bbc.co.uk.
  ;; For a URL, this is not the RFC3986 "host", which would include any subdomains and the optional `:3000` port number.
  ;;
  ;; For an email, this is generally the part after the @, but it will skip any subdomains:
  ;;   someone@email.mycompany.net -> mycompany.net
  ;;
  ;; Referencing the indexes below:
  ;; 1.  Positive lookbehind:
  ;;       Just past one of:
  ;; 2.      @  from an email or URL userinfo@ prefix
  ;; 3.      // from a URL scheme
  ;; 4.      .  from a previous subdomain segment
  ;; 5.      Start of string
  ;; 6.  Negative lookahead: don't capture www as part of the domain
  ;; 7.  Main domain segment
  ;; 8.  Ending in a dot
  ;; 9.  Optional short final segment (eg. co in .co.uk)
  ;; 10. Top-level domain
  ;; 11. Optional :port, /path, ?query or #hash
  ;; 12. Anchor to the end
  ;;1   2 3  4  5 6        7          8 9                     10         11           12
  #"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+\.(?:[^@\.:/?#]{1,3}\.)?[^@\.:/?#]+(?=[:/?#].*$|$)")
(def ^:private domain-regex
  ;; Deliberately no ^ at the start; there might be several subdomains before this spot.
  ;; By "short tail" below, I mean a pseudo-TLD nested under a proper TLD. For example, mycompany.co.uk.
  ;; This can accidentally capture a short domain name, eg. "subdomain.aol.com" -> "subdomain", oops.
  ;; But there's a load of these, not a short list we can include here, so it's either preprocess the (huge) master list
  ;; from Mozilla or accept that this regex is a bit best-effort.
  ;; Referencing the indexes below:
  ;; 1.  Positive lookbehind:
  ;;       Just past one of:
  ;; 2.      @  from an email or URL userinfo@ prefix
  ;; 3.      // from a URL scheme
  ;; 4.      .  from a previous subdomain segment
  ;; 5.      Start of string
  ;; 6.  Negative lookahead: don't capture www as the domain
  ;; 7.  One domain segment
  ;; 8.  Positive lookahead:
  ;;       Either:
  ;; 9.      Short final segment (eg. .co.uk)
  ;; 10.     Top-level domain
  ;; 11.     Optional :port, /path, ?query or #hash
  ;; 12.     Anchor to end
  ;;       Or:
  ;; 13.     Top-level domain
  ;; 14.     Optional :port, /path, ?query or #hash
  ;; 15.     Anchor to end
  ;;1   2 3  4  5 6        7          (8   9                10         11          12|  13         14           15)
  #"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+(?=\.[^@\.:/?#]{1,3}\.[^@\.:/?#]+(?:[:/?#].*)?$|\.[^@\.:/?#]+(?:[:/?#].*)?$)")
(def ^:private subdomain-regex
  ;; This grabs the first segment that isn't "www", AND excludes the main domain name.
  ;; See [[domain-regex]] for more details about how those are matched.
  ;; Referencing the indexes below:
  ;; 1.  Positive lookbehind:
  ;;       Just past one of:
  ;; 2.      @  from an email or URL userinfo@ prefix
  ;; 3.      // from a URL scheme
  ;; 4.      .  from a previous subdomain segment
  ;; 5.      Start of string
  ;; 6.  Negative lookahead: don't capture www as the domain
  ;; 7.  Negative lookahead: don't capture the main domain name or part of the TLD
  ;;       That would look like:
  ;; 8.      The next segment we *would* capture as the subdomain
  ;; 9.      Optional short segment, like "co" in .co.uk
  ;; 10.     Top-level domain
  ;; 11.     Optionally more URL things: :port or /path or ?query or #fragment
  ;; 12.     End of string
  ;; 13. Match the actual subdomain
  ;; 14. Positive lookahead: the . after the subdomain, which we want to detect but not capture.
  ;;1   2 3  4  5 6        7  8           9                    10        11           12 13       14
  #"(?<=@|//|\.|^)(?!www\.)(?![^\.:/?#]+\.(?:[^\.:/?#]{1,3}\.)?[^\.:/?#]+(?:[:/?#].*)?$)[^\.:/?#]+(?=\.)")

Unwrap host and domain.

(defn desugar-host-and-domain
  [expression]
  (lib.util.match/replace
    expression
    [:host column]
    (recur [:regex-match-first column (str host-regex)])
    [:domain column]
    (recur [:regex-match-first column (str domain-regex)])
    [:subdomain column]
    (recur [:regex-match-first column (str subdomain-regex)])))