Skip to content

Commit

Permalink
feat: apply to_tsvector() explicitly to the full-text search filtered…
Browse files Browse the repository at this point in the history
… column, only if it's not of tsvector type
  • Loading branch information
laurenceisla committed Jan 25, 2025
1 parent e7cc8fe commit cb069ea
Show file tree
Hide file tree
Showing 12 changed files with 422 additions and 126 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
- #3727, Log maximum pool size - @steve-chavez
- #1536, Add string comparison feature for jwt-role-claim-key - @taimoorzaeem
- #3747, Allow `not_null` value for the `is` operator - @taimoorzaeem
- #2255, Apply `to_tsvector()` explicitly to the full-text search filtered column (excluding `tsvector` types) - @laurenceisla

### Fixed

Expand Down
16 changes: 15 additions & 1 deletion docs/references/api/tables_views.rst
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,21 @@ The :code:`fts` filter mentioned above has a number of options to support flexib
curl "http://localhost:3000/tsearch?my_tsv=not.wfts(french).amusant"
Using `websearch_to_tsquery` requires PostgreSQL of version at least 11.0 and will raise an error in earlier versions of the database.
.. _fts_to_tsvector:

Automatic ``tsvector`` conversion
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

If the filtered column is not of type ``tsvector``, then it will be automatically converted using `to_tsvector() <https://www.postgresql.org/docs/current/functions-textsearch.html#TEXTSEARCH-FUNCTIONS-TABLE>`_.
This allows using ``fts`` on text and JSON types out of the box.

.. code-block:: bash
curl "http://localhost:3000/tsearch?my_text_column=fts(french).amusant"
.. code-block:: bash
curl "http://localhost:3000/tsearch?my_json_column=not.phfts(english).The%20Fat%20Cats"
.. _v_filter:

Expand Down
1 change: 1 addition & 0 deletions src/PostgREST/ApiRequest/Types.hs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ module PostgREST.ApiRequest.Types
, JsonOperand(..)
, JsonOperation(..)
, JsonPath
, Language
, ListVal
, LogicOperator(..)
, LogicTree(..)
Expand Down
48 changes: 27 additions & 21 deletions src/PostgREST/Plan.hs
Original file line number Diff line number Diff line change
Expand Up @@ -265,27 +265,29 @@ data ResolverContext = ResolverContext
, outputType :: Text -- ^ The output type for the response payload; e.g. "csv", "json", "binary".
}

resolveColumnField :: Column -> CoercibleField
resolveColumnField col = CoercibleField (colName col) mempty False (colNominalType col) Nothing (colDefault col) False
resolveColumnField :: Column -> Maybe ToTsVector -> CoercibleField
resolveColumnField col toTsV = CoercibleField (colName col) mempty False toTsV (colNominalType col) Nothing (colDefault col) False

resolveTableFieldName :: Table -> FieldName -> CoercibleField
resolveTableFieldName table fieldName =
resolveTableFieldName :: Table -> FieldName -> Maybe ToTsVector -> CoercibleField
resolveTableFieldName table fieldName toTsV=
fromMaybe (unknownField fieldName []) $ HMI.lookup fieldName (tableColumns table) >>=
Just . resolveColumnField
Just . flip resolveColumnField toTsV

-- | Resolve a type within the context based on the given field name and JSON path. Although there are situations where failure to resolve a field is considered an error (see `resolveOrError`), there are also situations where we allow it (RPC calls). If it should be an error and `resolveOrError` doesn't fit, ensure to check the `cfIRType` isn't empty.
resolveTypeOrUnknown :: ResolverContext -> Field -> CoercibleField
resolveTypeOrUnknown ResolverContext{..} (fn, jp) =
resolveTypeOrUnknown :: ResolverContext -> Field -> Maybe ToTsVector -> CoercibleField
resolveTypeOrUnknown ResolverContext{..} (fn, jp) toTsV =
case res of
-- types that are already json/jsonb don't need to be converted with `to_jsonb` for using arrow operators `data->attr`
-- this prevents indexes not applying https://github.com/PostgREST/postgrest/issues/2594
cf@CoercibleField{cfIRType="json"} -> cf{cfJsonPath=jp, cfToJson=False}
cf@CoercibleField{cfIRType="jsonb"} -> cf{cfJsonPath=jp, cfToJson=False}
cf@CoercibleField{cfIRType="json"} -> cf{cfJsonPath=jp, cfToJson=False}
cf@CoercibleField{cfIRType="jsonb"} -> cf{cfJsonPath=jp, cfToJson=False}
-- Do not apply to_tsvector to tsvector types
cf@CoercibleField{cfIRType="tsvector"} -> cf{cfJsonPath=jp, cfToJson=True, cfToTsVector=Nothing}
-- other types will get converted `to_jsonb(col)->attr`, even unknown types
cf -> cf{cfJsonPath=jp, cfToJson=True}
cf -> cf{cfJsonPath=jp, cfToJson=True}
where
res = fromMaybe (unknownField fn jp) $ HM.lookup qi tables >>=
Just . flip resolveTableFieldName fn
Just . (\t -> resolveTableFieldName t fn toTsV)

-- | Install any pre-defined data representation from source to target to coerce this reference.
--
Expand All @@ -311,11 +313,15 @@ withJsonParse ctx field@CoercibleField{cfIRType} = withTransformer ctx "json" cf

-- | Map the intermediate representation type to the output type defined by the resolver context (normally json), if available.
resolveOutputField :: ResolverContext -> Field -> CoercibleField
resolveOutputField ctx field = withOutputFormat ctx $ resolveTypeOrUnknown ctx field
resolveOutputField ctx field = withOutputFormat ctx $ resolveTypeOrUnknown ctx field Nothing

-- | Map the query string format of a value (text) into the intermediate representation type, if available.
resolveQueryInputField :: ResolverContext -> Field -> CoercibleField
resolveQueryInputField ctx field = withTextParse ctx $ resolveTypeOrUnknown ctx field
resolveQueryInputField :: ResolverContext -> Field -> OpExpr -> CoercibleField
resolveQueryInputField ctx field opExpr = withTextParse ctx $ resolveTypeOrUnknown ctx field toTsVector
where
toTsVector = case opExpr of
OpExpr _ (Fts _ lang _) -> Just $ ToTsVector lang
_ -> Nothing

-- | Builds the ReadPlan tree on a number of stages.
-- | Adds filters, order, limits on its respective nodes.
Expand Down Expand Up @@ -452,7 +458,7 @@ expandStarsForTable ctx@ResolverContext{representations, outputType} hasAgg rp@R

expandStarSelectField :: Bool -> [Column] -> CoercibleSelectField -> [CoercibleSelectField]
expandStarSelectField _ columns sel@CoercibleSelectField{csField=CoercibleField{cfName="*", cfJsonPath=[]}, csAggFunction=Nothing} =
map (\col -> sel { csField = withOutputFormat ctx $ resolveColumnField col }) columns
map (\col -> sel { csField = withOutputFormat ctx $ resolveColumnField col Nothing }) columns
expandStarSelectField True _ sel@CoercibleSelectField{csField=fld@CoercibleField{cfName="*", cfJsonPath=[]}, csAggFunction=Just Count} =
[sel { csField = fld { cfFullRow = True } }]
expandStarSelectField _ _ selectField = [selectField]
Expand Down Expand Up @@ -766,7 +772,7 @@ addOrders ctx ApiRequest{..} rReq =

resolveOrder :: ResolverContext -> OrderTerm -> CoercibleOrderTerm
resolveOrder _ (OrderRelationTerm a b c d) = CoercibleOrderRelationTerm a b c d
resolveOrder ctx (OrderTerm fld dir nulls) = CoercibleOrderTerm (resolveTypeOrUnknown ctx fld) dir nulls
resolveOrder ctx (OrderTerm fld dir nulls) = CoercibleOrderTerm (resolveTypeOrUnknown ctx fld Nothing) dir nulls

-- Validates that the related resource on the order is an embedded resource,
-- e.g. if `clients` is inside the `select` in /projects?order=clients(id)&select=*,clients(*),
Expand Down Expand Up @@ -831,7 +837,7 @@ addRelatedOrders (Node rp@ReadPlan{order,from} forest) = do
-- where_ = [
-- CoercibleStmnt (
-- CoercibleFilter {
-- field = CoercibleField {cfName = "projects", cfJsonPath = [], cfToJson=False, cfIRType = "", cfTransform = Nothing, cfDefault = Nothing, cfFullRow = False},
-- field = CoercibleField {cfName = "projects", cfJsonPath = [], cfToJson=False, cfToTsVector = Nothing, cfIRType = "", cfTransform = Nothing, cfDefault = Nothing, cfFullRow = False},
-- opExpr = op
-- }
-- )
Expand All @@ -847,7 +853,7 @@ addRelatedOrders (Node rp@ReadPlan{order,from} forest) = do
-- Don't do anything to the filter if there's no embedding (a subtree) on projects. Assume it's a normal filter.
--
-- >>> ReadPlan.where_ . rootLabel <$> addNullEmbedFilters (readPlanTree nullOp [])
-- Right [CoercibleStmnt (CoercibleFilter {field = CoercibleField {cfName = "projects", cfJsonPath = [], cfToJson = False, cfIRType = "", cfTransform = Nothing, cfDefault = Nothing, cfFullRow = False}, opExpr = OpExpr True (Is IsNull)})]
-- Right [CoercibleStmnt (CoercibleFilter {field = CoercibleField {cfName = "projects", cfJsonPath = [], cfToJson = False, cfToTsVector = Nothing, cfIRType = "", cfTransform = Nothing, cfDefault = Nothing, cfFullRow = False}, opExpr = OpExpr True (Is IsNull)})]
--
-- If there's an embedding on projects, then change the filter to use the internal aggregate name (`clients_projects_1`) so the filter can succeed later.
--
Expand All @@ -866,7 +872,7 @@ addNullEmbedFilters (Node rp@ReadPlan{where_=curLogic} forest) = do
newNullFilters rPlans = \case
(CoercibleExpr b lOp trees) ->
CoercibleExpr b lOp <$> (newNullFilters rPlans `traverse` trees)
flt@(CoercibleStmnt (CoercibleFilter (CoercibleField fld [] _ _ _ _ _) opExpr)) ->
flt@(CoercibleStmnt (CoercibleFilter (CoercibleField fld [] _ _ _ _ _ _) opExpr)) ->
let foundRP = find (\ReadPlan{relName, relAlias} -> fld == fromMaybe relName relAlias) rPlans in
case (foundRP, opExpr) of
(Just ReadPlan{relAggAlias}, OpExpr b (Is IsNull)) -> Right $ CoercibleStmnt $ CoercibleFilterNullEmbed b relAggAlias
Expand Down Expand Up @@ -900,7 +906,7 @@ resolveLogicTree ctx (Stmnt flt) = CoercibleStmnt $ resolveFilter ctx flt
resolveLogicTree ctx (Expr b op lts) = CoercibleExpr b op (map (resolveLogicTree ctx) lts)

resolveFilter :: ResolverContext -> Filter -> CoercibleFilter
resolveFilter ctx (Filter fld opExpr) = CoercibleFilter{field=resolveQueryInputField ctx fld, opExpr=opExpr}
resolveFilter ctx (Filter fld opExpr) = CoercibleFilter{field=resolveQueryInputField ctx fld opExpr, opExpr=opExpr}

-- Validates that spread embeds are only done on to-one relationships
validateSpreadEmbeds :: ReadPlanTree -> Either ApiRequestError ReadPlanTree
Expand Down Expand Up @@ -963,7 +969,7 @@ mutatePlan mutation qi ApiRequest{iPreferences=Preferences{..}, ..} SchemaCache{
resolveOrError :: ResolverContext -> Maybe Table -> FieldName -> Either ApiRequestError CoercibleField
resolveOrError _ Nothing _ = Left NotFound
resolveOrError ctx (Just table) field =
case resolveTableFieldName table field of
case resolveTableFieldName table field Nothing of
CoercibleField{cfIRType=""} -> Left $ ColumnNotFound (tableName table) field
cf -> Right $ withJsonParse ctx cf

Expand Down
26 changes: 16 additions & 10 deletions src/PostgREST/Plan/Types.hs
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,27 @@ module PostgREST.Plan.Types
, CoercibleLogicTree(..)
, CoercibleFilter(..)
, TransformerProc
, ToTsVector(..)
, CoercibleOrderTerm(..)
, RelSelectField(..)
, RelJsonEmbedMode(..)
, SpreadSelectField(..)
) where

import PostgREST.ApiRequest.Types (AggregateFunction, Alias, Cast,
Field, JsonPath, LogicOperator,
OpExpr, OrderDirection, OrderNulls)
Field, JsonPath, Language,
LogicOperator, OpExpr,
OrderDirection, OrderNulls)

import PostgREST.SchemaCache.Identifiers (FieldName)

import Protolude

type TransformerProc = Text

newtype ToTsVector = ToTsVector (Maybe Language)
deriving (Eq, Show)

Check warning on line 27 in src/PostgREST/Plan/Types.hs

View check run for this annotation

Codecov / codecov/patch

src/PostgREST/Plan/Types.hs#L27

Added line #L27 was not covered by tests

-- | A CoercibleField pairs the name of a query element with any type coercion information we need for some specific use case.
-- |
-- | As suggested by the name, it's often a reference to a field in a table but really it can be any nameable element (function parameter, calculation with an alias, etc) with a knowable type.
Expand All @@ -33,17 +38,18 @@ type TransformerProc = Text
-- |
-- | The type value is allowed to be the empty string. The analog here is soft type checking in programming languages: sometimes we don't need a variable to have a specified type and things will work anyhow. So the empty type variant is valid when we don't know and *don't need to know* about the specific type in some context. Note that this variation should not be used if it guarantees failure: in that case you should instead raise an error at the planning stage and bail out. For example, we can't parse JSON with `json_to_recordset` without knowing the types of each recipient field, and so error out. Using the empty string for the type would be incorrect and futile. On the other hand we use the empty type for RPC calls since type resolution isn't implemented for RPC, but it's fine because the query still works with Postgres' implicit coercion. In the future, hopefully we will support data representations across the board and then the empty type may be permanently retired.
data CoercibleField = CoercibleField
{ cfName :: FieldName
, cfJsonPath :: JsonPath
, cfToJson :: Bool
, cfIRType :: Text -- ^ The native Postgres type of the field, the intermediate (IR) type before mapping.
, cfTransform :: Maybe TransformerProc -- ^ The optional mapping from irType -> targetType.
, cfDefault :: Maybe Text
, cfFullRow :: Bool -- ^ True if the field represents the whole selected row. Used in spread rels: instead of COUNT(*), it does a COUNT(<row>) in order to not mix with other spreaded resources.
{ cfName :: FieldName
, cfJsonPath :: JsonPath
, cfToJson :: Bool

Check warning on line 43 in src/PostgREST/Plan/Types.hs

View check run for this annotation

Codecov / codecov/patch

src/PostgREST/Plan/Types.hs#L42-L43

Added lines #L42 - L43 were not covered by tests
, cfToTsVector :: Maybe ToTsVector -- ^ If the field should be converted using to_tsvector(<language>, <field>)
, cfIRType :: Text -- ^ The native Postgres type of the field, the intermediate (IR) type before mapping.
, cfTransform :: Maybe TransformerProc -- ^ The optional mapping from irType -> targetType.
, cfDefault :: Maybe Text

Check warning on line 47 in src/PostgREST/Plan/Types.hs

View check run for this annotation

Codecov / codecov/patch

src/PostgREST/Plan/Types.hs#L46-L47

Added lines #L46 - L47 were not covered by tests
, cfFullRow :: Bool -- ^ True if the field represents the whole selected row. Used in spread rels: instead of COUNT(*), it does a COUNT(<row>) in order to not mix with other spreaded resources.
} deriving (Eq, Show)

unknownField :: FieldName -> JsonPath -> CoercibleField
unknownField name path = CoercibleField name path False "" Nothing Nothing False
unknownField name path = CoercibleField name path False Nothing "" Nothing Nothing False

-- | Like an API request LogicTree, but with coercible field information.
data CoercibleLogicTree
Expand Down
2 changes: 1 addition & 1 deletion src/PostgREST/Query/QueryBuilder.hs
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ callPlanToQuery (FunctionCall qi params arguments returnsScalar returnsSetOfScal
KeyParams [] -> "FROM " <> callIt mempty
KeyParams prms -> case arguments of
DirectArgs args -> "FROM " <> callIt (fmtArgs prms args)
JsonArgs json -> fromJsonBodyF json ((\p -> CoercibleField (ppName p) mempty False (ppTypeMaxLength p) Nothing Nothing False) <$> prms) False True False <> ", " <>
JsonArgs json -> fromJsonBodyF json ((\p -> CoercibleField (ppName p) mempty False Nothing (ppTypeMaxLength p) Nothing Nothing False) <$> prms) False True False <> ", " <>
"LATERAL " <> callIt (fmtParams prms)

callIt :: SQL.Snippet -> SQL.Snippet
Expand Down
20 changes: 14 additions & 6 deletions src/PostgREST/Query/SqlFragment.hs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ import PostgREST.Plan.Types (CoercibleField (..),
CoercibleSelectField (..),
RelSelectField (..),
SpreadSelectField (..),
ToTsVector (..),
unknownField)
import PostgREST.RangeQuery (NonnegRange, allRange,
rangeLimit, rangeOffset)
Expand Down Expand Up @@ -252,10 +253,15 @@ pgFmtCallUnary :: Text -> SQL.Snippet -> SQL.Snippet
pgFmtCallUnary f x = SQL.sql (encodeUtf8 f) <> "(" <> x <> ")"

pgFmtField :: QualifiedIdentifier -> CoercibleField -> SQL.Snippet
pgFmtField table CoercibleField{cfFullRow=True} = fromQi table
pgFmtField table CoercibleField{cfName=fn, cfJsonPath=[]} = pgFmtColumn table fn
pgFmtField table CoercibleField{cfName=fn, cfToJson=doToJson, cfJsonPath=jp} | doToJson = "to_jsonb(" <> pgFmtColumn table fn <> ")" <> pgFmtJsonPath jp
| otherwise = pgFmtColumn table fn <> pgFmtJsonPath jp
pgFmtField table cf = case cfToTsVector cf of
Just (ToTsVector lang) -> "to_tsvector(" <> pgFmtFtsLang lang <> fmtFld <> ")"
_ -> fmtFld
where
fmtFld = case cf of
CoercibleField{cfFullRow=True} -> fromQi table
CoercibleField{cfName=fn, cfJsonPath=[]} -> pgFmtColumn table fn
CoercibleField{cfName=fn, cfToJson=doToJson, cfJsonPath=jp} | doToJson -> "to_jsonb(" <> pgFmtColumn table fn <> ")" <> pgFmtJsonPath jp
| otherwise -> pgFmtColumn table fn <> pgFmtJsonPath jp

-- Select the value of a named element from a table, applying its optional coercion mapping if any.
pgFmtTableCoerce :: QualifiedIdentifier -> CoercibleField -> SQL.Snippet
Expand Down Expand Up @@ -399,16 +405,18 @@ pgFmtFilter table (CoercibleFilter fld (OpExpr hasNot oper)) = notOp <> " " <> p
[""] -> "= ANY('{}') "
_ -> "= ANY (" <> pgFmtArrayLiteralForField vals fld <> ") "

Fts op lang val -> " " <> ftsOperator op <> "(" <> ftsLang lang <> unknownLiteral val <> ") "
Fts op lang val -> " " <> ftsOperator op <> "(" <> pgFmtFtsLang lang <> unknownLiteral val <> ") "
where
ftsLang = maybe mempty (\l -> unknownLiteral l <> ", ")
notOp = if hasNot then "NOT" else mempty
star c = if c == '*' then '%' else c
fmtQuant q val = case q of
Just QuantAny -> "ANY(" <> val <> ")"
Just QuantAll -> "ALL(" <> val <> ")"
Nothing -> val

pgFmtFtsLang :: Maybe Text -> SQL.Snippet
pgFmtFtsLang = maybe mempty (\l -> unknownLiteral l <> ", ")

pgFmtJoinCondition :: JoinCondition -> SQL.Snippet
pgFmtJoinCondition (JoinCondition (qi1, col1) (qi2, col2)) =
pgFmtColumn qi1 col1 <> " = " <> pgFmtColumn qi2 col2
Expand Down
13 changes: 12 additions & 1 deletion test/spec/Feature/Query/AndOrParamsSpec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ spec =
it "can handle is" $
get "/entities?and=(name.is.null,arr.is.null)&select=id" `shouldRespondWith`
[json|[{ "id": 4 }]|] { matchHeaders = [matchContentTypeJson] }
it "can handle fts" $ do
it "can handle fts on tsvector columns" $ do
get "/entities?or=(text_search_vector.fts.bar,text_search_vector.fts.baz)&select=id" `shouldRespondWith`
[json|[{ "id": 1 }, { "id": 2 }]|] { matchHeaders = [matchContentTypeJson] }
get "/tsearch?or=(text_search_vector.plfts(german).Art%20Spass, text_search_vector.plfts(french).amusant%20impossible, text_search_vector.fts(english).impossible)" `shouldRespondWith`
Expand All @@ -90,6 +90,17 @@ spec =
{"text_search_vector": "'amus':5 'fair':7 'impossibl':9 'peu':4" },
{"text_search_vector": "'art':4 'spass':5 'unmog':7"}
]|] { matchHeaders = [matchContentTypeJson] }
it "can handle fts on text and json columns" $ do
get "/grandchild_entities?or=(jsonb_col.fts.bar,jsonb_col.fts.foo)&select=jsonb_col" `shouldRespondWith`
[json|[
{ "jsonb_col": {"a": {"b":"foo"}} },
{ "jsonb_col": {"b":"bar"} }]
|] { matchHeaders = [matchContentTypeJson] }
get "/tsearch_to_tsvector?and=(text_search.not.plfts(german).Art%20Spass, text_search.not.plfts(french).amusant%20impossible, text_search.not.fts(english).impossible)&select=text_search" `shouldRespondWith`
[json|[
{ "text_search": "But also fun to do what is possible" },
{ "text_search": "Fat cats ate rats" }]
|] { matchHeaders = [matchContentTypeJson] }
it "can handle isdistinct" $
get "/entities?and=(id.gte.2,arr.isdistinct.{1,2})&select=id" `shouldRespondWith`
[json|[{ "id": 3 }, { "id": 4 }]|] { matchHeaders = [matchContentTypeJson] }
Expand Down
Loading

0 comments on commit cb069ea

Please sign in to comment.