-
Notifications
You must be signed in to change notification settings - Fork 122
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[CoreNodes] Add cleanup script for Google Drive parents #10442
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,277 @@ | ||||||||||||||
import { concurrentExecutor, CoreAPI, Ok } from "@dust-tt/types"; | ||||||||||||||
import assert from "assert"; | ||||||||||||||
|
||||||||||||||
import apiConfig from "@app/lib/api/config"; | ||||||||||||||
import { getCorePrimaryDbConnection } from "@app/lib/production_checks/utils"; | ||||||||||||||
import { DataSourceModel } from "@app/lib/resources/storage/models/data_source"; | ||||||||||||||
import { withRetries } from "@app/lib/utils/retries"; | ||||||||||||||
import type Logger from "@app/logger/logger"; | ||||||||||||||
import { makeScript } from "@app/scripts/helpers"; | ||||||||||||||
|
||||||||||||||
const QUERY_BATCH_SIZE = 256; | ||||||||||||||
const NODE_CONCURRENCY = 16; | ||||||||||||||
|
||||||||||||||
async function migrateDocument({ | ||||||||||||||
coreAPI, | ||||||||||||||
dataSource, | ||||||||||||||
coreNode, | ||||||||||||||
execute, | ||||||||||||||
skipIfParentsAreAlreadyCorrect, | ||||||||||||||
logger, | ||||||||||||||
}: { | ||||||||||||||
coreAPI: CoreAPI; | ||||||||||||||
dataSource: DataSourceModel; | ||||||||||||||
coreNode: { | ||||||||||||||
parents: string[]; | ||||||||||||||
node_id: string; | ||||||||||||||
}; | ||||||||||||||
execute: boolean; | ||||||||||||||
skipIfParentsAreAlreadyCorrect: boolean; | ||||||||||||||
logger: typeof Logger; | ||||||||||||||
}) { | ||||||||||||||
let newParents = coreNode.parents; | ||||||||||||||
let newParentId: string | null = null; | ||||||||||||||
try { | ||||||||||||||
const uniqueIds = [ | ||||||||||||||
new Set( | ||||||||||||||
[coreNode.node_id, ...coreNode.parents].map((x) => | ||||||||||||||
x.replace("gdrive-", "") | ||||||||||||||
) | ||||||||||||||
), | ||||||||||||||
]; | ||||||||||||||
newParents = uniqueIds.map((id) => `gdrive-${id}`); | ||||||||||||||
newParentId = newParents[1] || null; | ||||||||||||||
} catch (e) { | ||||||||||||||
logger.error( | ||||||||||||||
{ | ||||||||||||||
nodeId: coreNode.node_id, | ||||||||||||||
parents: coreNode.parents, | ||||||||||||||
}, | ||||||||||||||
`TRANSFORM_ERROR` | ||||||||||||||
); | ||||||||||||||
throw e; | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
if ( | ||||||||||||||
skipIfParentsAreAlreadyCorrect && | ||||||||||||||
newParents.every((x, i) => x === coreNode.parents[i]) | ||||||||||||||
) { | ||||||||||||||
logger.info( | ||||||||||||||
{ | ||||||||||||||
documentId: coreNode.node_id, | ||||||||||||||
fromParents: coreNode.parents, | ||||||||||||||
toParents: newParents, | ||||||||||||||
}, | ||||||||||||||
`SKIP document (parents are already correct)` | ||||||||||||||
); | ||||||||||||||
return new Ok(undefined); | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
if (execute) { | ||||||||||||||
await withRetries( | ||||||||||||||
async () => { | ||||||||||||||
const updateRes = await coreAPI.updateDataSourceDocumentParents({ | ||||||||||||||
projectId: dataSource.dustAPIProjectId, | ||||||||||||||
dataSourceId: dataSource.dustAPIDataSourceId, | ||||||||||||||
documentId: coreNode.node_id, | ||||||||||||||
parents: newParents, | ||||||||||||||
parentId: newParentId, | ||||||||||||||
}); | ||||||||||||||
if (updateRes.isErr()) { | ||||||||||||||
logger.error( | ||||||||||||||
{ | ||||||||||||||
nodeId: coreNode.node_id, | ||||||||||||||
fromParents: coreNode.parents, | ||||||||||||||
toParents: newParents, | ||||||||||||||
toParentId: newParentId, | ||||||||||||||
}, | ||||||||||||||
`Error while updating parents` | ||||||||||||||
); | ||||||||||||||
throw new Error(updateRes.error.message); | ||||||||||||||
} | ||||||||||||||
}, | ||||||||||||||
{ retries: 10 } | ||||||||||||||
)({}); | ||||||||||||||
|
||||||||||||||
logger.info( | ||||||||||||||
{ | ||||||||||||||
nodeId: coreNode.node_id, | ||||||||||||||
fromParents: coreNode.parents, | ||||||||||||||
toParents: newParents, | ||||||||||||||
}, | ||||||||||||||
`LIVE` | ||||||||||||||
); | ||||||||||||||
} else { | ||||||||||||||
logger.info( | ||||||||||||||
{ | ||||||||||||||
nodeId: coreNode.node_id, | ||||||||||||||
fromParents: coreNode.parents, | ||||||||||||||
toParents: newParents, | ||||||||||||||
}, | ||||||||||||||
`DRY` | ||||||||||||||
); | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
return new Ok(undefined); | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
async function migrateDataSource({ | ||||||||||||||
coreAPI, | ||||||||||||||
dataSource, | ||||||||||||||
execute, | ||||||||||||||
skipIfParentsAreAlreadyCorrect, | ||||||||||||||
parentLogger, | ||||||||||||||
}: { | ||||||||||||||
coreAPI: CoreAPI; | ||||||||||||||
dataSource: DataSourceModel; | ||||||||||||||
execute: boolean; | ||||||||||||||
skipIfParentsAreAlreadyCorrect: boolean; | ||||||||||||||
parentLogger: typeof Logger; | ||||||||||||||
}) { | ||||||||||||||
const logger = parentLogger.child({ dataSourceId: dataSource.id }); | ||||||||||||||
const corePrimary = getCorePrimaryDbConnection(); | ||||||||||||||
|
||||||||||||||
// Retrieve the core data source. | ||||||||||||||
const [coreDataSourceRows] = (await corePrimary.query( | ||||||||||||||
`SELECT id, data_source_id | ||||||||||||||
FROM data_sources | ||||||||||||||
WHERE project = ? | ||||||||||||||
AND data_source_id = ?`, | ||||||||||||||
{ | ||||||||||||||
replacements: [ | ||||||||||||||
dataSource.dustAPIProjectId, | ||||||||||||||
dataSource.dustAPIDataSourceId, | ||||||||||||||
], | ||||||||||||||
} | ||||||||||||||
)) as { id: number; data_source_id: string }[][]; | ||||||||||||||
|
||||||||||||||
assert( | ||||||||||||||
coreDataSourceRows.length === 1 && | ||||||||||||||
coreDataSourceRows[0].data_source_id === dataSource.dustAPIDataSourceId, | ||||||||||||||
"Core data source mismatch" | ||||||||||||||
); | ||||||||||||||
const coreDataSourceId = coreDataSourceRows[0].id; | ||||||||||||||
|
||||||||||||||
// For all nodes in the data source (can be big). | ||||||||||||||
let nextId = ""; | ||||||||||||||
|
||||||||||||||
for (;;) { | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: those are hard to read, i suggest you always use a control flow statement expliciting the condition here, or if no condition use forEach / map There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can turn this into a do while but since I have to keep the break the condition isn't doing anything so didn't bother with an unused condition There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so i'd have gone with while(nextId) and having nextId = rows[rows.length - 1]?.node_id; There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah that works too! |
||||||||||||||
const [rows] = (await (async () => { | ||||||||||||||
return corePrimary.query( | ||||||||||||||
`SELECT node_id, parents | ||||||||||||||
FROM data_sources_nodes | ||||||||||||||
WHERE data_source = :coreDataSourceId | ||||||||||||||
AND node_id > :nextId | ||||||||||||||
AND EXISTS | ||||||||||||||
( | ||||||||||||||
SELECT 1 | ||||||||||||||
FROM UNNEST(parents) p | ||||||||||||||
WHERE p NOT LIKE 'gdrive-%' | ||||||||||||||
) | ||||||||||||||
ORDER BY node_id | ||||||||||||||
LIMIT :batchSize`, | ||||||||||||||
{ | ||||||||||||||
replacements: { | ||||||||||||||
coreDataSourceId, | ||||||||||||||
nextId, | ||||||||||||||
batchSize: QUERY_BATCH_SIZE, | ||||||||||||||
}, | ||||||||||||||
} | ||||||||||||||
); | ||||||||||||||
})()) as { | ||||||||||||||
parents: string[]; | ||||||||||||||
node_id: string; | ||||||||||||||
}[][]; | ||||||||||||||
|
||||||||||||||
logger.info({ nextId, rowCount: rows.length }, "BATCH"); | ||||||||||||||
|
||||||||||||||
if (rows.length === 0) { | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i understand it might be a little faster to write, but it's better to optimize for easier to understand :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have to keep this break for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not necessarily, but fine as is 👍 |
||||||||||||||
break; | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
nextId = rows[rows.length - 1].node_id; | ||||||||||||||
|
||||||||||||||
// concurrentExecutor on documents | ||||||||||||||
try { | ||||||||||||||
await concurrentExecutor( | ||||||||||||||
rows, | ||||||||||||||
(coreNode) => | ||||||||||||||
migrateDocument({ | ||||||||||||||
coreAPI, | ||||||||||||||
dataSource, | ||||||||||||||
coreNode, | ||||||||||||||
skipIfParentsAreAlreadyCorrect, | ||||||||||||||
execute, | ||||||||||||||
logger, | ||||||||||||||
}), | ||||||||||||||
{ concurrency: NODE_CONCURRENCY } | ||||||||||||||
); | ||||||||||||||
} catch (e) { | ||||||||||||||
logger.error( | ||||||||||||||
{ | ||||||||||||||
error: e, | ||||||||||||||
nextDataSourceId: dataSource.id, | ||||||||||||||
nextId, | ||||||||||||||
}, | ||||||||||||||
`ERROR` | ||||||||||||||
); | ||||||||||||||
throw e; | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
async function migrateAll({ | ||||||||||||||
coreAPI, | ||||||||||||||
nextDataSourceId, | ||||||||||||||
execute, | ||||||||||||||
skipIfParentsAreAlreadyCorrect, | ||||||||||||||
logger, | ||||||||||||||
}: { | ||||||||||||||
coreAPI: CoreAPI; | ||||||||||||||
nextDataSourceId: number; | ||||||||||||||
execute: boolean; | ||||||||||||||
skipIfParentsAreAlreadyCorrect: boolean; | ||||||||||||||
logger: typeof Logger; | ||||||||||||||
}) { | ||||||||||||||
// retrieve all data sources for the provider | ||||||||||||||
const dataSources = await DataSourceModel.findAll({ | ||||||||||||||
where: { connectorProvider: "google_drive" }, | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||
order: [["id", "ASC"]], | ||||||||||||||
}); | ||||||||||||||
|
||||||||||||||
for (const dataSource of dataSources) { | ||||||||||||||
if (dataSource.id >= nextDataSourceId) { | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do that in the where clause There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I actually meant to log when skipping a datasource but forgot about it, adding it There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not sure I get that. You use this condition to make the nextDataSourceId param useful it's not critical to the PR so still going to approve though There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the purpose is that it doesn't cost anything and if anything fishy happens on a datasource, I'd grep on the datasource ID and it's more readable to see that it was skipped vs not seeing anything and having to check in the script what it means. In general I'd rather not optimize queries that are ran once and are fast anyway if it allows me to have more easily parsable logs |
||||||||||||||
logger.info({ dataSourceId: dataSource.id }, "MIGRATING"); | ||||||||||||||
await migrateDataSource({ | ||||||||||||||
coreAPI, | ||||||||||||||
dataSource, | ||||||||||||||
execute, | ||||||||||||||
skipIfParentsAreAlreadyCorrect, | ||||||||||||||
parentLogger: logger, | ||||||||||||||
}); | ||||||||||||||
} else { | ||||||||||||||
logger.info({ dataSourceId: dataSource.id }, "SKIP"); | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
makeScript( | ||||||||||||||
{ | ||||||||||||||
skipIfParentsAreAlreadyCorrect: { type: "boolean", default: false }, | ||||||||||||||
nextDataSourceId: { type: "number", default: 0 }, | ||||||||||||||
}, | ||||||||||||||
async ( | ||||||||||||||
{ nextDataSourceId, execute, skipIfParentsAreAlreadyCorrect }, | ||||||||||||||
logger | ||||||||||||||
) => { | ||||||||||||||
const coreAPI = new CoreAPI(apiConfig.getCoreAPIConfig(), logger); | ||||||||||||||
|
||||||||||||||
await migrateAll({ | ||||||||||||||
coreAPI, | ||||||||||||||
nextDataSourceId, | ||||||||||||||
execute, | ||||||||||||||
skipIfParentsAreAlreadyCorrect, | ||||||||||||||
logger, | ||||||||||||||
}); | ||||||||||||||
} | ||||||||||||||
); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no guarantees on resulting order when using set
unless you're really sure, we'd need to have guarantees on parents sorting
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thought order was preserved when writing this, gonna check again out of curiosity, changing if needed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unlike most languages, js Sets actually preserve order: https://dust.tt/w/0ec9852c2f/assistant/FycTesggB2
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok, thanks for looking
so here there is an assumption that botched parents are still present in the right order — which indeed seems correct to me from afar
IMO for unconventional language features such as this (in the future, not here), might warrant a comment (showing you know or you checked)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, if the order of the parents was lost at any point we won't recover it here (we would need to run a backfill relying on connectors db)