Skip to content

Commit

Permalink
Documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
ccampo133 committed Apr 2, 2024
1 parent 11a1d59 commit 05827ac
Show file tree
Hide file tree
Showing 14 changed files with 184 additions and 92 deletions.
9 changes: 9 additions & 0 deletions discovery/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Package discovery provides mechanisms to perform database introspection and
// data discovery on various data repositories. It provides a Scanner type that
// can be used to scan a data repository for sensitive data, classify the data,
// and publish the results to external sources. Additionally, the sql subpackage
// provides various SQL repository implementations that can be used to
// introspect and sample SQL-based data repositories. Support for additional
// data repository types, such as NoSQL-based repos, is intended to be added in
// the future.
package discovery
3 changes: 2 additions & 1 deletion discovery/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ import (

// Scanner is a data discovery scanner that scans a data repository for
// sensitive data. It also classifies the data and publishes the results to
// the configured external sources.
// the configured external sources. It currently only supports SQL-based
// repositories.
type Scanner struct {
config *config.Config
repository sql.Repository
Expand Down
17 changes: 12 additions & 5 deletions discovery/sql/denodo.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,17 +69,22 @@ func NewDenodoRepository(cfg config.RepoConfig) (*DenodoRepository, error) {
return &DenodoRepository{genericSqlRepo: sqlRepo}, nil
}

// TODO: godoc -ccampo 2024-04-02
// ListDatabases is left unimplemented for Denodo, because Denodo doesn't have
// the concept of databases.
func (r *DenodoRepository) ListDatabases(_ context.Context) ([]string, error) {
return nil, errors.New("ListDatabases is not implemented for Denodo repositories")
}

// TODO: godoc -ccampo 2024-04-02
// Introspect delegates introspection to GenericRepository, using a
// Denodo-specific introspection query. See Repository.Introspect and
// GenericRepository.IntrospectWithQuery for more details.
func (r *DenodoRepository) Introspect(ctx context.Context) (*Metadata, error) {
return r.genericSqlRepo.IntrospectWithQuery(ctx, DenodoIntrospectQuery)
}

// TODO: godoc -ccampo 2024-04-02
// SampleTable delegates sampling to GenericRepository, using a Denodo-specific
// table sample query. See Repository.SampleTable and
// GenericRepository.SampleTableWithQuery for more details.
func (r *DenodoRepository) SampleTable(
ctx context.Context,
meta *TableMetadata,
Expand All @@ -98,12 +103,14 @@ func (r *DenodoRepository) SampleTable(
return r.genericSqlRepo.SampleTableWithQuery(ctx, meta, query)
}

// TODO: godoc -ccampo 2024-04-02
// Ping delegates the ping to GenericRepository. See Repository.Ping and
// GenericRepository.Ping for more details.
func (r *DenodoRepository) Ping(ctx context.Context) error {
return r.genericSqlRepo.Ping(ctx)
}

// TODO: godoc -ccampo 2024-04-02
// Close delegates the close to GenericRepository. See Repository.Close and
// GenericRepository.Close for more details.
func (r *DenodoRepository) Close() error {
return r.genericSqlRepo.Close()
}
21 changes: 7 additions & 14 deletions discovery/sql/doc.go
Original file line number Diff line number Diff line change
@@ -1,15 +1,8 @@
// Package sql provides an API for performing data discovery on SQL databases.
// The Repository type encapsulates the concept of a Dmap data SQL repository.
// The package provides a registry for all supported repository implementations
// and a factory function to create new instances of a repository
// from the registry. All supported repositories are represented as sub-packages
// of the repository name, e.g. mysql, postgresql, etc.
//
// Repository implementations should reside in their own sub-package of the
// repository package. Each implementation register itself with the repository
// registry by calling the Register function with a RepoConstructor function
// that returns a new instance of the repository implementation. This will make
// the repository implementation available to the NewRepository factory
// function. Registration is typically done in the sub-package's init function.
// TODO: fix this doc -ccampo 2024-04-02
// Package sql provides an API for performing database introspection and data
// discovery on SQL databases. The Repository type encapsulates the concept of a
// Dmap data SQL repository. The package provides a Registry for all supported
// repository implementations and a factory function to create new instances of
// a repository from the registry. All out-of-the-box Repository implementations
// are included in their own files named after the repository type, e.g.
// mysql.go, postgres.go, etc.
package sql
41 changes: 33 additions & 8 deletions discovery/sql/generic.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package sql
import (
"context"
"database/sql"
"errors"
"fmt"

log "github.com/sirupsen/logrus"
Expand Down Expand Up @@ -36,9 +37,14 @@ const (
// subset of ANSI SQL compatible databases. Many Repository implementations may
// partially or fully delegate to this implementation. In that respect, it acts
// somewhat as a base implementation which can be used by SQL-compatible
// repositories. Note however that GenericRepository is not an implementation of
// the Repository interface, and is meant to be used as a building block for
// other Repository implementations.
// repositories. Note that while GenericRepository is an implementation of
// the Repository interface, GenericRepository is meant to be used as a building
// block for other Repository implementations, rather than as a standalone
// implementation. Specifically, the Repository.ListDatabases method is left
// unimplemented, since there is no standard way to list databases across
// different SQL database platforms. It does however provide the
// ListDatabasesWithQuery method, which dependent implementations can use to
// provide a custom query to list databases.
type GenericRepository struct {
repoName string
repoType string
Expand All @@ -48,6 +54,8 @@ type GenericRepository struct {
excludePaths []glob.Glob
}

var _ Repository = (*GenericRepository)(nil)

// NewGenericRepository is a constructor for the GenericRepository type. It
// opens a database handle for a given repoType and returns a pointer to a new
// GenericRepository instance. A connection may or may not be established
Expand All @@ -68,7 +76,7 @@ func NewGenericRepository(
*GenericRepository,
error,
) {
db, err := getDbHandle(repoType, connStr, maxOpenConns)
db, err := newDbHandle(repoType, connStr, maxOpenConns)
if err != nil {
return nil, fmt.Errorf("error retrieving DB handle for repo type %s: %w", repoType, err)
}
Expand All @@ -93,6 +101,13 @@ func NewGenericRepositoryFromDB(repoName, repoType, database string, db *sql.DB)
}
}

// ListDatabases is left unimplemented for GenericRepository, because there is
// no standard way to list databases across different SQL database platforms.
// See ListDatabasesWithQuery for a way to list databases using a custom query.
func (r *GenericRepository) ListDatabases(_ context.Context) ([]string, error) {
return nil, errors.New("ListDatabases is not implemented")
}

// ListDatabasesWithQuery returns a list of the names of all databases on the
// server, as determined by the given query. The query is expected to return
// a row set containing a single column corresponding to the database name. If
Expand Down Expand Up @@ -159,7 +174,10 @@ func (r *GenericRepository) IntrospectWithQuery(
return repoMeta, nil
}

// TODO: godoc -ccampo 2024-04-02
// SampleTable samples the table referenced by the TableMetadata meta parameter
// by issuing a standard, ANSI-compatible SELECT query to the database. All
// attributes of the table are selected, and are quoted using double quotes. See
// Repository.SampleTable for more details.
func (r *GenericRepository) SampleTable(
ctx context.Context,
meta *TableMetadata,
Expand Down Expand Up @@ -213,7 +231,8 @@ func (r *GenericRepository) SampleTableWithQuery(
return sample, nil
}

// TODO: godoc -ccampo 2024-04-02
// Ping verifies the connection to the database used by this repository by
// executing a simple query. If the query fails, an error is returned.
func (r *GenericRepository) Ping(ctx context.Context) error {
log.Tracef("Query: %s", GenericPingQuery)
rows, err := r.db.QueryContext(ctx, GenericPingQuery)
Expand All @@ -229,12 +248,15 @@ func (r *GenericRepository) GetDb() *sql.DB {
return r.db
}

// TODO: godoc -ccampo 2024-04-02
// Close closes the database connection used by the repository.
func (r *GenericRepository) Close() error {
return r.db.Close()
}

func getDbHandle(repoType, connStr string, maxOpenConns uint) (*sql.DB, error) {
// newDbHandle opens a new database sql.DB handle for the given repoType and
// connection string. The maxOpenConns parameter specifies the maximum number of
// open connections to the database.
func newDbHandle(repoType, connStr string, maxOpenConns uint) (*sql.DB, error) {
db, err := sql.Open(repoType, connStr)
if err != nil {
return nil, err
Expand Down Expand Up @@ -328,6 +350,9 @@ func getCurrentRowAsMap(rows *sql.Rows) (map[string]any, error) {
return row, nil
}

// matchPathPatterns checks if the path formed by the given database, schema,
// and table (formatted as "database.schema.table") matches any of the given
// glob patterns. It returns true if the path matches any of the patterns, and
// false otherwise.
func matchPathPatterns(database, schema, table string, patterns []glob.Glob) bool {
for _, pattern := range patterns {
if pattern.Match(fmt.Sprintf("%s.%s.%s", database, schema, table)) {
Expand Down
23 changes: 16 additions & 7 deletions discovery/sql/mysql.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,35 +61,44 @@ func NewMySqlRepository(cfg config.RepoConfig) (*MySqlRepository, error) {
return &MySqlRepository{genericSqlRepo: sqlRepo}, nil
}

// TODO: godoc -ccampo 2024-04-02
// ListDatabases returns a list of the names of all databases on the server by
// using a MySQL-specific database query. It delegates the actual work to
// GenericRepository.ListDatabasesWithQuery - see that method for more details.
func (r *MySqlRepository) ListDatabases(ctx context.Context) ([]string, error) {
return r.genericSqlRepo.ListDatabasesWithQuery(ctx, MySqlDatabaseQuery)
}

// TODO: godoc -ccampo 2024-04-02
// Introspect delegates introspection to GenericRepository. See
// Repository.Introspect and GenericRepository.Introspect for more details.
func (r *MySqlRepository) Introspect(ctx context.Context) (*Metadata, error) {
return r.genericSqlRepo.Introspect(ctx)
}

// TODO: godoc -ccampo 2024-04-02
// SampleTable delegates sampling to GenericRepository, using the generic
// sample query template with MySQL-style (backtick) identifier quoting. See
// Repository.SampleTable and GenericRepository.SampleTableWithQuery for more
// details.
func (r *MySqlRepository) SampleTable(
ctx context.Context,
meta *TableMetadata,
params SampleParameters,
) (Sample, error) {
// MySQL uses backticks to quote identifiers
// MySQL uses backticks to quote identifiers.
attrStr := meta.QuotedAttributeNamesString("`")
// The generic select/limit/offset query and ? placeholders work fine with MySQL
// The generic select/limit/offset query and ? placeholders work fine with
// MySQL.
query := fmt.Sprintf(GenericSampleQueryTemplate, attrStr, meta.Schema, meta.Name)
return r.genericSqlRepo.SampleTableWithQuery(ctx, meta, query, params.SampleSize, params.Offset)
}

// TODO: godoc -ccampo 2024-04-02
// Ping delegates the ping to GenericRepository. See Repository.Ping and
// GenericRepository.Ping for more details.
func (r *MySqlRepository) Ping(ctx context.Context) error {
return r.genericSqlRepo.Ping(ctx)
}

// TODO: godoc -ccampo 2024-04-02
// Close delegates the close to GenericRepository. See Repository.Close and
// GenericRepository.Close for more details.
func (r *MySqlRepository) Close() error {
return r.genericSqlRepo.Close()
}
31 changes: 19 additions & 12 deletions discovery/sql/oracle.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,20 +84,24 @@ func (r *OracleRepository) ListDatabases(_ context.Context) ([]string, error) {
return nil, errors.New("ListDatabases is not implemented for Oracle repos")
}

// TODO: godoc -ccampo 2024-04-02
// Introspect delegates introspection to GenericRepository, using an
// Oracle-specific introspection query. See Repository.Introspect and
// GenericRepository.IntrospectWithQuery for more details.
func (r *OracleRepository) Introspect(ctx context.Context) (*Metadata, error) {
return r.genericSqlRepo.IntrospectWithQuery(ctx, OracleIntrospectQuery)
}

// TODO: godoc -ccampo 2024-04-02
// SampleTable delegates sampling to GenericRepository, using an Oracle-specific
// table sample query. See Repository.SampleTable and
// GenericRepository.SampleTableWithQuery for more details.
func (r *OracleRepository) SampleTable(
ctx context.Context,
meta *TableMetadata,
params SampleParameters,
) (Sample, error) {
// Oracle uses double-quotes to quote identifiers
// Oracle uses double-quotes to quote identifiers.
attrStr := meta.QuotedAttributeNamesString("\"")
// Oracle uses :x for placeholders
// Oracle uses :x for placeholders.
query := fmt.Sprintf(
"SELECT %s FROM %s.%s OFFSET :1 ROWS FETCH NEXT :2 ROWS ONLY",
attrStr, meta.Schema, meta.Name,
Expand All @@ -106,26 +110,29 @@ func (r *OracleRepository) SampleTable(
}

// Ping verifies the connection to the Oracle database used by this repository.
// Normally we would just delegate to the Ping method implemented by
// genericOracle However, that implementation executes a
// 'SELECT 1' query to test for connectivity, and Oracle being Oracle, does not
// like this. So instead, we defer to the native Ping method implemented by the
// Oracle DB driver.
// Normally we would just delegate to GenericRepository.Ping; however, that
// implementation executes a 'SELECT 1' query to test for connectivity, which
// Oracle does not accept. Instead, we defer to the native Ping method
// implemented by the Oracle DB driver.
func (r *OracleRepository) Ping(ctx context.Context) error {
return r.genericSqlRepo.GetDb().PingContext(ctx)
}

// TODO: godoc -ccampo 2024-04-02
// Close delegates the close to GenericRepository. See Repository.Close and
// GenericRepository.Close for more details.
func (r *OracleRepository) Close() error {
return r.genericSqlRepo.Close()
}

// TODO: godoc -ccampo 2024-04-02
// OracleConfig is a struct to hold Oracle-specific configuration.
type OracleConfig struct {
// ServiceName is the Oracle service name.
ServiceName string
}

// TODO: godoc -ccampo 2024-04-02
// ParseOracleConfig parses the Oracle-specific configuration from the
// given config. The Oracle configuration is expected to be in the
// config's "advanced config" property.
func ParseOracleConfig(cfg config.RepoConfig) (*OracleConfig, error) {
oracleCfg, err := config.FetchAdvancedConfigString(
cfg,
Expand Down
33 changes: 20 additions & 13 deletions discovery/sql/postgres.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,17 +70,23 @@ func NewPostgresRepository(cfg config.RepoConfig) (*PostgresRepository, error) {
return &PostgresRepository{genericSqlRepo: sqlRepo}, nil
}

// TODO: godoc -ccampo 2024-04-02
// ListDatabases returns a list of the names of all databases on the server by
// using a Postgres-specific database query. It delegates the actual work to
// GenericRepository.ListDatabasesWithQuery - see that method for more details.
func (r *PostgresRepository) ListDatabases(ctx context.Context) ([]string, error) {
return r.genericSqlRepo.ListDatabasesWithQuery(ctx, PostgresDatabaseQuery)
}

// TODO: godoc -ccampo 2024-04-02
// Introspect delegates introspection to GenericRepository. See
// Repository.Introspect and GenericRepository.Introspect for more details.
func (r *PostgresRepository) Introspect(ctx context.Context) (*Metadata, error) {
return r.genericSqlRepo.Introspect(ctx)
}

// TODO: godoc -ccampo 2024-04-02
// SampleTable delegates sampling to GenericRepository, using a
// Postgres-specific table sample query. See Repository.SampleTable and
// GenericRepository.SampleTableWithQuery for more details.
func (r *PostgresRepository) SampleTable(
ctx context.Context,
meta *TableMetadata,
Expand All @@ -93,30 +99,31 @@ func (r *PostgresRepository) SampleTable(
return r.genericSqlRepo.SampleTableWithQuery(ctx, meta, query, params.SampleSize, params.Offset)
}

// TODO: godoc -ccampo 2024-04-02
// Ping delegates the ping to GenericRepository. See Repository.Ping and
// GenericRepository.Ping for more details.
func (r *PostgresRepository) Ping(ctx context.Context) error {
return r.genericSqlRepo.Ping(ctx)
}

// TODO: godoc -ccampo 2024-04-02
// Close delegates the close to GenericRepository. See Repository.Close and
// GenericRepository.Close for more details.
func (r *PostgresRepository) Close() error {
return r.genericSqlRepo.Close()
}

// TODO: godoc -ccampo 2024-04-02
// PostgresConfig contains Postgres-specific configuration parameters.
type PostgresConfig struct {
// ConnOptsStr is a string containing Postgres-specific connection options.
ConnOptsStr string
}

// ParsePostgresConfig produces a config structure with Postgres-specific
// parameters found in the repo config.
// ParsePostgresConfig parses the Postgres-specific configuration parameters
// from the given config. The Postgres connection options are built from the
// config and stored in the ConnOptsStr field of the returned PostgresConfig.
func ParsePostgresConfig(cfg config.RepoConfig) (*PostgresConfig, error) {
connOptsStr, err := config.BuildConnOptsStr(cfg)
if err != nil {
return nil, err
return nil, fmt.Errorf("error building connection options string: %w", err)
}

return &PostgresConfig{
ConnOptsStr: connOptsStr,
}, nil
return &PostgresConfig{ConnOptsStr: connOptsStr}, nil
}
Loading

0 comments on commit 05827ac

Please sign in to comment.