URL: http://www.sqlservercentral.com/scripts/Data+exploration/71008/
USAGE First run this script is the database where you want to store this procedure Then you can run the stored procedure with the following statement:
EXEC sp_ObjectExplore 'database.schema.tablename'
You do not need to supply databasename and schema (example: dbo) when table is in same database as this procedure.
If you want additional metrics concerning the distribution of all numeric variables, execute with parameter @distribution='yes'. Example: EXEC sp_ObjectExplore 'database.schema.tablename' , @distribution='yes'
If you are only interested in a subset of columns, you can specify them in the parameter @columns Columnnames must be separated by a comma. You can specify them either with or without brackets [] Columnnames may not contain commas. Example: EXEC sp_ObjectExplore 'database.schema.tablename' , @distribution='yes' , @columns='column1, column2, etc'
PURPOSE Provide summary information and metrics for any column in a given Table/View Metrics included are: * Total number of records in Object (Records) * Number of values in column (Cnt) * Number of unique values in column (CntDist) * Number of NULL values in column (NullValues) * Min and Max value in column (Min/Max) * Average value in column (Avg) * Standard Deviation in column (StDev) * Number of numeric values in column (IsNum) * Number of integer values in column (IsInt) Optional metrics (only computed if procedure is executed with parameter @distribution='yes' * First Quartile/25% (Q1) * Median (Median) * Third Quartile/75% (Q3) * Interquartile range (IQR) * Skew (Skew) * Kurtosis (Kurt)
CREATE PROCEDURE [dbo].[sp_ObjectExplore] @FullObjectName varchar(200) , @Distribution varchar(3) = 'No' , @COLUMNS varchar(max) = NULL AS -- ______________________________________________________ Object Explore _________________________________________________________ -- -- August 19th 2010, by Robin van Schaik -- Version 1.6 BETA -- -- USAGE -- First run this script is the database where you want to store this procedure -- Then you van run the stored procedure with the following statement: -- -- EXEC sp_ObjectExplore 'database.schema.tablename' -- -- You do not need to supply databasename and schema (example: dbo) when table is in same database as this procedure. -- -- If you want additional metrics concerning the distribution of all numeric variables, execute with parameter @distribution='yes'. -- Example: -- EXEC sp_ObjectExplore 'database.schema.tablename' -- , @distribution='yes' -- -- If you are only interested in a subset of columns, you can specify them in the parameter @columns -- Columnnames must be separated by a comma. You can specify them either with or without brackets [] -- Columnnames may not contain commas. -- Example -- EXEC sp_ObjectExplore 'database.schema.tablename' -- , @distribution='yes' -- , @columns='column1, column2, etc' -- -- PURPOSE -- Provide summary information and metrics for any column in a given Table/View -- Metrics included are: -- * Total number of records in Object (Records) -- * Number of values in column (Cnt) -- * Number of unique values in column (CntDist) -- * Number of NULL values in column (NullValues) -- * Min and Max value in column (Min/Max) -- * Average value in column (Avg) -- * Standard Deviation in column (StDev) -- * Number of numeric values in column (IsNum) -- * Number of integer values in column (IsInt) -- Optional metrics (only computed if procedure is executed with parameter @distribution='yes' -- * First Quartile/25% (Q1) -- * Median (Median) -- * Third Quartile/75% (Q3) -- * Interquartile range (IQR) -- * Skew (Skew) -- * Kurtosis (Kurt) -- -- VERSIONS -- 1.0: 01-JUL-2009. -- 1.1: 06-JUL-2009. Changed logic to determine primary key. Script did not work on columns -- with multiple indexes (duplicate rows) -- 1.2: 06-AUG-2009. Min and Max values for date fields converted (back) to yyyy-mm-dd hh:mi:ss(24h) -- Fixing bug for all numeric columnnames (added brackets [] to query) -- Prevent executing count function over certain datatypes (like xml, text, etc). -- 1.3: 31-OCT-2009. Fixed bug. Script did not run when table has a columnname with space(s). -- 1.4: 01-NOV-2009. Prevent executing min and max function over certain datatypes. -- Added Average, Standard Deviation and percentage of NULL values. -- 1.5: 12-NOV-2009. Added IsNum and IsInt variables. Isnum counts the number of records which -- comply with the IsNumeric() statement (numbers including period, $, hiven, etc.) -- IsInt counts the number of records that contain numeric values (only numbers). -- 1.6 BETA: 19-AUG-2010. -- Added Distribution variables for every numeric value. Quartiles, median, IQR, Skew and Kurtosis. -- These statistics are only computed when procedure is executed with parameter @distribution='yes' -- These stats are not computed for columns that are (or appear to be) ID's. -- Added possibility to run this only for a subset of columns, using @columns parameter. -- ******************************************************************************************************************************* SET NOCOUNT ON /* STEP 1: Initialize Procedure */ DECLARE @Object varchar(200) DECLARE @Schema varchar(200) DECLARE @DATABASE varchar(200) -- Break down parameter in Database/Schema/Object SET @Object = PARSENAME(@FullObjectName,1) SET @Schema = ISNULL(PARSENAME(@FullObjectName,2),'dbo') SET @DATABASE = PARSENAME(@FullObjectName,3) -- Creating temporary table to store queries used for calculating metrics CREATE TABLE #ObjectDef ( ObjectName varchar(200) , Columnname varchar(128) , ColumnId int , Query varchar(max) , DataType varchar(50) , MaxLength int , [Precision] int , Scale int , IsNullable tinyint , IsPrimaryKey tinyint , IsIdentity tinyint , SelectedColumn bit ) -- Temporary table used for results/output CREATE TABLE #ObjectAudit ( ObjectName varchar(200) , Columnname varchar(128) , ColumnId int -- Column Specifications , DataType varchar(50) , MaxLength int , [Precision] int , Scale int , IsNullable tinyint , IsPrimaryKey tinyint , IsIdentity tinyint -- Metrics , Records bigint -- All records in table , Cnt bigint -- Number of values in column , [IsNum] bigint -- Is numeric (including period, $, - etc.) , [IsInt] bigint -- Is integer , CntDist bigint -- Number of distinct values in column , NullValues bigint -- Number of Null values in column , [Min] nvarchar(4000) -- Min value in column , [Max] nvarchar(4000) -- Max value in column , [Avg] decimal(38,5) -- Average value in column , [StDev] decimal(38,5) -- Standard Deviation ) -- Insert all columns for object in temp table -- Check appropiate Schema and Database EXEC( 'INSERT INTO #ObjectDef (ObjectName, Columnname, ColumnId, DataType, MaxLength,[Precision],Scale,IsNullable,IsPrimaryKey, IsIdentity ) SELECT ''' +@FullObjectName+''' , b.name , b.column_id , type_name(user_type_id) AS DataType , convert(int, b.max_length) as max_length , b.[precision] , b.scale , b.is_nullable , isnull(c.PrimaryKey,0) , b.is_identity FROM ' +@DATABASE+'.sys.objects a INNER JOIN ' +@DATABASE+'.sys.columns b ON a.object_id=b.object_id LEFT OUTER JOIN ( SELECT a.object_id , 1 as PrimaryKey , c.column_id FROM '+@DATABASE+'.sys.indexes a INNER JOIN '+@DATABASE+'.sys.objects b ON a.object_id=b.parent_object_id AND a.name=b.name INNER JOIN '+@DATABASE+'.sys.index_columns c ON a.index_id=c.index_id AND b.parent_object_id=c.object_id WHERE b.type=''PK'' ) c ON a.object_id=c.object_id AND b.column_id=c.column_id INNER JOIN ' +@DATABASE+'.sys.schemas d ON a.schema_id=d.schema_id WHERE a.name = '''+@Object+''' AND d.name = '''+@Schema+''' ' ) -- Parse string of requested Columns -- If columns are selected the variable SelectedColumn in #ObjectDef will be set to 1 -- All other columns are set to 0. -- If no columns are selected the variable SelectedColumn will remain NULL IF @COLUMNS IS NOT NULL BEGIN CREATE TABLE #columns (ColumnName varchar(128)) DECLARE @ColumnInsert varchar(max) DECLARE @ColumnInsertTrim varchar(130) SET @COLUMNS=','+@COLUMNS -- adding comma in able to maintain logic below if only one column name is supplied WHILE CHARINDEX(',',@COLUMNS)>0 BEGIN SET @ColumnInsert=REVERSE(LEFT(REVERSE(@COLUMNS),charindex(',',REVERSE(@COLUMNS),1)-1)) SET @ColumnInsertTrim=LTRIM(RTRIM(@ColumnInsert)) INSERT INTO #columns (ColumnName) -- If columnnames are specified between brackets, remove brackets: SELECT CASE WHEN @ColumnInsertTrim LIKE '[[]%]' THEN SUBSTRING(@ColumnInsertTrim,2,DATALENGTH(@ColumnInsertTrim)-2) ELSE @ColumnInsertTrim END SET @COLUMNS=REPLACE(@COLUMNS,','+@ColumnInsert,'') END UPDATE #ObjectDef SET SelectedColumn=1 WHERE Columnname in (SELECT Columnname FROM #columns) UPDATE #ObjectDef SET SelectedColumn=0 WHERE SelectedColumn IS NULL END -- Validate input parameters -- If Object does not exist, end script IF NOT EXISTS (SELECT top 1 ObjectName FROM #ObjectDef) BEGIN DROP TABLE #ObjectDef , #ObjectAudit PRINT 'Object '+UPPER(@FullObjectName)+' does not exist! Please enter a valid object name.' RETURN END -- If one or more of the specified columns do not exist, end script -- or columns specified in @Columns? IF @COLUMNS IS NOT NULL BEGIN -- If so check one or more columns do not exist in the specified object IF EXISTS ( SELECT top 1 a.ColumnName FROM #Columns a LEFT OUTER JOIN #ObjectDef b ON a.ColumnName=b.ColumnName WHERE b.ColumnName IS NULL ) -- Are there missing columns? End Script BEGIN -- identify missing columns SELECT a.ColumnName INTO #MissingColumns FROM #Columns a LEFT OUTER JOIN #ObjectDef b ON a.ColumnName=b.ColumnName WHERE b.ColumnName IS NULL DECLARE @MaxMissingColumn varchar(128) SET @MaxMissingColumn='' DECLARE @MissingColumns varchar(max) SET @MissingColumns='' -- Building string to output missing columns to user WHILE (SELECT COUNT(ColumnName) FROM #MissingColumns)>0 BEGIN SET @MaxMissingColumn=(SELECT MAX(ColumnName) FROM #MissingColumns) SET @COLUMNS=REPLACE(@COLUMNS,@MaxMissingColumn,'') DELETE FROM #MissingColumns WHERE ColumnName=@MaxMissingColumn SET @MissingColumns=','+UPPER(@MaxMissingColumn)+@MissingColumns END DROP TABLE #ObjectDef , #ObjectAudit , #Columns -- output to user: PRINT 'One or more of the requested columns do not exist in object: ' + UPPER(@FullObjectName)+ CHAR(13)+'Invalid column(s): '+SUBSTRING(@MissingColumns,2,LEN(@MissingColumns)-1) RETURN END END /* STEP 2: Defining queries for calculating column metrics */ -- First determine which datatypes are suitable for operators count, min, max and avg -- Add select statement per column in temp table -- Naming convention for the metrics: -- [ColumnId]_CNT (number of values) -- _CNTDIST (distinct values), _MIN, _MAX, _Nulls (number of null values), _AVG -- available datatypes SELECT name INTO #datatypes FROM sys.types ALTER TABLE #datatypes ADD O_COUNT tinyint , O_MIN tinyint , O_NUM tinyint , O_IsNUM tinyint -- COUNT function UPDATE #datatypes SET O_COUNT=1 WHERE name IN ( 'bigint','binary','bit','char','date','datetime','datetime2' , 'datetimeoffset','decimal','float','hierarchyid','int' , 'money','nchar','numeric','nvarchar','real','smalldatetime' , 'smallint','smallmoney','sql_variant','sysname','time' , 'timestamp','tinyint','uniqueidentifier','varbinary' , 'varchar' ) -- MIN/MAX function UPDATE #datatypes SET O_MIN =1 WHERE name IN ( 'int','bigint','binary','bit','char','date','datetime' , 'datetime2','datetimeoffset','decimal','float','hierarchyid' , 'int','money','nchar','numeric','nvarchar','real' , 'smalldatetime','smallint','smallmoney','sql_variant' , 'sysname','time','tinyint','varbinary','varchar' ) -- NUMERIC FUNCTIONS (avg, stdev, etc.) UPDATE #datatypes SET O_NUM =1 WHERE name IN ( 'int','smallint','bigint','tinyint','float','decimal','numeric' , 'money','smallmoney','real' ) -- ISNUMERIC / ISINT function UPDATE #datatypes SET O_ISNUM =1 WHERE name IN ( 'bigint','binary','bit','char','datetime','decimal','float','int' , 'money','nchar','numeric','nvarchar','real','smalldatetime' , 'smallint','smallmoney','sysname','tinyint','uniqueidentifier' , 'varbinary','varchar' ) UPDATE #ObjectDef SET query= CASE WHEN datatype IN (SELECT name FROM #datatypes WHERE o_count=1) THEN 'COUNT(['+columnname+']) as ['+CAST(columnid AS varchar)+'_CNT]' + ',COUNT(distinct ['+columnname+']) as ['+CAST(columnid AS varchar)+'_CNTDIST]' ELSE 'NULL as ['+CAST(columnid AS varchar)+'_CNT], NULL as ['+CAST(columnid AS varchar)+'_CNTDIST]' END -- Min and Max statement not posssible on datatype "Bit" -- Therefore, convert this datatype to a varchar + CASE WHEN datatype IN (SELECT name FROM #datatypes WHERE o_min=1) THEN ', MIN('+ CASE WHEN datatype = 'bit' THEN 'cast(['+columnname+'] as int)' WHEN datatype IN ('datetime','datetime2','smalldatetime','date','datetimeoffset','hierarchyid','sql_variant') THEN 'convert(varchar,['+columnname+'],120)' -- convert date to format yyyy-mm-dd hh:mm:ss ELSE '['+columnname+']' END+')' ELSE ', NULL ' -- MIN function not available for datatype END +' as ['+CAST(columnid AS varchar)+'_MIN]' + CASE WHEN datatype IN (SELECT name FROM #datatypes WHERE o_min=1) THEN ', MAX('+ CASE WHEN datatype = 'bit' THEN 'cast(['+columnname+'] as int)' WHEN datatype IN ('datetime','datetime2','smalldatetime','date','datetimeoffset','hierarchyid','sql_variant') THEN 'convert(varchar,['+columnname+'],120)' ELSE '['+columnname+']' END+')' ELSE ', NULL ' -- MAX function not available for datatype END +' as ['+CAST(columnid AS varchar)+'_MAX]' +',SUM(CASE WHEN ['+columnname+'] is null then 1 else 0 end) as ['+CAST(columnid AS varchar)+'_Nulls]' + CASE WHEN datatype IN (SELECT name FROM #datatypes WHERE o_num=1) -- AVG for ID's not particularly usefull, therefore filter them out: AND IsPrimaryKey=0 AND IsIdentity=0 THEN ', AVG(['+columnname+']/1.0)' -- AVG function not available for datatype ELSE ', NULL' END + ' as ['+CAST(columnid AS varchar)+'_AVG]' + CASE WHEN datatype IN (SELECT name FROM #datatypes WHERE o_num=1) -- Standard Deviation for ID's not particularly usefull, therefore filter them out: AND IsPrimaryKey=0 AND IsIdentity=0 THEN ', STDEV(['+columnname+']/1.0)' -- STDDEV function not available for datatype ELSE ', NULL' END + ' as ['+CAST(columnid AS varchar)+'_DEV]' + CASE WHEN datatype IN (SELECT name FROM #datatypes WHERE o_isnum=1) THEN ', SUM(ISNUMERIC(['+columnname+']))' -- SUM function not available for datatype ELSE ', NULL' END + ' as ['+CAST(columnid AS varchar)+'_ISNUM]' + CASE WHEN datatype IN (SELECT name FROM #datatypes WHERE o_isnum=1) THEN ', SUM(CASE WHEN ['+columnname+'] like ''%[^0-9]%'' OR ['+columnname+'] IS NULL then 0 else 1 end)' -- Not available for datatype ELSE ', NULL' END + ' as ['+CAST(columnid AS varchar)+'_ISINT]' -- Concatenating the individual select statement per column to one select statement for entire object DECLARE @Query varchar(max) DECLARE @Sql varchar(max) DECLARE @Sql2 varchar(max) DECLARE @Sql3 varchar(max) -- Start Cursor c_Query DECLARE c_Query CURSOR FAST_FORWARD FOR SELECT Query FROM #ObjectDef WHERE isnull(SelectedColumn,1)<>0 OPEN c_Query FETCH NEXT FROM c_Query INTO @Query WHILE @@FETCH_STATUS = 0 BEGIN SET @sql =cast(@Query AS varchar(max)) -- Adding current statement to previous statement. Last run results in one select statement. SET @Sql2=cast(@sql AS varchar (max))+','+ISNULL(@Sql2,'') FETCH NEXT FROM c_Query INTO @Query END CLOSE c_Query DEALLOCATE c_Query -- End Cursor c_Query -- Adding SELECT and FROM Statement, resulting in a query SET @Sql3='select count(*) as records, '+LEFT(ltrim(rtrim(@Sql2)),LEN(ltrim(rtrim(@Sql2)))-1) +'into ##ObjectSingleRow from '+@FullObjectName -- Execute the final query -- This results in a one-row table with a column for every combination of metric/column -- USING Exec statement sets up a new connection, therefore results are added to a -- Global temporary table (##) EXEC(@sql3) /* STEP 3: Output */ -- Add columns and column specifications to temp table INSERT INTO #ObjectAudit ( ObjectName , Columnname , ColumnId , DataType , MaxLength , [Precision] , Scale , IsNullable , IsPrimaryKey , IsIdentity ) SELECT ObjectName , ColumnName , ColumnId , DataType , MaxLength , [Precision] , Scale , IsNullable , IsPrimaryKey , IsIdentity FROM #ObjectDef WHERE isnull(SelectedColumn,1)<>0 DECLARE @COLUMN varchar(5) DECLARE c_update CURSOR FAST_FORWARD FOR SELECT ColumnId FROM #ObjectAudit -- Start cursor c_update OPEN c_update FETCH NEXT FROM c_update INTO @COLUMN WHILE @@FETCH_STATUS = 0 BEGIN -- Define Update Query for updating metric columns SET @Sql = 'UPDATE #ObjectAudit SET Records=(select records from ##ObjectSingleRow) , Cnt=(select ['+@COLUMN+'_CNT] from ##ObjectSingleRow) , IsNum=(select ['+@COLUMN +'_ISNUM] from ##ObjectSingleRow) , IsInt=(select ['+@COLUMN +'_ISINT] from ##ObjectSingleRow) , CntDist=(select ['+@COLUMN +'_CNTDist] from ##ObjectSingleRow) , Min=(select ['+@COLUMN +'_MIN] from ##ObjectSingleRow) , Max=(select ['+@COLUMN +'_MAX] from ##ObjectSingleRow) , NullValues=(select ['+@COLUMN+'_Nulls] from ##ObjectSingleRow) , Avg=(select ['+@COLUMN +'_AVG] from ##ObjectSingleRow) , StDev=(select ['+@COLUMN +'_DEV] from ##ObjectSingleRow) WHERE columnid= '+@COLUMN+'' -- Executing above update statement EXEC(@Sql) FETCH NEXT FROM c_update INTO @COLUMN END CLOSE c_update DEALLOCATE c_update -- END CURSOR c_update -- DISTRIBUTION -- Only calculate distribution variables if procedure is executed with parameter 'distribution' -- Do not calculate for key/ID variables IF EXISTS (SELECT top 1 ColumnName FROM #ObjectAudit WHERE Datatype IN ( SELECT name FROM #datatypes WHERE o_num=1 ) AND IsPrimaryKey=0 AND IsIdentity=0 -- filter out variables which are probably Key / ID variables: AND NOT( IsInt=CNT AND CAST([Max] AS bigint)-CAST([Min] AS bigint)+1=[Cnt] AND CNTDist=Cnt ) ) AND LOWER(@Distribution)='yes' BEGIN ALTER TABLE #ObjectAudit ADD Q1 decimal(38,5), Median decimal(38,5), Q3 decimal(38,5), IQR decimal(38,5), Skew numeric(38,3), Kurt numeric(38,3) DECLARE @ColumnName AS varchar(128) DECLARE c_ntiles cursor fast_forward FOR SELECT ColumnName FROM #ObjectAudit WHERE datatype IN ( SELECT Name FROM #datatypes WHERE o_num=1 ) AND Cnt>=4 -- to avoid divide by 0 error when calculating kurtosis AND IsPrimaryKey=0 AND IsIdentity=0 -- filter out variables which are probably Key / ID variables: AND NOT( IsInt=CNT AND CAST([Max] AS bigint)-CAST([Min] AS bigint)+1=[Cnt] AND CNTDist=Cnt ) OPEN c_ntiles FETCH NEXT FROM c_ntiles INTO @ColumnName WHILE @@FETCH_STATUS = 0 BEGIN SET @SQL = ' DECLARE @CNT as decimal(38,5) DECLARE @STDEV as decimal(38,5) DECLARE @AVG as decimal(38,5) DECLARE @CORR_SKEW as numeric(10,9) DECLARE @CORR_KURT as numeric(10,9) DECLARE @SUBFACT as numeric(10,9) DECLARE @Q1 as int DECLARE @Q2a as decimal(38,5) DECLARE @Q2b as decimal(38,5) DECLARE @Q3 as int SET @CNT=(SELECT Cnt FROM #ObjectAudit WHERE ColumnName='''+@ColumnName+''') SET @CORR_SKEW=(@CNT)/(@CNT-1)/(@CNT-2) SET @CORR_KURT=(@CNT)*(@CNT+1)/(@CNT-1)/(@CNT-2)/(@CNT-3) SET @SUBFACT=3*SQUARE((@CNT-1))/(@CNT-2)/(@CNT-3) SET @STDEV=(SELECT stDEV FROM #ObjectAudit WHERE ColumnName='''+@ColumnName+''') SET @AVG=(SELECT Avg FROM #ObjectAudit WHERE ColumnName='''+@ColumnName+''') SET @Q1 = round(0.25*(@cnt+1),0) SET @Q2a = 0.50*(@cnt+@cnt%2) SET @Q2b = 0.50*(@cnt+@cnt%2)+(@cnt+1)%2 SET @Q3 = round(0.75*(@cnt+1),0) SELECT ['+@ColumnName+'], row_number() OVER (partition by 1 ORDER BY ['+@ColumnName+']) as rownum , POWER((['+@ColumnName+']*1.0-@AVG)/@STDEV,3) as Skew , SQUARE(SQUARE(((['+@ColumnName+']-@AVG)/@STDEV))) as Kurt INTO #quartiles_rows FROM '+@FullObjectName+' where ['+@ColumnName+']>0 SELECT MAX(CASE rownum WHEN @Q1 THEN ['+@ColumnName+'] ELSE null END) as Q1 , MAX(CASE rownum when @Q2a THEN ['+@ColumnName+'] ELSE null END) as Q2a , MAX(CASE rownum when @Q2b THEN ['+@ColumnName+'] ELSE null END) as Q2b , MAX(CASE rownum when @Q3 THEN ['+@ColumnName+'] ELSE null END) as Q3 , SUM(Skew)*@CORR_SKEW as Skew , SUM(Kurt)*@CORR_KURT-@SUBFACT as Kurt INTO ##quartiles FROM #quartiles_rows ' EXEC (@SQL) UPDATE #ObjectAudit SET Q1=##Quartiles.Q1 , Median=(##Quartiles.Q2a+##Quartiles.Q2b)/2 , Q3=##Quartiles.Q3 , Skew=##Quartiles.Skew , Kurt=##Quartiles.Kurt , IQR=##Quartiles.Q3-##Quartiles.Q1 FROM ##Quartiles WHERE ColumnName=@ColumnName DROP TABLE ##Quartiles FETCH NEXT FROM c_ntiles INTO @columnName END CLOSE c_ntiles DEALLOCATE c_ntiles END -- Output to screen SELECT * , CAST(CAST(NullValues AS decimal(38,2))/records*100 AS decimal(38,2)) AS NullPerc , CASE WHEN CNT>0 THEN CAST(CAST(CNTDist AS decimal(38,2))/CNT*100 AS decimal(38,2)) ELSE NULL END AS DistinctPerc FROM #ObjectAudit -- Clean up! DROP TABLE #ObjectDef , #ObjectAudit , ##ObjectSingleRow /* End Of Script */ GO
You need to login to post a comment.
