Monday, September 26, 2011

Finding the Character Set For a SQL Field

The other day I was working on a project and I needed to know all of the different types of characters contained in a SQL field so that I could make a regular expression pattern that would not fail. Thus, I decided to write a stored procedure that would loop through a range of characters and test if that character is contained in a specified field in any records. From there I can get a list of all of the used characters and then more intelligently write my regular expression pattern.

This could also be handy if you are trying to cast a field and you keep getting errors because of a character that won't cast correctly. An example would be trying to cast a varchar or char field into an int and for some reason or another one of the records has a letter in the field. Your cast will fail and it can be tricky to track down where the issue is. With this stored procedure you can quickly see what the offending character is and then query up all the records containing that character.

Example usage:
exec FieldCharacterSet 'dbo', 'mytable, 'myfield', 1, 0, 255

Feel free to use this if you find is useful, and leave a comment letting me know if it worked well for you.

-- =============================================
-- Author:		Davin Studer
-- Create date: 9/26/2010
-- Description:	This stored procedure will let you
-- know all the characters that are contained within
-- a field of a specified table.  This is handy for things
-- such as creating regular expression patterns and figuring
-- out casting issues.
-- =============================================
create procedure [dbo].[FieldCharacterSet]
	@TableSchema nvarchar(255) = '',-- this is the schema for table to look at
	@TableName nvarchar(255) = '',	-- this is the table to look at
	@FieldName nvarchar(255) = '',	-- this is the field that will be scanned
	@RTrim bit = 1,					-- should the field be right trimmed before finding the character set ... mostly for char field types
	@LowCharRange int = 0,			-- character scan start point
	@HighCharRange int = 255		-- character scan end point
as
begin
	-- SET NOCOUNT ON added to prevent extra result sets from
	-- interfering with SELECT statements.
	set nocount on;

	declare
		@count bigint = 0,
		@sql nvarchar(max) = '',
		@selects nvarchar(max) = '',
		@error varchar(200) = '',
		@field nvarchar(264) = '',
		@sqlbuild nvarchar(max) = ''
	declare @chars table(
		[character] nchar(1)
	)
	declare @statements table(
		[sql] varchar(max)
	)

	if @TableName = ''
	begin
		set @error = 'You must supply a table name.'
		raiserror (@error, 15, 1)
	end

	if @FieldName = ''
	begin
		set @error = 'You must supply a field name.'
		raiserror (@error, 15, 1)
	end
	
	if @error = ''
	begin
		-- Bracket the table name to avoid naming issues
		if left(@TableName, 1) <> '[' and right(@TableName, 1) <> ']'
		begin
			set @TableName = '[' + @TableName + ']'
		end
		
		-- Bracket the field name to avoid naming issues
		if left(@FieldName, 1) <> '[' and right(@FieldName, 1) <> ']'
		begin
			set @FieldName = '[' + @FieldName + ']'
		end
		
		-- Should we rtrim the field to deal with trailing spaces ... mostly for char fields
		if @RTrim = 1
		begin
			set @field = 'rtrim(' + @FieldName + ')'
		end
		
		-- Create a temp table to hold our examples of each character matched
		if object_id('tempdb..##fieldCharacterSetTemp') is not null
		begin
			drop table ##fieldCharacterSetTemp
		end
		set @sql = 'select * into ##fieldCharacterSetTemp from (select t1.* from ' + @TableSchema + '.' + @TableName + ' t1 inner join ' + @TableSchema + '.' + @TableName + ' t2 on 1 = 1 where 1 = 0) X'
		exec(@sql)

		--Loop through the ascii characters low to high
		while @LowCharRange <= @HighCharRange
		begin
			--Build the from and where clauses
			set @sqlbuild = ''
			set @sqlbuild = @sqlbuild + 'from '
			if @TableSchema <> ''
			begin
				set @sqlbuild = @sqlbuild + @TableSchema + '.'
			end
			set @sqlbuild = @sqlbuild + @TableName + nchar(13)
			set @sqlbuild = @sqlbuild + 'where ' + @field
			-- If the character is A-Z or a-z then case sensitive compare
			if (@LowCharRange between 65 and 90) or (@LowCharRange between 97 and 122)
			begin
				set @sqlbuild = @sqlbuild + ' COLLATE Latin1_General_CS_AS'
			end
			set @sqlbuild = @sqlbuild + ' like '
			-- Deal with a few special SQL characters
			if @LowCharRange = 37 -- % character
			begin
				set @sqlbuild = @sqlbuild + '''%\%%'' escape ''\''' + nchar(13)
			end
			else if @LowCharRange = 39 -- ' character
			begin
				set @sqlbuild = @sqlbuild + '''%''''%''' + nchar(13)
			end
			else if @LowCharRange = 95 -- _ character
			begin
				set @sqlbuild = @sqlbuild + '''%\_%'' escape ''\''' + nchar(13)
			end
			else
			begin
				set @sqlbuild = @sqlbuild + '''%' + nchar(@LowCharRange) + '%''' + nchar(13)
			end

			-- Buld main counting sql
			set @sql = ''
			set @sql = @sql + 'select @countOut = count(*)' + nchar(13)
			set @sql = @sql + @sqlbuild

			--Get count of rows that have the character
			execute sp_executesql @sql,N'@countOut bigint = 0 output',@countOut=@count output

			--If any rows have this character add it to the result set
			if @count > 0
			begin
				insert into @chars values (nchar(@LowCharRange))

				-- Build the select sql to look at example lines that have this character
				set @selects = ''
				set @selects = @selects + 'select top 1 *' + nchar(13)
				set @selects = @selects + @sqlbuild
				
				insert into @statements values (rtrim(replace(@selects, nchar(13), ' ')))

				set @sql = ''
				set @sql = @sql + 'insert into ##fieldCharacterSetTemp' +nchar(13)
				set @sql = @sql + @selects
				
				exec(@sql)
			end
			
			-- Next character
			set @LowCharRange = @LowCharRange + 1
		end

		select * from @chars
		select * from @statements
		if object_id('tempdb..##fieldCharacterSetTemp') is not null
		begin
			select * from ##fieldCharacterSetTemp
			drop table ##fieldCharacterSetTemp
		end
	end
end