SQL Server – Indexing Strategy – Finding Hash Aggregates

Background

In this post we will do a quick follow-up to our last post, in which we touched on the difference between Stream and Hash Aggregates.

To continue our study, we will dig deeper into the Query Plan and identify queries that use Hash Aggregates.

What is a “Hash Aggregate”?

I like Craig Freedman’s exploration of the Hash Aggregate operator:

Link

Stream aggregate is great for scalar aggregates and for aggregations where we have an index to provide a sort order on the group by column(s).

The other aggregation operator, hash aggregate, is similar to hash join.  It does not require (or preserve) sort order, requires memory, and is blocking (i.e., it does not produce any results until it has consumed its entire input).  Hash aggregate excels at efficiently aggregating very large data sets.
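
As a concrete illustration of the quote above ( a hypothetical sketch; the table [dbo].[ordersDemo] is not one of the lab objects below ): for a GROUP BY query like this, an index whose leading key is the grouping column lets the optimizer read the rows in group order and use a Stream Aggregate, while a large, unordered input typically gets a Hash Match ( Aggregate ) instead.


-- Hypothetical table, for illustration only
-- With an index leading on [storeNumber], expect a Stream Aggregate
-- Without one, on a large table, expect a Hash Match ( Aggregate )
select
	  [storeNumber]
	, max([orderAmount]) as [maxOrderAmount]

from   [dbo].[ordersDemo]

group by
	[storeNumber]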

 

Objects

 

Category            Object Name                                       Use
Table               accumulate.order_DSS                              Small data set
Table               accumulate.order_DSL                              Large data set
Stored Procedure    accumulate.usp_FindHighestOrderByStore_DSS_NI     Aggregate data in small data set, do not use existing helpful index
Stored Procedure    accumulate.usp_FindHighestOrderByStore_DSS_WI     Aggregate data in small data set, use existing helpful index
Stored Procedure    accumulate.usp_FindHighestOrderByStore_DSL_NI     Aggregate data in large data set, do not use existing helpful index
Stored Procedure    accumulate.usp_FindHighestOrderByStore_DSL_WI     Aggregate data in large data set, use existing helpful index

 

 

Lab

DDL

accumulate.order_DSS



if schema_id('accumulate') is null
begin
	exec('create schema [accumulate] authorization [dbo]')
end	
go


if object_id('[accumulate].[order_DSS]') is null
begin

	create table [accumulate].[order_DSS]
	(
		  [orderNumber]  bigint not null
			identity(1,1) 

		, [storeNumber] int not null
		
		, [orderDate] datetime not null

		, [orderAmount] money not null

	)
end
go

if not exists
	(
		select *
		from   sys.key_constraints tblSKC
		where  tblSKC.[type] = 'PK'
		and    tblSKC.parent_object_id = object_id('[accumulate].[order_DSS]')
	)
begin
	
	alter table [accumulate].[order_DSS]
		add constraint [PK_Aggregate_Order_DSS]
			primary key
			(
				[orderNumber]
			)
end
go

if not exists
	(
		select *
		from   sys.indexes tblSI
		where  tblSI.object_id = object_id('[accumulate].[order_DSS]')
		and    tblSI.[name] = 'INDX_StoreNumber'
	)
begin

	create index [INDX_StoreNumber]
	on [accumulate].[order_DSS]
	(
		[storeNumber]
	)
	include
	(
		  [orderNumber] 
		, [orderDate]
		, [orderAmount]
	)

end
go


accumulate.order_DSL




if schema_id('accumulate') is null
begin

	exec('create schema [accumulate] authorization [dbo]')
	
end	
go


if object_id('[accumulate].[order_DSL]') is null
begin

	create table [accumulate].[order_DSL]
	(
		  [orderNumber]  bigint not null
			identity(1,1) 

		, [storeNumber] int not null
		
		, [orderDate] datetime not null

		, [orderAmount] money not null

	)
end
go

if not exists
	(
		select *
		from   sys.key_constraints tblSKC
		where  tblSKC.[type] = 'PK'
		and    tblSKC.parent_object_id = object_id('[accumulate].[order_DSL]')
	)
begin
	
	alter table [accumulate].[order_DSL]
		add constraint [PK_Aggregate_Order_DSL]
			primary key
			(
				[orderNumber]
			)
end
go

if not exists
	(
		select *
		from   sys.indexes tblSI
		where  tblSI.object_id = object_id('[accumulate].[order_DSL]')
		and    tblSI.[name] = 'INDX_StoreNumber'
	)
begin

	create index [INDX_StoreNumber]
	on [accumulate].[order_DSL]
	(
		[storeNumber]
	)
	include
	(
		  [orderNumber] 
		, [orderDate]
		, [orderAmount]
	)

end
go



Stored Procedures

accumulate.usp_FindHighestOrderByStore_DSS_NI


if object_id('[accumulate].[usp_FindHighestOrderByStore_DSS_NI]') is null
begin

   exec('create procedure [accumulate].[usp_FindHighestOrderByStore_DSS_NI] as select 1/0 as [shell] ')

end
go

alter procedure [accumulate].[usp_FindHighestOrderByStore_DSS_NI]
as
begin

	set nocount on;

	select 
			  tblAO.[storeNumber]
			, tblAO.[orderDate]
			, tblAO.[orderAmount]


	from   [accumulate].[order_DSS] tblAO with ( INDEX = PK_Aggregate_Order_DSS) 

	where [orderAmount]
				=
					( 
				
						select max(tblAO_Inner.[orderAmount])

						from   [accumulate].[order_DSS] tblAO_Inner
									 with ( INDEX = PK_Aggregate_Order_DSS) 

						where  tblAO.[storeNumber]
								= tblAO_Inner.[storeNumber]
							
					)

	order by
			tblAO.[storeNumber] asc


end
go

accumulate.usp_FindHighestOrderByStore_DSS_WI


if object_id('[accumulate].[usp_FindHighestOrderByStore_DSS_WI]') is null
begin

   exec('create procedure [accumulate].[usp_FindHighestOrderByStore_DSS_WI] as select 1/0 as [shell] ')

end
go

alter procedure [accumulate].[usp_FindHighestOrderByStore_DSS_WI]
as
begin

	set nocount on;

	select 
			  tblAO.[storeNumber]
			, tblAO.[orderDate]
			, tblAO.[orderAmount]


	from   [accumulate].[order_DSS] tblAO 

	where [orderAmount]
				=
					( 
				
						select max(tblAO_Inner.[orderAmount])

						from   [accumulate].[order_DSS] tblAO_Inner

						where  tblAO.[storeNumber]
								= tblAO_Inner.[storeNumber]
							
					)

	order by
			tblAO.[storeNumber] asc


end
go



accumulate.usp_FindHighestOrderByStore_DSL_NI


if object_id('[accumulate].[usp_FindHighestOrderByStore_DSL_NI]') is null
begin

   exec('create procedure [accumulate].[usp_FindHighestOrderByStore_DSL_NI] as select 1/0 as [shell] ')

end
go

alter procedure [accumulate].[usp_FindHighestOrderByStore_DSL_NI]
as
begin

	set nocount on;

	select 
			  tblAO.[storeNumber]
			, tblAO.[orderDate]
			, tblAO.[orderAmount]


	from   [accumulate].[order_DSL] tblAO with ( INDEX = PK_Aggregate_Order_DSL) 

	where [orderAmount]
				=
					( 
				
						select max(tblAO_Inner.[orderAmount])

						from   [accumulate].[order_DSL] tblAO_Inner
									 with ( INDEX = PK_Aggregate_Order_DSL) 

						where  tblAO.[storeNumber]
								= tblAO_Inner.[storeNumber]
							
					)

	order by
			tblAO.[storeNumber] asc


end
go

accumulate.usp_FindHighestOrderByStore_DSL_WI


if object_id('[accumulate].[usp_FindHighestOrderByStore_DSL_WI]') is null
begin

   exec('create procedure [accumulate].[usp_FindHighestOrderByStore_DSL_WI] as select 1/0 as [shell] ')

end
go

alter procedure [accumulate].[usp_FindHighestOrderByStore_DSL_WI]
as
begin

	set nocount on;

	select 
			  tblAO.[storeNumber]
			, tblAO.[orderDate]
			, tblAO.[orderAmount]


	from   [accumulate].[order_DSL] tblAO --with ( INDEX = PK_Aggregate_Order_DSL) 

	where [orderAmount]
				=
					( 
				
						select max(tblAO_Inner.[orderAmount])

						from   [accumulate].[order_DSL] tblAO_Inner
									 --with ( INDEX = PK_Aggregate_Order_DSL) 

						where  tblAO.[storeNumber]
								= tblAO_Inner.[storeNumber]
							
					)

	order by
			tblAO.[storeNumber] asc


end
go
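
With the tables and stored procedures in place, a quick catalog query ( a sketch, using the standard sys.objects catalog view ) confirms that the objects listed in the table above exist in the accumulate schema.


-- List the lab objects created in the [accumulate] schema
select
	  tblSO.[type_desc]
	, tblSO.[name]
	, tblSO.[create_date]

from   sys.objects tblSO

where  tblSO.[schema_id] = schema_id('accumulate')

and    tblSO.[is_ms_shipped] = 0

order by
	  tblSO.[type_desc]
	, tblSO.[name]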



DML (Add data)

Task

  1. Add one thousand records to accumulate.order_DSS
  2. Add one hundred thousand records to accumulate.order_DSL

SQL




set nocount on;
go

truncate table [accumulate].[order_DSS]
truncate table [accumulate].[order_DSL]
go

insert into [accumulate].[order_DSS]
(

	  [storeNumber]
	
	, [orderDate] 

	, [orderAmount]

)
select 
	  cast(rand() * 100 as int)
	, dateadd
		(
			day,
			  rand() * 10000
			, getdate()
		)
	, rand() * 1E6


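-- "go 1000" is a batch separator with a count ( an SSMS / sqlcmd feature ):
-- it re-runs the insert batch above 1,000 times, adding one row per run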
go 1000

insert into [accumulate].[order_DSL]
(

	  [storeNumber]
	
	, [orderDate] 

	, [orderAmount]

)
select 
	  cast(rand() * 100 as int)
	, dateadd
		(
			day,
			  rand() * 10000
			, getdate()
		)
	, rand() * 1E6


go 100000


 

Output

AddData-20160613-1129AM
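
As a quick sanity check on the load ( a sketch; not part of the original script ), compare the row counts of the two tables.


-- Expect roughly one thousand rows in order_DSS and one hundred thousand in order_DSL
select 'accumulate.order_DSS' as [tableName], count(*) as [numberOfRows]
from   [accumulate].[order_DSS]

union all

select 'accumulate.order_DSL', count(*)
from   [accumulate].[order_DSL]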

 

Profile

Task

Let us execute the four stored procedures and compare their query plans.

SQL


set statistics io on;
set statistics time on;		-- also capture the CPU / elapsed time figures reported below
set nocount on;


DECLARE @intDBID INT;

SET @intDBID = db_id()

-- Flush the procedure cache for one database only
DBCC FLUSHPROCINDB (@intDBID)
	with no_infomsgs;

--Small Data Set / No Index
print 'Small Data Set / No Index'
exec [accumulate].[usp_FindHighestOrderByStore_DSS_NI]
print ''
print replicate('*', 120)


--Small Data Set / With Index
print 'Small Data Set / With Index'
exec [accumulate].[usp_FindHighestOrderByStore_DSS_WI]
print ''
print replicate('*', 120)

--Large Data Set / No Index
print 'Large Data Set / No Index'
exec [accumulate].[usp_FindHighestOrderByStore_DSL_NI]
print ''
print replicate('*', 120)

--Large Data Set / With Index
print 'Large Data Set / With Index'
exec [accumulate].[usp_FindHighestOrderByStore_DSL_WI]
print ''
print replicate('*', 120)

 

Output

Query Plan

Small Data Set / No Suitable Index

Query-DSS-NI

 

Small Data Set / With Suitable Index

Query-DSS-WI

 

Large Data Set / No Suitable Index

Query-DSL-NI

Hash Match – Aggregate

Query-DSL-NI-HashMatchAggregate

Hash Match – Inner Join

Query-DSL-NI-HashMatchInnerJoin

 

Large Data Set / With Suitable Index

Query-DSL-WI

 

Explanation
  1. Small Data Set / No Index
    • Clustered Index Scan
      • Scans the entire table
    • Sort
      • Sorts the rows on the grouping column
    • Segment
      • Groups by the key column(s) ( storeNumber, in this case )
    • Top
      • Selects the first few rows of each group
        • N = 1 ( the highest order amount per store, in our case )
    • Sort
  2. Small Data Set / Suitable Index
    • Index Scan
      • Scans the index ordered on the grouping column
    • Stream Aggregate
      • Computes the summary value for each group of rows in a suitably sorted stream
      • Non-blocking operation
    • Index Scan
      • Fetches the other desired columns
    • Merge Join
  3. Large Data Set / No Suitable Index
    • Clustered Index Scan
      • Targeted against the subquery's table reference
      • Reads all records in the base table
    • Hash Match ( Aggregate )
      • Builds a hash table keyed on the grouping column ( storeNumber ) and computes the MAX per hash bucket
        • Physical Operation = Hash Match
        • Logical Operation = Aggregate
        • Actual Number of Rows = 100
        • Estimated Number of Rows = 100
        • Estimated Operator Cost = 0.470546
        • Memory Fractions
          • Memory Fractions Input = 1
          • Memory Fractions Output = 0.0909091
        • Parallel = False
    • Clustered Index Scan
      • Against the main ( outer ) table reference
    • Hash Match ( Inner Join )
      • Uses each row from the top input to build a hash table, and each row from the bottom input to probe into the hash table
        • Physical Operation = Hash Match
        • Logical Operation = Inner Join
        • Actual Number of Rows = 100
        • Estimated Number of Rows = 100
        • Estimated Operator Cost = 0.673591
        • Memory Fractions
          • Memory Fractions Input = 0.909091
          • Memory Fractions Output = 0.909091
        • Parallel = False
    • Sort
  4. Large Data Set / Suitable Index
    • Index Scan ( against the helpful index – its leading key is the group-by column, [storeNumber] )
    • Stream Aggregate
    • Index Scan
    • Merge Join
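
The operators called out above can be reproduced for any one of the procedures by requesting the actual execution plan as XML ( a sketch; SSMS's "Include Actual Execution Plan" button works just as well ).


-- Returns the actual execution plan as an XML column alongside the result set
set statistics xml on;
go

exec [accumulate].[usp_FindHighestOrderByStore_DSL_NI];
go

set statistics xml off;
go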

 

Statistics I/O

 

Query                                  I/O
Small Data Set / No Suitable Index     Table ‘order_DSS’. Scan count 1, logical reads 7, physical reads 0, read-ahead reads 0
Small Data Set / Suitable Index        Table ‘order_DSS’. Scan count 2, logical reads 20, physical reads 0, read-ahead reads 0
Large Data Set / No Suitable Index     Table ‘Workfile’. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0
                                       Table ‘Worktable’. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0
                                       Table ‘order_DSL’. Scan count 2, logical reads 922, physical reads 0, read-ahead reads 0
Large Data Set / Suitable Index        Table ‘order_DSL’. Scan count 2, logical reads 1118, physical reads 0, read-ahead reads 0

 

 

Explanation
  1. Small Data Set / No Index
    • Scan Count = 1, Logical Reads = 7
  2. Small Data Set / Suitable Index
    • Scan Count = 2, Logical Reads = 20
  3. Large Data Set / No Suitable Index
    • Workfile
      • Scan Count = 0, Logical Reads = 0
    • Worktable
      • Scan Count = 0, Logical Reads = 0
    • Main table ( order_DSL )
      • Scan Count = 2, Logical Reads = 922
  4. Large Data Set / Suitable Index
    • Scan Count = 2, Logical Reads = 1118

Statistics Time

 

Query                                  Time
Small Data Set / No Suitable Index     CPU time = 0 ms, elapsed time = 453 ms
Small Data Set / Suitable Index        CPU time = 15 ms, elapsed time = 339 ms
Large Data Set / No Suitable Index     CPU time = 125 ms, elapsed time = 455 ms
Large Data Set / Suitable Index        CPU time = 156 ms, elapsed time = 237 ms

 

 

Explanation
  1. Large Data Set
    • Lag between CPU time and elapsed time
      • Increases when a suitable index does not exist
        • More I/O
        • Sorting
        • Worktable ( tempdb ) management
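
Another convenient way to compare CPU time, elapsed time, and reads across the four procedures after a run is the procedure-level DMV sys.dm_exec_procedure_stats ( a sketch; the time columns are reported in microseconds ).


-- Accumulated worker ( CPU ) time, elapsed time, and logical reads per cached procedure
select
	  object_name(tblDEPS.[object_id])	as [procedure]
	, tblDEPS.[execution_count]
	, tblDEPS.[total_worker_time]		as [totalCPUMicroseconds]
	, tblDEPS.[total_elapsed_time]		as [totalElapsedMicroseconds]
	, tblDEPS.[total_logical_reads]

from   sys.dm_exec_procedure_stats tblDEPS

where  tblDEPS.[database_id] = db_id()

and    object_name(tblDEPS.[object_id]) like 'usp_FindHighestOrderByStore%'

order by
	tblDEPS.[total_elapsed_time] desc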

Instrumentation

There are a few ways to determine whether “Hash Match ( Aggregate )” operators are occurring on a system.

One of the more precise ways is to review your cached query plans.

Cached Query Plans

SQL Plan – XML

Query-DSL-NI-QueryPlan-Aggregate-XML

Explanation

Here are the relevant Operators:

  1. RelOp
    • LogicalOp=”Aggregate”
    • PhysicalOp=”Hash Match”
  2. Hash
    • Aggregate
      • Type=MAX
  3. HashKeysBuild
    • Column Reference
      • Column=StoreNumber
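
For orientation, here is a heavily trimmed, hand-edited fragment of what that portion of the showplan XML looks like ( attribute values are illustrative, taken from the numbers above; the real plan carries many more attributes and child elements ).


<RelOp PhysicalOp="Hash Match" LogicalOp="Aggregate"
       EstimateRows="100" EstimatedTotalSubtreeCost="0.470546" Parallel="0">
  <Hash>
    <HashKeysBuild>
      <ColumnReference Database="[...]" Schema="[accumulate]" Table="[order_DSL]" Column="storeNumber" />
    </HashKeysBuild>
    <!-- DefinedValues and the child RelOp ( the Clustered Index Scan ) trimmed -->
  </Hash>
</RelOp>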

Code



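-- Search the plan cache, limited to the current database, for cached plans that contain
-- a Hash Match ( Aggregate ) operator; for each match, return the plan, the statement text,
-- the operator's estimates, and the hash build key column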
declare @databaseID int
 
set @databaseID = db_id()
 
 
;WITH
XMLNAMESPACES (DEFAULT N'http://schemas.microsoft.com/sqlserver/2004/07/showplan')
SELECT

  
      cp.plan_handle
 
    , qp.query_plan

	, [database]
		= db_name(qt.[dbid])

	, [sql]
		= qt.[text]

    , [node]
        = qpnHA.query('.')

    , [localName]
        = cast(qpnHA.query('local-name(.)') as varchar)

    , [HashMatchAggregateExist]
        = qpnHA.exist('//RelOp[@PhysicalOp="Hash Match"][@LogicalOp="Aggregate"]')

	, [EstimatedTotalSubtreeCost]
		= qpnHA.value('.[1]/@EstimatedTotalSubtreeCost', 'float')

	, [EstimateRows]
		= qpnHA.value('.[1]/@EstimateRows', 'bigint')

	, [EstimateIO]
		= qpnHA.value('.[1]/@EstimateIO', 'float')

	, [EstimateCPU]
		= qpnHA.value('.[1]/@EstimateCPU', 'float')

	, [Parallel]
		= qpnHA.value('.[1]/@Parallel', 'smallint')

	, [hashKeyDatabase]
		= qpnHA.value
			(
				N'Hash[1]/HashKeysBuild[1]/ColumnReference[1]/@Database'
				, N'varchar(600)'
			)

	, [hashKeySchema]
		= qpnHA.value
			(
				N'Hash[1]/HashKeysBuild[1]/ColumnReference[1]/@Schema'
				, N'varchar(600)'
			)

	, [hashKeyTable]
		= qpnHA.value
			(
				N'Hash[1]/HashKeysBuild[1]/ColumnReference[1]/@Table'
				, N'varchar(600)'
			)

	, [hashKeyColumn]
		= qpnHA.value
			(
				N'Hash[1]/HashKeysBuild[1]/ColumnReference[1]/@Column'
				, N'varchar(600)'
			)

FROM sys.dm_exec_cached_plans cp
      
cross apply sys.dm_exec_query_plan(cp.plan_handle) qp

cross apply qp.query_plan.nodes
		('//RelOp[@PhysicalOp="Hash Match"][@LogicalOp="Aggregate"]')
			as qpnHA(qpnHA)

cross APPLY sys.dm_exec_plan_attributes(cp.plan_handle) 
				as tblDEPA_DB
  
cross apply sys.dm_exec_sql_text(cp.plan_handle) qT

where qp.query_plan is not null

and   tblDEPA_DB.[attribute] = 'dbid'

and  qp.query_plan.exist
			('//RelOp[@PhysicalOp="Hash Match"][@LogicalOp="Aggregate"]') 
					= 1

and
(
	   ([qt].[text] is null)
	or (
			qt.[text] not like
			(
				'%sys%'
			)

		)
)

--narrow search to current database
and  tblDEPA_DB.[value] = @databaseID




Dedication

There is a lot to be grateful for here.

Jason Strate, for listing the videos on his page.  Some view hoarding information as a competitive advantage.

Kimberly Tripp, for gaining the knowledge and presenting it so well.

And, lastly, Microsoft, for being an excellent steward and for working so hard to provide insight into the innermost workings of the engine.

 

Summary

When a good index exists, the aggregation uses it and performs a non-blocking Stream Aggregate.

In the event that a suitable index does not exist, a Clustered Index Scan is launched against the entire table.

If the data set is relatively small, the grouping is performed within a Segment operator.

On the other hand, if the data set is not as manageable, a Hash Match ( Aggregate ) is performed.

 

References

  1. Craig Freedman’s SQL Server Blog
    A discussion of query processing, query execution, and query plans in SQL Server.
    Hash Aggregate
    Link
