SQL Server Monitoring: DMVs, Query Store, and Performance Baselines
Introduction
Effective SQL Server monitoring prevents performance issues and enables rapid troubleshooting. This guide covers Dynamic Management Views (DMVs), Query Store for query performance tracking, Extended Events, Performance Monitor, SQL Server Agent alerts, establishing performance baselines, and building proactive monitoring solutions.
Dynamic Management Views (DMVs)
Current Query Execution
-- View currently executing queries
SELECT
r.session_id,
r.start_time,
r.status,
r.command,
r.cpu_time,
r.total_elapsed_time / 1000 AS elapsed_time_sec,
r.reads,
r.writes,
r.logical_reads,
DB_NAME(r.database_id) AS database_name,
s.login_name,
s.host_name,
s.program_name,
SUBSTRING(
qt.text,
(r.statement_start_offset/2)+1,
((CASE r.statement_end_offset
WHEN -1 THEN DATALENGTH(qt.text)
ELSE r.statement_end_offset
END - r.statement_start_offset)/2) + 1
) AS query_text,
qp.query_plan
FROM sys.dm_exec_requests r
INNER JOIN sys.dm_exec_sessions s ON r.session_id = s.session_id
CROSS APPLY sys.dm_exec_sql_text(r.sql_handle) qt
CROSS APPLY sys.dm_exec_query_plan(r.plan_handle) qp
WHERE r.session_id <> @@SPID -- Exclude current session
AND s.is_user_process = 1
ORDER BY r.total_elapsed_time DESC;
Query Performance History
-- Top 20 queries by CPU
SELECT TOP 20
qs.execution_count,
qs.total_worker_time / 1000 AS total_cpu_time_ms,
qs.total_worker_time / qs.execution_count / 1000 AS avg_cpu_time_ms,
qs.total_elapsed_time / 1000 AS total_duration_ms,
qs.total_elapsed_time / qs.execution_count / 1000 AS avg_duration_ms,
qs.total_logical_reads,
qs.total_logical_reads / qs.execution_count AS avg_logical_reads,
qs.total_logical_writes,
qs.creation_time,
qs.last_execution_time,
SUBSTRING(
qt.text,
(qs.statement_start_offset/2)+1,
((CASE qs.statement_end_offset
WHEN -1 THEN DATALENGTH(qt.text)
ELSE qs.statement_end_offset
END - qs.statement_start_offset)/2) + 1
) AS query_text,
DB_NAME(qt.dbid) AS database_name,
qp.query_plan
FROM sys.dm_exec_query_stats qs
CROSS APPLY sys.dm_exec_sql_text(qs.sql_handle) qt
CROSS APPLY sys.dm_exec_query_plan(qs.plan_handle) qp
ORDER BY qs.total_worker_time DESC;
-- Top 20 queries by I/O
SELECT TOP 20
qs.execution_count,
qs.total_logical_reads + qs.total_logical_writes AS total_io,
(qs.total_logical_reads + qs.total_logical_writes) / qs.execution_count AS avg_io,
qs.total_logical_reads,
qs.total_logical_reads / qs.execution_count AS avg_reads,
qs.total_logical_writes,
qs.total_logical_writes / qs.execution_count AS avg_writes,
qs.total_elapsed_time / 1000 AS total_duration_ms,
SUBSTRING(
qt.text,
(qs.statement_start_offset/2)+1,
((CASE qs.statement_end_offset
WHEN -1 THEN DATALENGTH(qt.text)
ELSE qs.statement_end_offset
END - qs.statement_start_offset)/2) + 1
) AS query_text
FROM sys.dm_exec_query_stats qs
CROSS APPLY sys.dm_exec_sql_text(qs.sql_handle) qt
ORDER BY total_io DESC;
Wait Statistics
-- Wait statistics (what SQL Server is waiting for)
WITH Waits AS (
SELECT
wait_type,
wait_time_ms / 1000.0 AS wait_time_sec,
(wait_time_ms - signal_wait_time_ms) / 1000.0 AS resource_wait_sec,
signal_wait_time_ms / 1000.0 AS signal_wait_sec,
waiting_tasks_count,
100.0 * wait_time_ms / SUM(wait_time_ms) OVER() AS pct
FROM sys.dm_os_wait_stats
WHERE wait_type NOT IN ( -- Filter out benign waits
'CLR_SEMAPHORE', 'LAZYWRITER_SLEEP', 'RESOURCE_QUEUE',
'SLEEP_TASK', 'SLEEP_SYSTEMTASK', 'SQLTRACE_BUFFER_FLUSH',
'WAITFOR', 'LOGMGR_QUEUE', 'CHECKPOINT_QUEUE',
'REQUEST_FOR_DEADLOCK_SEARCH', 'XE_TIMER_EVENT', 'BROKER_TO_FLUSH',
'BROKER_TASK_STOP', 'CLR_MANUAL_EVENT', 'CLR_AUTO_EVENT',
'DISPATCHER_QUEUE_SEMAPHORE', 'FT_IFTS_SCHEDULER_IDLE_WAIT',
'XE_DISPATCHER_WAIT', 'XE_DISPATCHER_JOIN', 'SQLTRACE_INCREMENTAL_FLUSH_SLEEP'
)
AND wait_time_ms > 0
)
SELECT TOP 20
wait_type,
CAST(wait_time_sec AS DECIMAL(12,2)) AS wait_time_sec,
CAST(resource_wait_sec AS DECIMAL(12,2)) AS resource_wait_sec,
CAST(signal_wait_sec AS DECIMAL(12,2)) AS signal_wait_sec,
waiting_tasks_count,
CAST(pct AS DECIMAL(5,2)) AS pct,
CAST(SUM(pct) OVER(ORDER BY wait_time_sec DESC) AS DECIMAL(5,2)) AS running_total_pct
FROM Waits
ORDER BY wait_time_sec DESC;
-- Common wait types explained:
-- CXPACKET: Parallelism (adjust MAXDOP)
-- PAGEIOLATCH_*: Disk I/O (slow storage, indexing issues)
-- LCK_*: Locking/blocking (optimize queries, reduce transaction times)
-- WRITELOG: Transaction log writes (slow log disk, too many transactions)
-- ASYNC_NETWORK_IO: Client not consuming results fast enough
-- SOS_SCHEDULER_YIELD: CPU pressure (high CPU usage)
Index Usage Statistics
-- Index usage (seeks, scans, lookups, updates)
SELECT
OBJECT_SCHEMA_NAME(s.object_id) AS schema_name,
OBJECT_NAME(s.object_id) AS table_name,
i.name AS index_name,
i.type_desc,
s.user_seeks,
s.user_scans,
s.user_lookups,
s.user_seeks + s.user_scans + s.user_lookups AS total_reads,
s.user_updates,
s.last_user_seek,
s.last_user_scan,
s.last_user_lookup,
s.last_user_update,
CASE
WHEN s.user_updates > 0
THEN CAST((s.user_seeks + s.user_scans + s.user_lookups) AS DECIMAL(18,2)) / s.user_updates
ELSE NULL
END AS read_write_ratio
FROM sys.dm_db_index_usage_stats s
INNER JOIN sys.indexes i ON s.object_id = i.object_id AND s.index_id = i.index_id
WHERE OBJECTPROPERTY(s.object_id, 'IsUserTable') = 1
AND s.database_id = DB_ID()
ORDER BY s.user_seeks + s.user_scans + s.user_lookups DESC;
-- Unused indexes (candidates for removal)
SELECT
OBJECT_SCHEMA_NAME(i.object_id) AS schema_name,
OBJECT_NAME(i.object_id) AS table_name,
i.name AS index_name,
i.type_desc,
ps.used_page_count * 8 / 1024 AS index_size_mb,
s.user_updates,
'DROP INDEX ' + i.name + ' ON ' + OBJECT_SCHEMA_NAME(i.object_id) + '.' + OBJECT_NAME(i.object_id) + ';' AS drop_statement
FROM sys.indexes i
LEFT JOIN sys.dm_db_index_usage_stats s
ON i.object_id = s.object_id
AND i.index_id = s.index_id
AND s.database_id = DB_ID()
INNER JOIN sys.dm_db_partition_stats ps ON i.object_id = ps.object_id AND i.index_id = ps.index_id
WHERE OBJECTPROPERTY(i.object_id, 'IsUserTable') = 1
AND i.type_desc <> 'CLUSTERED'
AND i.is_primary_key = 0
AND i.is_unique_constraint = 0
AND (s.user_seeks IS NULL OR s.user_seeks = 0)
AND (s.user_scans IS NULL OR s.user_scans = 0)
AND (s.user_lookups IS NULL OR s.user_lookups = 0)
AND s.user_updates > 0
ORDER BY s.user_updates DESC, index_size_mb DESC;
I/O Statistics
-- Database I/O statistics
SELECT
DB_NAME(vfs.database_id) AS database_name,
mf.name AS logical_name,
mf.physical_name,
mf.type_desc AS file_type,
CAST(vfs.size_on_disk_bytes / 1048576.0 AS DECIMAL(10,2)) AS size_mb,
vfs.num_of_reads,
vfs.num_of_bytes_read / 1048576 AS mb_read,
vfs.io_stall_read_ms,
CASE
WHEN vfs.num_of_reads > 0
THEN vfs.io_stall_read_ms / vfs.num_of_reads
ELSE 0
END AS avg_read_latency_ms,
vfs.num_of_writes,
vfs.num_of_bytes_written / 1048576 AS mb_written,
vfs.io_stall_write_ms,
CASE
WHEN vfs.num_of_writes > 0
THEN vfs.io_stall_write_ms / vfs.num_of_writes
ELSE 0
END AS avg_write_latency_ms
FROM sys.dm_io_virtual_file_stats(NULL, NULL) vfs
INNER JOIN sys.master_files mf
ON vfs.database_id = mf.database_id
AND vfs.file_id = mf.file_id
ORDER BY vfs.io_stall_read_ms + vfs.io_stall_write_ms DESC;
-- Alert if avg latency > 20ms (slow disk)
Blocking and Deadlocks
-- Current blocking chains
WITH BlockingChain AS (
SELECT
w.session_id AS blocked_session_id,
w.blocking_session_id,
s.login_name AS blocked_user,
s.host_name AS blocked_host,
w.wait_type,
w.wait_time / 1000 AS wait_time_sec,
SUBSTRING(
qt.text,
(r.statement_start_offset/2)+1,
((CASE r.statement_end_offset
WHEN -1 THEN DATALENGTH(qt.text)
ELSE r.statement_end_offset
END - r.statement_start_offset)/2) + 1
) AS blocked_query
FROM sys.dm_os_waiting_tasks w
INNER JOIN sys.dm_exec_sessions s ON w.session_id = s.session_id
INNER JOIN sys.dm_exec_requests r ON w.session_id = r.session_id
CROSS APPLY sys.dm_exec_sql_text(r.sql_handle) qt
WHERE w.blocking_session_id <> 0
)
SELECT
bc.blocked_session_id,
bc.blocking_session_id,
bc.blocked_user,
bc.blocked_host,
bc.wait_type,
bc.wait_time_sec,
bc.blocked_query,
blocker.login_name AS blocker_user,
blocker.host_name AS blocker_host,
SUBSTRING(
blocker_qt.text,
(blocker_r.statement_start_offset/2)+1,
((CASE blocker_r.statement_end_offset
WHEN -1 THEN DATALENGTH(blocker_qt.text)
ELSE blocker_r.statement_end_offset
END - blocker_r.statement_start_offset)/2) + 1
) AS blocker_query
FROM BlockingChain bc
INNER JOIN sys.dm_exec_sessions blocker ON bc.blocking_session_id = blocker.session_id
LEFT JOIN sys.dm_exec_requests blocker_r ON bc.blocking_session_id = blocker_r.session_id
OUTER APPLY sys.dm_exec_sql_text(blocker_r.sql_handle) blocker_qt
ORDER BY bc.wait_time_sec DESC;
-- Kill blocking session (use with caution!)
-- KILL 52; -- Replace 52 with blocking_session_id
Query Store
Enabling Query Store
-- Enable Query Store
ALTER DATABASE AdventureWorks
SET QUERY_STORE = ON (
OPERATION_MODE = READ_WRITE,
CLEANUP_POLICY = (STALE_QUERY_THRESHOLD_DAYS = 30),
DATA_FLUSH_INTERVAL_SECONDS = 900, -- 15 minutes
INTERVAL_LENGTH_MINUTES = 60,
MAX_STORAGE_SIZE_MB = 1000,
QUERY_CAPTURE_MODE = AUTO,
SIZE_BASED_CLEANUP_MODE = AUTO
);
-- Check Query Store status
SELECT
actual_state_desc,
readonly_reason,
current_storage_size_mb,
max_storage_size_mb,
query_capture_mode_desc
FROM sys.database_query_store_options;
Query Performance Tracking
-- Top 20 queries by duration
SELECT TOP 20
qsq.query_id,
qsqt.query_sql_text,
COUNT(qsp.plan_id) AS plan_count,
SUM(qsrs.count_executions) AS execution_count,
CAST(AVG(qsrs.avg_duration) / 1000 AS DECIMAL(10,2)) AS avg_duration_ms,
CAST(MAX(qsrs.max_duration) / 1000 AS DECIMAL(10,2)) AS max_duration_ms,
CAST(AVG(qsrs.avg_cpu_time) / 1000 AS DECIMAL(10,2)) AS avg_cpu_time_ms,
CAST(AVG(qsrs.avg_logical_io_reads) AS DECIMAL(10,2)) AS avg_logical_reads,
MIN(qsrs.last_execution_time) AS first_execution_time,
MAX(qsrs.last_execution_time) AS last_execution_time
FROM sys.query_store_query qsq
INNER JOIN sys.query_store_query_text qsqt ON qsq.query_text_id = qsqt.query_text_id
INNER JOIN sys.query_store_plan qsp ON qsq.query_id = qsp.query_id
INNER JOIN sys.query_store_runtime_stats qsrs ON qsp.plan_id = qsrs.plan_id
WHERE qsrs.last_execution_time >= DATEADD(DAY, -7, GETUTCDATE())
GROUP BY qsq.query_id, qsqt.query_sql_text
ORDER BY AVG(qsrs.avg_duration) DESC;
Identifying Regressed Queries
-- Queries with performance regression
WITH RecentStats AS (
SELECT
qsp.plan_id,
AVG(qsrs.avg_duration) AS recent_avg_duration,
AVG(qsrs.avg_cpu_time) AS recent_avg_cpu
FROM sys.query_store_plan qsp
INNER JOIN sys.query_store_runtime_stats qsrs ON qsp.plan_id = qsrs.plan_id
WHERE qsrs.last_execution_time >= DATEADD(DAY, -1, GETUTCDATE())
GROUP BY qsp.plan_id
),
HistoricalStats AS (
SELECT
qsp.plan_id,
AVG(qsrs.avg_duration) AS historical_avg_duration,
AVG(qsrs.avg_cpu_time) AS historical_avg_cpu
FROM sys.query_store_plan qsp
INNER JOIN sys.query_store_runtime_stats qsrs ON qsp.plan_id = qsrs.plan_id
WHERE qsrs.last_execution_time < DATEADD(DAY, -1, GETUTCDATE())
AND qsrs.last_execution_time >= DATEADD(DAY, -8, GETUTCDATE())
GROUP BY qsp.plan_id
)
SELECT TOP 20
qsq.query_id,
qsqt.query_sql_text,
CAST(h.historical_avg_duration / 1000 AS DECIMAL(10,2)) AS historical_avg_duration_ms,
CAST(r.recent_avg_duration / 1000 AS DECIMAL(10,2)) AS recent_avg_duration_ms,
CAST((r.recent_avg_duration - h.historical_avg_duration) / h.historical_avg_duration * 100 AS DECIMAL(10,2)) AS pct_regression,
qsp.plan_id,
TRY_CAST(qsp.query_plan AS XML) AS query_plan
FROM RecentStats r
INNER JOIN HistoricalStats h ON r.plan_id = h.plan_id
INNER JOIN sys.query_store_plan qsp ON r.plan_id = qsp.plan_id
INNER JOIN sys.query_store_query qsq ON qsp.query_id = qsq.query_id
INNER JOIN sys.query_store_query_text qsqt ON qsq.query_text_id = qsqt.query_text_id
WHERE r.recent_avg_duration > h.historical_avg_duration * 1.5 -- 50% regression
ORDER BY pct_regression DESC;
Forcing Query Plans
-- View forced plans
SELECT
qsq.query_id,
qsqt.query_sql_text,
qsp.plan_id,
qsp.is_forced_plan,
qsp.force_failure_count,
qsp.last_force_failure_reason_desc
FROM sys.query_store_query qsq
INNER JOIN sys.query_store_query_text qsqt ON qsq.query_text_id = qsqt.query_text_id
INNER JOIN sys.query_store_plan qsp ON qsq.query_id = qsp.query_id
WHERE qsp.is_forced_plan = 1;
-- Force a specific plan (use good plan_id from history)
EXEC sp_query_store_force_plan
@query_id = 123,
@plan_id = 456;
-- Unforce plan
EXEC sp_query_store_unforce_plan
@query_id = 123,
@plan_id = 456;
Extended Events
Creating Extended Events Session
-- Create session to capture slow queries
CREATE EVENT SESSION SlowQueries ON SERVER
ADD EVENT sqlserver.sql_statement_completed (
ACTION (
sqlserver.client_hostname,
sqlserver.database_name,
sqlserver.username,
sqlserver.sql_text
)
WHERE (
duration >= 5000000 -- 5 seconds (microseconds)
AND sqlserver.database_name = 'AdventureWorks'
)
),
ADD EVENT sqlserver.rpc_completed (
ACTION (
sqlserver.client_hostname,
sqlserver.database_name,
sqlserver.username,
sqlserver.sql_text
)
WHERE (
duration >= 5000000 -- 5 seconds
AND sqlserver.database_name = 'AdventureWorks'
)
)
ADD TARGET package0.event_file (
SET filename = 'C:\ExtendedEvents\SlowQueries.xel',
max_file_size = 100, -- MB
max_rollover_files = 10
);
-- Start session
ALTER EVENT SESSION SlowQueries ON SERVER STATE = START;
-- Stop session
ALTER EVENT SESSION SlowQueries ON SERVER STATE = STOP;
Reading Extended Events Data
-- Read event file
SELECT
event_data.value('(event/@name)[1]', 'VARCHAR(50)') AS event_name,
event_data.value('(event/@timestamp)[1]', 'DATETIME2') AS event_timestamp,
event_data.value('(event/data[@name="duration"]/value)[1]', 'BIGINT') / 1000 AS duration_ms,
event_data.value('(event/data[@name="cpu_time"]/value)[1]', 'BIGINT') / 1000 AS cpu_time_ms,
event_data.value('(event/data[@name="logical_reads"]/value)[1]', 'BIGINT') AS logical_reads,
event_data.value('(event/data[@name="physical_reads"]/value)[1]', 'BIGINT') AS physical_reads,
event_data.value('(event/data[@name="writes"]/value)[1]', 'BIGINT') AS writes,
event_data.value('(event/action[@name="sql_text"]/value)[1]', 'NVARCHAR(MAX)') AS sql_text,
event_data.value('(event/action[@name="database_name"]/value)[1]', 'NVARCHAR(128)') AS database_name,
event_data.value('(event/action[@name="username"]/value)[1]', 'NVARCHAR(128)') AS username,
event_data.value('(event/action[@name="client_hostname"]/value)[1]', 'NVARCHAR(128)') AS client_hostname
FROM (
SELECT CAST(event_data AS XML) AS event_data
FROM sys.fn_xe_file_target_read_file('C:\ExtendedEvents\SlowQueries*.xel', NULL, NULL, NULL)
) AS XEventData
ORDER BY event_timestamp DESC;
Deadlock Capture
-- Create deadlock capture session
CREATE EVENT SESSION DeadlockCapture ON SERVER
ADD EVENT sqlserver.xml_deadlock_report
ADD TARGET package0.event_file (
SET filename = 'C:\ExtendedEvents\Deadlocks.xel'
);
ALTER EVENT SESSION DeadlockCapture ON SERVER STATE = START;
-- Read deadlock graphs
SELECT
event_data.value('(event/@timestamp)[1]', 'DATETIME2') AS event_timestamp,
CAST(event_data.query('(event/data[@name="xml_report"]/value/deadlock)[1]') AS XML) AS deadlock_graph
FROM (
SELECT CAST(event_data AS XML) AS event_data
FROM sys.fn_xe_file_target_read_file('C:\ExtendedEvents\Deadlocks*.xel', NULL, NULL, NULL)
) AS XEventData
ORDER BY event_timestamp DESC;
Performance Baselines
Establishing Baseline
# PowerShell script to collect baseline metrics
$server = "localhost"
$database = "AdventureWorks"
$outputPath = "C:\Baselines"
$timestamp = Get-Date -Format "yyyyMMdd_HHmmss"
# Collect baseline data
$queries = @{
"WaitStats" = "SELECT * FROM sys.dm_os_wait_stats WHERE wait_time_ms > 0"
"PerfCounters" = @"
SELECT
object_name,
counter_name,
instance_name,
cntr_value
FROM sys.dm_os_performance_counters
WHERE counter_name IN (
'Page life expectancy',
'Buffer cache hit ratio',
'Batch requests/sec',
'SQL Compilations/sec',
'SQL Re-Compilations/sec'
)
"@
"IndexUsage" = "SELECT * FROM sys.dm_db_index_usage_stats WHERE database_id = DB_ID()"
"QueryStats" = "SELECT TOP 100 * FROM sys.dm_exec_query_stats ORDER BY total_worker_time DESC"
}
foreach ($key in $queries.Keys) {
$query = $queries[$key]
$outputFile = "$outputPath\Baseline_${key}_${timestamp}.csv"
Invoke-Sqlcmd -ServerInstance $server -Database $database -Query $query |
Export-Csv -Path $outputFile -NoTypeInformation
Write-Host "Exported $key baseline to $outputFile"
}
Proactive Monitoring Script
# Daily health check script
$server = "localhost"
$emailTo = "dba@contoso.com"
$emailFrom = "sqlalerts@contoso.com"
$smtpServer = "smtp.contoso.com"
$healthChecks = @()
# Check 1: Long-running queries
$longRunningQueries = Invoke-Sqlcmd -ServerInstance $server -Query @"
SELECT COUNT(*) AS QueryCount
FROM sys.dm_exec_requests
WHERE total_elapsed_time > 300000 -- 5 minutes
AND session_id <> @@SPID
"@
if ($longRunningQueries.QueryCount -gt 0) {
$healthChecks += "⚠️ $($longRunningQueries.QueryCount) long-running queries detected"
}
# Check 2: Blocking
$blocking = Invoke-Sqlcmd -ServerInstance $server -Query @"
SELECT COUNT(DISTINCT blocking_session_id) AS BlockingCount
FROM sys.dm_os_waiting_tasks
WHERE blocking_session_id <> 0
"@
if ($blocking.BlockingCount -gt 0) {
$healthChecks += "⚠️ $($blocking.BlockingCount) blocking sessions detected"
}
# Check 3: Failed jobs
$failedJobs = Invoke-Sqlcmd -ServerInstance $server -Database msdb -Query @"
SELECT COUNT(*) AS FailedJobCount
FROM msdb.dbo.sysjobhistory
WHERE run_status = 0 -- Failed
AND run_date = CONVERT(INT, CONVERT(VARCHAR(8), GETDATE(), 112))
"@
if ($failedJobs.FailedJobCount -gt 0) {
$healthChecks += "❌ $($failedJobs.FailedJobCount) failed jobs today"
}
# Send alert if issues found
if ($healthChecks.Count -gt 0) {
$body = "SQL Server Health Check Alert`n`n" + ($healthChecks -join "`n")
Send-MailMessage -To $emailTo -From $emailFrom -Subject "SQL Server Health Alert" -Body $body -SmtpServer $smtpServer
}
Key Takeaways
- Use DMVs for real-time query and performance monitoring
- Enable Query Store for query performance history
- Implement Extended Events for lightweight tracing
- Establish performance baselines during normal operation
- Monitor wait statistics to identify bottlenecks
- Track index usage to optimize indexing strategy
- Set up proactive monitoring with PowerShell automation
- Configure SQL Server Agent alerts for critical conditions
- Review deadlocks and blocking regularly
- Analyze I/O latency for storage performance
Next Steps
- Enable Query Store on production databases
- Create Extended Events sessions for slow queries
- Establish baseline metrics for comparison
- Automate daily health check reports
- Configure SQL Server Agent alerts
Additional Resources
Monitor. Analyze. Optimize. Prevent.