/*-------------------------------------------------------------------------
 *
 * instrument.c
 *	 functions for instrumentation of plan execution
 *
 *
 * Copyright (c) 2001-2006, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.15 2006/05/30 14:01:58 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>
#include <math.h>

#include "executor/instrument.h"

/*
 * This function determines the sampling intervals.  In general, if the
 * function is f(x), then for N tuples we will take on the order of
 * integral(1/f(x), x = 0..N) samples.  Some examples follow, with the
 * number of samples that would be collected over 1,000,000 tuples:
 *
 *   f(x) = x         =>   log2(N)                 20
 *   f(x) = x^(1/2)   =>   2 * N^(1/2)           2000
 *   f(x) = x^(1/3)   =>   1.5 * N^(2/3)        15000
 *
 * I've chosen the last one as it seems a good compromise between low
 * overhead and still getting a meaningful number of samples.  However,
 * not all machines have the cbrt() function, so on those we substitute
 * sqrt().  The difference was not significant in the tests I made.
 */
#ifdef HAVE_CBRT
#define SampleFunc cbrt
#else
#define SampleFunc sqrt
#endif
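
/*
 * Illustrative sketch only (the INSTRUMENT_SAMPLING_DEMO guard and this
 * helper are hypothetical and not used elsewhere in this file): the
 * expected number of samples taken over ntuples tuples in the cbrt case,
 * i.e. integral(1/x^(1/3), x = 0..N) = 1.5 * N^(2/3).  For N = 1,000,000
 * this yields about 15,000, matching the table above.
 */
#ifdef INSTRUMENT_SAMPLING_DEMO
static double
ExpectedSampleCount(double ntuples)
{
	return 1.5 * pow(ntuples, 2.0 / 3.0);
}
#endif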

#define SAMPLE_THRESHOLD 50

static double SampleOverhead;
static bool SampleOverheadCalculated;

static void
CalculateSampleOverhead(void)
{
	Instrumentation instr;
	int i;
	
	/*
	 * Determine the sampling overhead, so that later calculations can
	 * correct for it.  This only needs to be done once per backend.  A
	 * wrong value here (due to a mistimed task switch) would skew those
	 * calculations, so to minimize the risk we measure a few times and
	 * take the lowest result.
	 */
	
	SampleOverhead = 1.0e6;
	
	for (i = 0; i < 5; i++)
	{
		int			j;
		double		overhead;

		memset(&instr, 0, sizeof(instr));

		/* Sample SAMPLE_THRESHOLD times, or for 100 microseconds, whichever comes first */
		for (j = 0; j < SAMPLE_THRESHOLD && INSTR_TIME_GET_DOUBLE(instr.counter) < 100e-6; j++)
		{
			InstrStartNode(&instr);
			InstrStopNode(&instr, 1);
		}
		overhead = INSTR_TIME_GET_DOUBLE(instr.counter) / instr.samplecount;
		if (overhead < SampleOverhead)
			SampleOverhead = overhead;
	}
		
	SampleOverheadCalculated = true;
}

/* Allocate new instrumentation structure(s) */
Instrumentation *
InstrAlloc(int n)
{
	Instrumentation *instr = palloc0(n * sizeof(Instrumentation));

	/* we don't need to do any initialization except zero 'em */
	
	/* Calculate the sampling overhead, if not done yet in this backend */
	if (!SampleOverheadCalculated)
		CalculateSampleOverhead();
	return instr;
}
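
/*
 * Illustrative sketch only (a hypothetical helper, not called by the
 * executor): the intended per-cycle call sequence around a plan node.
 */
#ifdef INSTRUMENT_SAMPLING_DEMO
static void
InstrumentationUsageExample(Instrumentation *instr)
{
	/* once per tuple: */
	InstrStartNode(instr);		/* before asking the node for a tuple */
	/* ... the node produces one tuple here ... */
	InstrStopNode(instr, 1);	/* count it; accumulate time if sampled */

	/* once per loop, when the node is done or about to be rescanned: */
	InstrEndLoop(instr);
}
#endif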

/* Entry to a plan node */
void
InstrStartNode(Instrumentation *instr)
{
	if (INSTR_TIME_IS_ZERO(instr->starttime))
	{
		/*
		 * We always sample the first SAMPLE_THRESHOLD tuples, so small
		 * nodes are always measured exactly.
		 */
		if (instr->tuplecount < SAMPLE_THRESHOLD)
			instr->sampling = true;
		else
		{
			/*
			 * Beyond that we sample only intermittently; see the
			 * comments on SampleFunc at the top of this file.
			 */
			if (instr->tuplecount > instr->nextsample)
			{
				instr->sampling = true;

				/*
				 * rand()/RAND_MAX is uniform on [0, 1] with mean 0.5,
				 * so the factor of 2.0 makes the expected gap between
				 * samples equal to SampleFunc(tuplecount).
				 */
				instr->nextsample += 2.0 * SampleFunc(instr->tuplecount) *
					(double) rand() / (double) RAND_MAX;
			}
		}
		if (instr->sampling)
			INSTR_TIME_SET_CURRENT(instr->starttime);
	}
	else
		elog(DEBUG2, "InstrStartNode called twice in a row");
}

/* Exit from a plan node */
void
InstrStopNode(Instrumentation *instr, double nTuples)
{
	instr_time	endtime;

	/* count the returned tuples */
	instr->tuplecount += nTuples;

	if (instr->sampling)
	{
		if (INSTR_TIME_IS_ZERO(instr->starttime))
		{
			elog(DEBUG2, "InstrStopNode called without start");
			return;
		}

		INSTR_TIME_SET_CURRENT(endtime);

#ifndef WIN32
		instr->counter.tv_sec += endtime.tv_sec - instr->starttime.tv_sec;
		instr->counter.tv_usec += endtime.tv_usec - instr->starttime.tv_usec;

		/* Normalize after each add to avoid overflow/underflow of tv_usec */
		while (instr->counter.tv_usec < 0)
		{
			instr->counter.tv_usec += 1000000;
			instr->counter.tv_sec--;
		}
		while (instr->counter.tv_usec >= 1000000)
		{
			instr->counter.tv_usec -= 1000000;
			instr->counter.tv_sec++;
		}
#else							/* WIN32 */
		instr->counter.QuadPart += (endtime.QuadPart - instr->starttime.QuadPart);
#endif

		INSTR_TIME_SET_ZERO(instr->starttime);
		instr->samplecount += nTuples;
		instr->sampling = false;
	}

	/* Is this the first tuple of this cycle? */
	if (!instr->running)
	{
		instr->running = true;
		instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter);
	}
}

/* Finish a run cycle for a plan node */
void
InstrEndLoop(Instrumentation *instr)
{
	double		totaltime;

	/* Skip if nothing has happened, or already shut down */
	if (!instr->running)
		return;

	if (!INSTR_TIME_IS_ZERO(instr->starttime))
		elog(DEBUG2, "InstrEndLoop called on running node");

	/* Accumulate per-cycle statistics into totals */
	totaltime = INSTR_TIME_GET_DOUBLE(instr->counter);

	instr->startup += instr->firsttuple;

	/*
	 * Correct for sampling effects here.  Extrapolating naively would
	 * charge the sampling overhead to every tuple, including the ones
	 * we never measured, so instead we subtract the calibrated overhead
	 * from the per-tuple cost when extrapolating over the unsampled
	 * tuples.  The first tuple is special-cased because it usually
	 * takes longer than the rest.
	 */

	if (instr->samplecount < instr->tuplecount)
	{
		double		pertuple = (totaltime - instr->firsttuple) /
			(instr->samplecount - 1);

		instr->total += instr->firsttuple +
			pertuple * (instr->samplecount - 1) +
			(pertuple - SampleOverhead) *
			(instr->tuplecount - instr->samplecount);
	}
	else
		instr->total += totaltime;

	instr->ntuples += instr->tuplecount;
	instr->nsamples += instr->samplecount;
	instr->nloops += 1;

	/* Reset for next cycle (if any) */
	instr->running = false;
	INSTR_TIME_SET_ZERO(instr->starttime);
	INSTR_TIME_SET_ZERO(instr->counter);
	instr->firsttuple = 0;
	instr->samplecount = 0;
	instr->tuplecount = 0;
}
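
/*
 * Illustrative sketch only (a hypothetical helper, never called): the
 * extrapolation performed in InstrEndLoop, written out as a standalone
 * expression.  Given the measured time for the sampled tuples, estimate
 * the time all tuples would have taken, net of sampling overhead.
 */
#ifdef INSTRUMENT_SAMPLING_DEMO
static double
EstimateTotalTime(double totaltime, double firsttuple,
				  double samplecount, double tuplecount)
{
	/* average cost of a sampled tuple, excluding the (slow) first one */
	double		pertuple = (totaltime - firsttuple) / (samplecount - 1);

	/* measured part, plus overhead-corrected extrapolation */
	return firsttuple +
		pertuple * (samplecount - 1) +
		(pertuple - SampleOverhead) * (tuplecount - samplecount);
}
#endif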