Diffstat (limited to 'tools/perf/util/arm-spe.c')
-rw-r--r--	tools/perf/util/arm-spe.c	263
1 file changed, 184 insertions(+), 79 deletions(-)
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 8942fa598a84..71be979f5077 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -39,6 +39,18 @@
 
 #define is_ldst_op(op)		(!!((op) & ARM_SPE_OP_LDST))
 
+#define ARM_SPE_CACHE_EVENT(lvl) \
+	(ARM_SPE_##lvl##_ACCESS | ARM_SPE_##lvl##_MISS)
+
+#define arm_spe_is_cache_level(type, lvl) \
+	((type) & ARM_SPE_CACHE_EVENT(lvl))
+
+#define arm_spe_is_cache_hit(type, lvl) \
+	(((type) & ARM_SPE_CACHE_EVENT(lvl)) == ARM_SPE_##lvl##_ACCESS)
+
+#define arm_spe_is_cache_miss(type, lvl) \
+	((type) & ARM_SPE_##lvl##_MISS)
+
 struct arm_spe {
 	struct auxtrace			auxtrace;
 	struct auxtrace_queues		queues;
@@ -62,7 +74,6 @@ struct arm_spe {
 	u8				sample_remote_access;
 	u8				sample_memory;
 	u8				sample_instructions;
-	u64				instructions_sample_period;
 
 	u64				l1d_miss_id;
 	u64				l1d_access_id;
@@ -101,7 +112,7 @@ struct arm_spe_queue {
 	u64				time;
 	u64				timestamp;
 	struct thread			*thread;
-	u64				period_instructions;
+	u64				sample_count;
 	u32				flags;
 	struct branch_stack		*last_branch;
 };
@@ -228,7 +239,6 @@ static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
 	speq->pid = -1;
 	speq->tid = -1;
 	speq->cpu = -1;
-	speq->period_instructions = 0;
 
 	/* params set */
 	params.get_trace = arm_spe_get_trace;
@@ -305,15 +315,28 @@ static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
 	return 0;
 }
 
-static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, u64 cpu)
+static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, int cpu)
 {
 	u64 i;
 
 	if (!spe->metadata)
 		return NULL;
 
+	/* CPU ID is -1 for per-thread mode */
+	if (cpu < 0) {
+		/*
+		 * On the heterogeneous system, due to CPU ID is -1,
+		 * cannot confirm the data source packet is supported.
+		 */
+		if (!spe->is_homogeneous)
+			return NULL;
+
+		/* In homogeneous system, simply use CPU0's metadata */
+		return spe->metadata[0];
+	}
+
 	for (i = 0; i < spe->metadata_nr_cpu; i++)
-		if (spe->metadata[i][ARM_SPE_CPU] == cpu)
+		if (spe->metadata[i][ARM_SPE_CPU] == (u64)cpu)
 			return spe->metadata[i];
 
 	return NULL;
@@ -352,7 +375,7 @@ static void arm_spe_prep_sample(struct arm_spe *spe,
 	sample->cpumode = arm_spe_cpumode(spe, sample->ip);
 	sample->pid = speq->pid;
 	sample->tid = speq->tid;
-	sample->period = 1;
+	sample->period = spe->synth_opts.period;
 	sample->cpu = speq->cpu;
 	sample->simd_flags = arm_spe__synth_simd_flags(record);
 
@@ -471,7 +494,8 @@ arm_spe_deliver_synth_event(struct arm_spe *spe,
 }
 
 static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
-				     u64 spe_events_id, u64 data_src)
+				     u64 spe_events_id,
+				     union perf_mem_data_src data_src)
 {
 	struct arm_spe *spe = speq->spe;
 	struct arm_spe_record *record = &speq->decoder->record;
@@ -486,7 +510,7 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
 	sample.stream_id = spe_events_id;
 	sample.addr = record->virt_addr;
 	sample.phys_addr = record->phys_addr;
-	sample.data_src = data_src;
+	sample.data_src = data_src.val;
 	sample.weight = record->latency;
 
 	ret = arm_spe_deliver_synth_event(spe, speq, event, &sample);
@@ -519,7 +543,8 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
 }
 
 static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
-					     u64 spe_events_id, u64 data_src)
+					     u64 spe_events_id,
+					     union perf_mem_data_src data_src)
 {
 	struct arm_spe *spe = speq->spe;
 	struct arm_spe_record *record = &speq->decoder->record;
@@ -527,14 +552,6 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
 	struct perf_sample sample;
 	int ret;
 
-	/*
-	 * Handles perf instruction sampling period.
-	 */
-	speq->period_instructions++;
-	if (speq->period_instructions < spe->instructions_sample_period)
-		return 0;
-	speq->period_instructions = 0;
-
 	perf_sample__init(&sample, /*all=*/true);
 	arm_spe_prep_sample(spe, speq, event, &sample);
 
@@ -542,8 +559,7 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
 	sample.stream_id = spe_events_id;
 	sample.addr = record->to_ip;
 	sample.phys_addr = record->phys_addr;
-	sample.data_src = data_src;
-	sample.period = spe->instructions_sample_period;
+	sample.data_src = data_src.val;
 	sample.weight = record->latency;
 	sample.flags = speq->flags;
 	sample.branch_stack = speq->last_branch;
@@ -670,8 +686,8 @@ static void arm_spe__synth_data_source_common(const struct arm_spe_record *recor
 	 * socket
 	 */
 	case ARM_SPE_COMMON_DS_REMOTE:
-		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1;
-		data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
+		data_src->mem_lvl = PERF_MEM_LVL_NA;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
 		data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
 		data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
 		break;
@@ -819,30 +835,121 @@ static const struct data_source_handle data_source_handles[] = {
 	DS(hisi_hip_ds_encoding_cpus, data_source_hisi_hip),
 };
 
-static void arm_spe__synth_memory_level(const struct arm_spe_record *record,
-					union perf_mem_data_src *data_src)
+static void arm_spe__synth_ld_memory_level(const struct arm_spe_record *record,
+					   union perf_mem_data_src *data_src)
 {
-	if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
-		data_src->mem_lvl = PERF_MEM_LVL_L3;
+	/*
+	 * To find a cache hit, search in ascending order from the lower level
+	 * caches to the higher level caches. This reflects the best scenario
+	 * for a cache hit.
+	 */
+	if (arm_spe_is_cache_hit(record->type, L1D)) {
+		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
+	} else if (record->type & ARM_SPE_RECENTLY_FETCHED) {
+		data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_LFB;
+	} else if (arm_spe_is_cache_hit(record->type, L2D)) {
+		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
+	} else if (arm_spe_is_cache_hit(record->type, LLC)) {
+		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
+	/*
+	 * To find a cache miss, search in descending order from the higher
+	 * level cache to the lower level cache. This represents the worst
+	 * scenario for a cache miss.
+	 */
+	} else if (arm_spe_is_cache_miss(record->type, LLC)) {
+		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_MISS;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
+	} else if (arm_spe_is_cache_miss(record->type, L2D)) {
+		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_MISS;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
+	} else if (arm_spe_is_cache_miss(record->type, L1D)) {
+		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
+	}
+}
 
-		if (record->type & ARM_SPE_LLC_MISS)
-			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
-		else
-			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
-	} else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
+static void arm_spe__synth_st_memory_level(const struct arm_spe_record *record,
+					   union perf_mem_data_src *data_src)
+{
+	/* Record the greatest level info for a store operation. */
+	if (arm_spe_is_cache_level(record->type, LLC)) {
+		data_src->mem_lvl = PERF_MEM_LVL_L3;
+		data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, LLC) ?
+				     PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
+	} else if (arm_spe_is_cache_level(record->type, L2D)) {
+		data_src->mem_lvl = PERF_MEM_LVL_L2;
+		data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L2D) ?
+				     PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
+	} else if (arm_spe_is_cache_level(record->type, L1D)) {
 		data_src->mem_lvl = PERF_MEM_LVL_L1;
+		data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L1D) ?
+				     PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
+	}
+}
 
-		if (record->type & ARM_SPE_L1D_MISS)
-			data_src->mem_lvl |= PERF_MEM_LVL_MISS;
-		else
-			data_src->mem_lvl |= PERF_MEM_LVL_HIT;
+static void arm_spe__synth_memory_level(struct arm_spe_queue *speq,
+					const struct arm_spe_record *record,
+					union perf_mem_data_src *data_src)
+{
+	struct arm_spe *spe = speq->spe;
+
+	/*
+	 * The data source packet contains more info for cache levels for
+	 * peer snooping. So respect the memory level if has been set by
+	 * data source parsing.
+	 */
+	if (!data_src->mem_lvl) {
+		if (data_src->mem_op == PERF_MEM_OP_LOAD)
+			arm_spe__synth_ld_memory_level(record, data_src);
+		if (data_src->mem_op == PERF_MEM_OP_STORE)
+			arm_spe__synth_st_memory_level(record, data_src);
+	}
+
+	if (!data_src->mem_lvl) {
+		data_src->mem_lvl = PERF_MEM_LVL_NA;
+		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
+	}
+
+	/*
+	 * If 'mem_snoop' has been set by data source packet, skip to set
+	 * it at here.
+	 */
+	if (!data_src->mem_snoop) {
+		if (record->type & ARM_SPE_DATA_SNOOPED) {
+			if (record->type & ARM_SPE_HITM)
+				data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+			else
+				data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
+		} else {
+			u64 *metadata =
+				arm_spe__get_metadata_by_cpu(spe, speq->cpu);
+
+			/*
+			 * Set NA ("Not available") mode if no meta data or the
+			 * SNOOPED event is not supported.
+			 */
+			if (!metadata ||
+			    !(metadata[ARM_SPE_CAP_EVENT_FILTER] & ARM_SPE_DATA_SNOOPED))
+				data_src->mem_snoop = PERF_MEM_SNOOP_NA;
+			else
+				data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
+		}
 	}
 
-	if (record->type & ARM_SPE_REMOTE_ACCESS)
-		data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
+	if (!data_src->mem_remote) {
+		if (record->type & ARM_SPE_REMOTE_ACCESS)
+			data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
+	}
 }
 
-static bool arm_spe__synth_ds(struct arm_spe_queue *speq,
+static void arm_spe__synth_ds(struct arm_spe_queue *speq,
 			      const struct arm_spe_record *record,
 			      union perf_mem_data_src *data_src)
 {
@@ -859,56 +966,41 @@ static bool arm_spe__synth_ds(struct arm_spe_queue *speq,
 		cpuid = perf_env__cpuid(perf_session__env(spe->session));
 		midr = strtol(cpuid, NULL, 16);
 	} else {
-		/* CPU ID is -1 for per-thread mode */
-		if (speq->cpu < 0) {
-			/*
-			 * On the heterogeneous system, due to CPU ID is -1,
-			 * cannot confirm the data source packet is supported.
-			 */
-			if (!spe->is_homogeneous)
-				return false;
-
-			/* In homogeneous system, simply use CPU0's metadata */
-			if (spe->metadata)
-				metadata = spe->metadata[0];
-		} else {
-			metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu);
-		}
-
+		metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu);
 		if (!metadata)
-			return false;
+			return;
 
 		midr = metadata[ARM_SPE_CPU_MIDR];
 	}
 
 	for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) {
 		if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) {
-			data_source_handles[i].ds_synth(record, data_src);
-			return true;
+			return data_source_handles[i].ds_synth(record, data_src);
 		}
 	}
 
-	return false;
+	return;
 }
 
-static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq,
-				      const struct arm_spe_record *record)
+static union perf_mem_data_src
+arm_spe__synth_data_source(struct arm_spe_queue *speq,
+			   const struct arm_spe_record *record)
 {
-	union perf_mem_data_src	data_src = { .mem_op = PERF_MEM_OP_NA };
+	union perf_mem_data_src	data_src = {};
 
 	/* Only synthesize data source for LDST operations */
 	if (!is_ldst_op(record->op))
-		return 0;
+		return data_src;
 
 	if (record->op & ARM_SPE_OP_LD)
 		data_src.mem_op = PERF_MEM_OP_LOAD;
 	else if (record->op & ARM_SPE_OP_ST)
 		data_src.mem_op = PERF_MEM_OP_STORE;
 	else
-		return 0;
+		return data_src;
 
-	if (!arm_spe__synth_ds(speq, record, &data_src))
-		arm_spe__synth_memory_level(record, &data_src);
+	arm_spe__synth_ds(speq, record, &data_src);
+	arm_spe__synth_memory_level(speq, record, &data_src);
 
 	if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
 		data_src.mem_dtlb = PERF_MEM_TLB_WK;
@@ -919,16 +1011,24 @@ static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq,
 			data_src.mem_dtlb |= PERF_MEM_TLB_HIT;
 	}
 
-	return data_src.val;
+	return data_src;
 }
 
 static int arm_spe_sample(struct arm_spe_queue *speq)
 {
 	const struct arm_spe_record *record = &speq->decoder->record;
 	struct arm_spe *spe = speq->spe;
-	u64 data_src;
+	union perf_mem_data_src data_src;
 	int err;
 
+	/*
+	 * Discard all samples until period is reached
+	 */
+	speq->sample_count++;
+	if (speq->sample_count < spe->synth_opts.period)
+		return 0;
+	speq->sample_count = 0;
+
 	arm_spe__sample_flags(speq);
 	data_src = arm_spe__synth_data_source(speq, record);
 
@@ -1532,6 +1632,7 @@ static const char * const metadata_per_cpu_fmts[] = {
 	[ARM_SPE_CPU_MIDR]		= "    MIDR             :0x%"PRIx64"\n",
 	[ARM_SPE_CPU_PMU_TYPE]		= "    PMU Type         :%"PRId64"\n",
 	[ARM_SPE_CAP_MIN_IVAL]		= "    Min Interval     :%"PRId64"\n",
+	[ARM_SPE_CAP_EVENT_FILTER]	= "    Event Filter     :0x%"PRIx64"\n",
 };
 
 static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr)
@@ -1628,6 +1729,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
 	attr.exclude_guest = evsel->core.attr.exclude_guest;
 	attr.sample_id_all = evsel->core.attr.sample_id_all;
 	attr.read_format = evsel->core.attr.read_format;
+	attr.sample_period = spe->synth_opts.period;
 
 	/* create new id val to be a fixed offset from evsel id */
 	id = evsel->core.id[0] + 1000000000;
@@ -1744,25 +1846,15 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
 	}
 
 	if (spe->synth_opts.instructions) {
-		if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) {
-			pr_warning("Only instruction-based sampling period is currently supported by Arm SPE.\n");
-			goto synth_instructions_out;
-		}
-		if (spe->synth_opts.period > 1)
-			pr_warning("Arm SPE has a hardware-based sample period.\n"
-				   "Additional instruction events will be discarded by --itrace\n");
-
 		spe->sample_instructions = true;
 		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
-		attr.sample_period = spe->synth_opts.period;
-		spe->instructions_sample_period = attr.sample_period;
+
 		err = perf_session__deliver_synth_attr_event(session, &attr, id);
 		if (err)
 			return err;
 		spe->instructions_id = id;
 		arm_spe_set_event_name(evlist, id, "instructions");
 	}
-synth_instructions_out:
 
 	return 0;
 }
@@ -1871,10 +1963,23 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
 	if (dump_trace)
 		return 0;
 
-	if (session->itrace_synth_opts && session->itrace_synth_opts->set)
+	if (session->itrace_synth_opts && session->itrace_synth_opts->set) {
 		spe->synth_opts = *session->itrace_synth_opts;
-	else
+	} else {
 		itrace_synth_opts__set_default(&spe->synth_opts, false);
+		/* Default nanoseconds period not supported */
+		spe->synth_opts.period_type = PERF_ITRACE_PERIOD_INSTRUCTIONS;
+		spe->synth_opts.period = 1;
+	}
+
+	if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) {
+		ui__error("You must only use i (instructions) --itrace period with Arm SPE. e.g --itrace=i1i\n");
+		err = -EINVAL;
+		goto err_free_queues;
+	}
+	if (spe->synth_opts.period > 1)
+		ui__warning("Arm SPE has a hardware-based sampling period.\n\n"
			    "--itrace periods > 1i downsample by an interval of n SPE samples rather than n instructions.\n");
 
 	err = arm_spe_synth_events(spe, session);
 	if (err)
