diff --git a/README.md b/README.md index 66e167dae01e..f079827a5454 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,7 @@ pair of .c and .py files, and some are directories of files. - tools/[capable](tools/capable.py): Trace security capability checks. [Examples](tools/capable_example.txt). - tools/[cachestat](tools/cachestat.py): Trace page cache hit/miss ratio. [Examples](tools/cachestat_example.txt). - tools/[cachetop](tools/cachetop.py): Trace page cache hit/miss ratio by processes. [Examples](tools/cachetop_example.txt). +- tools/[compactsnoop](tools/compactsnoop.py): Trace compact zone events with PID and latency. [Examples](tools/compactsnoop_example.txt). - tools/[cpudist](tools/cpudist.py): Summarize on- and off-CPU time per task as a histogram. [Examples](tools/cpudist_example.txt) - tools/[cpuunclaimed](tools/cpuunclaimed.py): Sample CPU run queues and calculate unclaimed idle CPU. [Examples](tools/cpuunclaimed_example.txt) - tools/[criticalstat](tools/criticalstat.py): Trace and report long atomic critical sections in the kernel. [Examples](tools/criticalstat_example.txt) diff --git a/man/man8/compactsnoop.8 b/man/man8/compactsnoop.8 new file mode 100644 index 000000000000..d110a2b1d683 --- /dev/null +++ b/man/man8/compactsnoop.8 @@ -0,0 +1,179 @@ +.TH compactsnoop 8 "2019-11-1" "USER COMMANDS" +.SH NAME +compactstall \- Trace compact zone events. Uses Linux eBPF/bcc. +.SH SYNOPSIS +.B compactsnoop.py [\-h] [\-T] [\-p PID] [\-d DURATION] [\-K] [\-e] +.SH DESCRIPTION +compactsnoop traces the compact zone events, showing which processes are +allocing pages with memory compaction. This can be useful for discovering +when compact_stall (/proc/vmstat) continues to increase, whether it is +caused by some critical processes or not. + +This works by tracing the compact zone events using raw_tracepoints and one +kretprobe. + +For the Centos 7.6 (3.10.x kernel), see the version under tools/old, which +uses an older memory compaction mechanism. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +.TP +\-h +Print usage message. +.TP +\-T +Include a timestamp column. +.TP +\-p PID +Trace this process ID only (filtered in-kernel). +.TP +\-d DURATION +Total duration of trace in seconds. +.TP +\-K +Output kernel stack trace +.TP +\-e +Show extended fields. +.SH EXAMPLES +.TP +Trace all compact zone events: +# +.B compactsnoop +.TP +Trace all compact zone events, for 10 seconds only: +# +.B compactsnoop -d 10 +.SH FIELDS +.TP +TIME(s) +Time of the call, in seconds. +.TP +COMM +Process name +.TP +PID +Process ID +.TP +NODE +Memory node +.TP +ZONE +Zone of the node (such as DMA, DMA32, NORMAL eg) +.TP +ORDER +Shows which order alloc cause memory compaction, -1 means all orders (eg: write +to /proc/sys/vm/compact_memory) +.TP +MODE +SYNC OR ASYNC +.TP +FRAGIDX (extra column) +The FRAGIDX is short for fragmentation index, which only makes sense if an +allocation of a requested size would fail. If that is true, the fragmentation +index indicates whether external fragmentation or a lack of memory was the +problem. The value can be used to determine if page reclaim or compaction +should be used. +.PP +.in +8n +Index is between 0 and 1 so return within 3 decimal places +.PP +.in +8n +0 => allocation would fail due to lack of memory +.PP +.in +8n +1 => allocation would fail due to fragmentation +.TP +MIN (extra column) +The min watermark of the zone +.TP +LOW (extra column) +The low watermark of the zone +.TP +HIGH (extra column) +The high watermark of the zone +.TP +FREE (extra column) +The nr_free_pages of the zone +.TP +LAT(ms) +compact zone's latency +.TP +STATUS +The compaction's result. +.PP +.in +8n +For (CentOS 7.6's kernel), the status include: +.PP +.in +8n +"skipped" (COMPACT_SKIPPED): compaction didn't start as it was not possible or +direct reclaim was more suitable +.PP +.in +8n +"continue" (COMPACT_CONTINUE): compaction should continue to another pageblock +.PP +.in +8n +"partial" (COMPACT_PARTIAL): direct compaction partially compacted a zone and +there are suitable pages +.PP +.in +8n +"complete" (COMPACT_COMPLETE): The full zone was compacted +.PP +.in +8n +For (kernel 4.7 and above): +.PP +.in +8n +"not_suitable_zone" (COMPACT_NOT_SUITABLE_ZONE): For more detailed tracepoint +output - internal to compaction +.PP +.in +8n +"skipped" (COMPACT_SKIPPED): compaction didn't start as it was not possible or +direct reclaim was more suitable +.PP +.in +8n +"deferred" (COMPACT_DEFERRED): compaction didn't start as it was deferred due +to past failures +.PP +.in +8n +"no_suitable_page" (COMPACT_NOT_SUITABLE_PAGE): For more detailed tracepoint +output - internal to compaction +.PP +.in +8n +"continue" (COMPACT_CONTINUE): compaction should continue to another pageblock +.PP +.in +8n +"complete" (COMPACT_COMPLETE): The full zone was compacted scanned but wasn't +successfull to compact suitable pages. +.PP +.in +8n +"partial_skipped" (COMPACT_PARTIAL_SKIPPED): direct compaction has scanned part +of the zone but wasn't successfull to compact suitable pages. +.PP +.in +8n +"contended" (COMPACT_CONTENDED): compaction terminated prematurely due to lock +contentions +.PP +.in +8n +"success" (COMPACT_SUCCESS): direct compaction terminated after concluding that +the allocation should now succeed +.PP +.in +8n +.SH OVERHEAD +This traces the kernel compact zone kprobe/kretprobe or raw_tracepoints and +prints output for each event. As the rate of this is generally expected to be +low (< 1000/s), the overhead is also expected to be negligible. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Ethercflow diff --git a/snapcraft/snapcraft.yaml b/snapcraft/snapcraft.yaml index ee815b9572d7..5152575cc0d7 100644 --- a/snapcraft/snapcraft.yaml +++ b/snapcraft/snapcraft.yaml @@ -101,6 +101,8 @@ apps: command: bcc-wrapper capable cobjnew: command: bcc-wrapper cobjnew + compactsnoop: + command: bcc-wrapper compactsnoop cpudist: command: bcc-wrapper cpudist cpuunclaimed: diff --git a/tests/python/test_tools_smoke.py b/tests/python/test_tools_smoke.py index e1b10ae2810c..50f20b4dc547 100755 --- a/tests/python/test_tools_smoke.py +++ b/tests/python/test_tools_smoke.py @@ -117,6 +117,10 @@ def test_cpudist(self): def test_cpuunclaimed(self): self.run_with_duration("cpuunclaimed.py 1 1") + @skipUnless(kernel_version_ge(4,17), "requires kernel >= 4.17") + def test_compactsnoop(self): + self.run_with_int("compactsnoop.py") + @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") def test_dbslower(self): # Deliberately left empty -- dbslower requires an instance of either diff --git a/tools/compactsnoop.py b/tools/compactsnoop.py new file mode 100755 index 000000000000..7bdf33c18131 --- /dev/null +++ b/tools/compactsnoop.py @@ -0,0 +1,400 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# compactsnoop Trace compact zone and print details including issuing PID. +# For Linux, uses BCC, eBPF. +# +# This uses in-kernel eBPF maps to cache process details (PID and comm) by +# compact zone begin, as well as a starting timestamp for calculating +# latency. +# +# Copyright (c) 2019 Wenbo Zhang +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 11-NOV-2019 Wenbo Zhang Created this. + +from __future__ import print_function +from bcc import BPF +import argparse +import platform +from datetime import datetime, timedelta + +# arguments +examples = """examples: + ./compactsnoop # trace all compact stall + ./compactsnoop -T # include timestamps + ./compactsnoop -d 10 # trace for 10 seconds only + ./compactsnoop -K # output kernel stack trace + ./compactsnoop -e # show extended fields +""" + +parser = argparse.ArgumentParser( + description="Trace compact zone", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples, +) +parser.add_argument("-T", "--timestamp", action="store_true", + help="include timestamp on output") +parser.add_argument("-p", "--pid", help="trace this PID only") +parser.add_argument("-d", "--duration", + help="total duration of trace in seconds") +parser.add_argument("-K", "--kernel-stack", action="store_true", + help="output kernel stack trace") +parser.add_argument("-e", "--extended_fields", action="store_true", + help="show system memory state") +parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS) +args = parser.parse_args() +debug = 0 +if args.duration: + args.duration = timedelta(seconds=int(args.duration)) + +NO_EXTENDED = """ +#ifdef EXTNEDED_FIELDS +#undef EXTNEDED_FIELDS +#endif +""" + +EXTENDED = """ +#define EXTNEDED_FIELDS 1 +""" + +bpf_text = """ +#include +#include +#include +#include + +struct val_t { + int nid; + int idx; + int order; + int sync; +#ifdef EXTNEDED_FIELDS + int fragindex; + int low; + int min; + int high; + int free; +#endif + u64 ts; // compaction begin time +}; + +struct data_t { + u32 pid; + u32 tid; + int nid; + int idx; + int order; + u64 delta; + u64 ts; // compaction end time + int sync; +#ifdef EXTNEDED_FIELDS + int fragindex; + int low; + int min; + int high; + int free; +#endif + int status; + int stack_id; + char comm[TASK_COMM_LEN]; +}; + +BPF_HASH(start, u64, struct val_t); +BPF_PERF_OUTPUT(events); +BPF_STACK_TRACE(stack_traces, 2048); + +#ifdef CONFIG_NUMA +static inline int zone_to_nid_(struct zone *zone) +{ + int node; + bpf_probe_read(&node, sizeof(node), &zone->node); + return node; +} +#else +static inline int zone_to_nid_(struct zone *zone) +{ + return 0; +} +#endif + +// #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) +static inline int zone_idx_(struct zone *zone) +{ + struct pglist_data *zone_pgdat = NULL; + bpf_probe_read(&zone_pgdat, sizeof(zone_pgdat), &zone->zone_pgdat); + return zone - zone_pgdat->node_zones; +} + +#ifdef EXTNEDED_FIELDS +static inline void get_all_wmark_pages(struct zone *zone, struct val_t *valp) +{ + u64 _watermark[NR_WMARK] = {}; + u64 watermark_boost = 0; + + bpf_probe_read(&_watermark, sizeof(_watermark), &zone->_watermark); + bpf_probe_read(&watermark_boost, sizeof(watermark_boost), + &zone->watermark_boost); + valp->min = _watermark[WMARK_MIN] + watermark_boost; + valp->low = _watermark[WMARK_LOW] + watermark_boost; + valp->high = _watermark[WMARK_HIGH] + watermark_boost; + bpf_probe_read(&valp->free, sizeof(valp->free), + &zone->vm_stat[NR_FREE_PAGES]); +} +#endif + +static inline void submit_event(void *ctx, int status) +{ + struct data_t data = {}; + u64 ts = bpf_ktime_get_ns(); + u64 id = bpf_get_current_pid_tgid(); + struct val_t *valp = start.lookup(&id); + if (valp == NULL) { + // missed entry + return; + } + + data.delta = ts - valp->ts; + data.ts = ts / 1000; + data.pid = id >> 32; + data.tid = id; + bpf_get_current_comm(&data.comm, sizeof(data.comm)); + data.nid = valp->nid; + data.idx = valp->idx; + data.order = valp->order; + data.sync = valp->sync; + +#ifdef EXTNEDED_FIELDS + data.fragindex = valp->fragindex; + data.min = valp->min; + data.low = valp->low; + data.high = valp->high; + data.free = valp->free; +#endif + + data.status = status; + data.stack_id = stack_traces.get_stackid(ctx, 0); + + events.perf_submit(ctx, &data, sizeof(data)); + + start.delete(&id); +} + +#ifdef EXTNEDED_FIELDS +int trace_fragmentation_index_return(struct pt_regs *ctx) +{ + struct val_t val = { }; + int ret = PT_REGS_RC(ctx); + u64 id = bpf_get_current_pid_tgid(); + PID_FILTER + val.fragindex = ret; + start.update(&id, &val); + return 0; +} +#endif + +static inline void fill_compact_info(struct val_t *valp, + struct zone *zone, + int order) +{ + valp->nid = zone_to_nid_(zone); + valp->idx = zone_idx_(zone); + valp->order = order; +} + +RAW_TRACEPOINT_PROBE(mm_compaction_suitable) +{ + // TP_PROTO(struct zone *zone, int order, int ret) + struct zone *zone = (struct zone *)ctx->args[0]; + int order = (int)ctx->args[1]; + int ret = (int)ctx->args[2]; + u64 id; + + if(ret != COMPACT_CONTINUE) + return 0; + + id = bpf_get_current_pid_tgid(); + PID_FILTER + +#ifdef EXTNEDED_FIELDS + struct val_t *valp = start.lookup(&id); + if (valp == NULL) { + // missed entry or order <= PAGE_ALLOC_COSTLY_ORDER, eg: + // manual trigger echo 1 > /proc/sys/vm/compact_memory + struct val_t val = { .fragindex = -1000 }; + valp = &val; + start.update(&id, valp); + } + fill_compact_info(valp, zone, order); + get_all_wmark_pages(zone, valp); +#else + struct val_t val = { }; + fill_compact_info(&val, zone, order); + start.update(&id, &val); +#endif + + return 0; +} + +RAW_TRACEPOINT_PROBE(mm_compaction_begin) +{ + // TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, + // unsigned long free_pfn, unsigned long zone_end, bool sync) + bool sync = (bool)ctx->args[4]; + + u64 id = bpf_get_current_pid_tgid(); + struct val_t *valp = start.lookup(&id); + if (valp == NULL) { + // missed entry + return 0; + } + + valp->ts = bpf_ktime_get_ns(); + valp->sync = sync; + return 0; +} + +RAW_TRACEPOINT_PROBE(mm_compaction_end) +{ + // TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, + // unsigned long free_pfn, unsigned long zone_end, bool sync, + // int status) + submit_event(ctx, ctx->args[5]); + return 0; +} +""" + +if platform.machine() != 'x86_64': + print(""" + Currently only support x86_64 servers, if you want to use it on + other platforms, please refer include/linux/mmzone.h to modify + zone_idex_to_str to get the right zone type + """) + exit() + +if args.extended_fields: + bpf_text = EXTENDED + bpf_text +else: + bpf_text = NO_EXTENDED + bpf_text + +if args.pid: + bpf_text = bpf_text.replace("PID_FILTER", + "if (id >> 32 != %s) { return 0; }" % args.pid) +else: + bpf_text = bpf_text.replace("PID_FILTER", "") +if debug or args.ebpf: + print(bpf_text) + if args.ebpf: + exit() + +# load BPF program +b = BPF(text=bpf_text) +if args.extended_fields: + b.attach_kretprobe(event="fragmentation_index", + fn_name="trace_fragmentation_index_return") + +stack_traces = b.get_table("stack_traces") +initial_ts = 0 + +def zone_idx_to_str(idx): + # from include/linux/mmzone.h + # NOTICE: consider only x86_64 servers + zone_type = { + 0: "ZONE_DMA", + 1: "ZONE_DMA32", + 2: "ZONE_NORMAL", + } + + if idx in zone_type: + return zone_type[idx] + else: + return str(idx) + +def compact_result_to_str(status): + # from include/trace/evnets/mmflags.h + # from include/linux/compaction.h + compact_status = { + # COMPACT_NOT_SUITABLE_ZONE: For more detailed tracepoint + # output - internal to compaction + 0: "not_suitable_zone", + # COMPACT_SKIPPED: compaction didn't start as it was not + # possible or direct reclaim was more suitable + 1: "skipped", + # COMPACT_DEFERRED: compaction didn't start as it was + # deferred due to past failures + 2: "deferred", + # COMPACT_NOT_SUITABLE_PAGE: For more detailed tracepoint + # output - internal to compaction + 3: "no_suitable_page", + # COMPACT_CONTINUE: compaction should continue to another pageblock + 4: "continue", + # COMPACT_COMPLETE: The full zone was compacted scanned but wasn't + # successfull to compact suitable pages. + 5: "complete", + # COMPACT_PARTIAL_SKIPPED: direct compaction has scanned part of the + # zone but wasn't successfull to compact suitable pages. + 6: "partial_skipped", + # COMPACT_CONTENDED: compaction terminated prematurely due to lock + # contentions + 7: "contended", + # COMPACT_SUCCESS: direct compaction terminated after concluding + # that the allocation should now succeed + 8: "success", + } + + if status in compact_status: + return compact_status[status] + else: + return str(status) + +# header +if args.timestamp: + print("%-14s" % ("TIME(s)"), end=" ") +print("%-14s %-6s %-4s %-12s %-5s %-7s" % + ("COMM", "PID", "NODE", "ZONE", "ORDER", "MODE"), end=" ") +if args.extended_fields: + print("%-8s %-8s %-8s %-8s %-8s" % + ("FRAGIDX", "MIN", "LOW", "HIGH", "FREE"), end=" ") +print("%9s %16s" % ("LAT(ms)", "STATUS")) + +# process event +def print_event(cpu, data, size): + event = b["events"].event(data) + + global initial_ts + + if not initial_ts: + initial_ts = event.ts + + if args.timestamp: + delta = event.ts - initial_ts + print("%-14.9f" % (float(delta) / 1000000), end=" ") + + print("%-14.14s %-6s %-4s %-12s %-5s %-7s" % ( + event.comm.decode("utf-8", "replace"), + event.pid, + event.nid, + zone_idx_to_str(event.idx), + event.order, + "SYNC" if event.sync else "ASYNC"), end=" ") + if args.extended_fields: + print("%-8.3f %-8s %-8s %-8s %-8s" % ( + (float(event.fragindex) / 1000), + event.min, event.low, event.high, event.free + ), end=" ") + print("%9.3f %16s" % ( + float(event.delta) / 1000000, compact_result_to_str(event.status))) + if args.kernel_stack: + for addr in stack_traces.walk(event.stack_id): + sym = b.ksym(addr, show_offset=True) + print("\t%s" % sym) + print("") + +# loop with callback to print_event +b["events"].open_perf_buffer(print_event, page_cnt=64) +start_time = datetime.now() +while not args.duration or datetime.now() - start_time < args.duration: + try: + b.perf_buffer_poll() + except KeyboardInterrupt: + exit() diff --git a/tools/compactsnoop_example.txt b/tools/compactsnoop_example.txt new file mode 100644 index 000000000000..563ee80f6de2 --- /dev/null +++ b/tools/compactsnoop_example.txt @@ -0,0 +1,203 @@ +Demonstrations of compactstall, the Linux eBPF/bcc version. + + +compactsnoop traces the compact zone system-wide, and print various details. +Example output (manual trigger by echo 1 > /proc/sys/vm/compact_memory): + +# ./compactsnoop +COMM PID NODE ZONE ORDER MODE LAT(ms) STATUS +zsh 23685 0 ZONE_DMA -1 SYNC 0.025 complete +zsh 23685 0 ZONE_DMA32 -1 SYNC 3.925 complete +zsh 23685 0 ZONE_NORMAL -1 SYNC 113.975 complete +zsh 23685 1 ZONE_NORMAL -1 SYNC 81.57 complete +zsh 23685 0 ZONE_DMA -1 SYNC 0.02 complete +zsh 23685 0 ZONE_DMA32 -1 SYNC 4.631 complete +zsh 23685 0 ZONE_NORMAL -1 SYNC 113.975 complete +zsh 23685 1 ZONE_NORMAL -1 SYNC 80.647 complete +zsh 23685 0 ZONE_DMA -1 SYNC 0.020 complete +zsh 23685 0 ZONE_DMA32 -1 SYNC 3.367 complete +zsh 23685 0 ZONE_NORMAL -1 SYNC 115.18 complete +zsh 23685 1 ZONE_NORMAL -1 SYNC 81.766 complete +zsh 23685 0 ZONE_DMA -1 SYNC 0.025 complete +zsh 23685 0 ZONE_DMA32 -1 SYNC 4.346 complete +zsh 23685 0 ZONE_NORMAL -1 SYNC 114.570 complete +zsh 23685 1 ZONE_NORMAL -1 SYNC 80.820 complete +zsh 23685 0 ZONE_DMA -1 SYNC 0.026 complete +zsh 23685 0 ZONE_DMA32 -1 SYNC 4.611 complete +zsh 23685 0 ZONE_NORMAL -1 SYNC 113.993 complete +zsh 23685 1 ZONE_NORMAL -1 SYNC 80.928 complete +zsh 23685 0 ZONE_DMA -1 SYNC 0.02 complete +zsh 23685 0 ZONE_DMA32 -1 SYNC 3.889 complete +zsh 23685 0 ZONE_NORMAL -1 SYNC 113.776 complete +zsh 23685 1 ZONE_NORMAL -1 SYNC 80.727 complete +^C + +While tracing, the processes alloc pages due to memory fragmentation is too +serious to meet contiguous memory requirements in the system, compact zone +events happened, which will increase the waiting delay of the processes. + +compactsnoop can be useful for discovering when compact_stall(/proc/vmstat) +continues to increase, whether it is caused by some critical processes or not. + +The STATUS include (CentOS 7.6's kernel) + + compact_status = { + # COMPACT_SKIPPED: compaction didn't start as it was not possible or direct reclaim was more suitable + 0: "skipped", + # COMPACT_CONTINUE: compaction should continue to another pageblock + 1: "continue", + # COMPACT_PARTIAL: direct compaction partially compacted a zone and there are suitable pages + 2: "partial", + # COMPACT_COMPLETE: The full zone was compacted + 3: "complete", + } + +or (kernel 4.7 and above) + + compact_status = { + # COMPACT_NOT_SUITABLE_ZONE: For more detailed tracepoint output - internal to compaction + 0: "not_suitable_zone", + # COMPACT_SKIPPED: compaction didn't start as it was not possible or direct reclaim was more suitable + 1: "skipped", + # COMPACT_DEFERRED: compaction didn't start as it was deferred due to past failures + 2: "deferred", + # COMPACT_NOT_SUITABLE_PAGE: For more detailed tracepoint output - internal to compaction + 3: "no_suitable_page", + # COMPACT_CONTINUE: compaction should continue to another pageblock + 4: "continue", + # COMPACT_COMPLETE: The full zone was compacted scanned but wasn't successfull to compact suitable pages. + 5: "complete", + # COMPACT_PARTIAL_SKIPPED: direct compaction has scanned part of the zone but wasn't successfull to compact suitable pages. + 6: "partial_skipped", + # COMPACT_CONTENDED: compaction terminated prematurely due to lock contentions + 7: "contended", + # COMPACT_SUCCESS: direct compaction terminated after concluding that the allocation should now succeed + 8: "success", + } + +The -p option can be used to filter on a PID, which is filtered in-kernel. Here +I've used it with -T to print timestamps: + +# ./compactsnoop -Tp 24376 +TIME(s) COMM PID NODE ZONE ORDER MODE LAT(ms) STATUS +101.364115000 zsh 24376 0 ZONE_DMA -1 SYNC 0.025 complete +101.364555000 zsh 24376 0 ZONE_DMA32 -1 SYNC 3.925 complete +^C + +This shows the zsh process allocs pages, and compact zone events happening, +and the delays are not affected much. + +A maximum tracing duration can be set with the -d option. For example, to trace +for 2 seconds: + +# ./compactsnoop -d 2 +COMM PID NODE ZONE ORDER MODE LAT(ms) STATUS +zsh 26385 0 ZONE_DMA -1 SYNC 0.025444 complete +^C + +The -e option prints out extra columns + +# ./compactsnoop -e +COMM PID NODE ZONE ORDER MODE FRAGIDX MIN LOW HIGH FREE LAT(ms) STATUS +summ 28276 1 ZONE_NORMAL 3 ASYNC 0.728 11284 14105 16926 14193 3.58 partial +summ 28276 0 ZONE_NORMAL 2 ASYNC -1.000 11043 13803 16564 14479 0.0 complete +summ 28276 1 ZONE_NORMAL 2 ASYNC -1.000 11284 14105 16926 14785 0.019 complete +summ 28276 0 ZONE_NORMAL 2 ASYNC -1.000 11043 13803 16564 15199 0.006 partial +summ 28276 1 ZONE_NORMAL 2 ASYNC -1.000 11284 14105 16926 17360 0.030 complete +summ 28276 0 ZONE_NORMAL 2 ASYNC -1.000 11043 13803 16564 15443 0.024 complete +summ 28276 1 ZONE_NORMAL 2 ASYNC -1.000 11284 14105 16926 15634 0.018 complete +summ 28276 1 ZONE_NORMAL 3 ASYNC 0.832 11284 14105 16926 15301 0.006 partial +summ 28276 0 ZONE_NORMAL 2 ASYNC -1.000 11043 13803 16564 14774 0.005 partial +summ 28276 1 ZONE_NORMAL 3 ASYNC 0.733 11284 14105 16926 19888 0.012 partial +^C + +The FRAGIDX is short for fragmentation index, which only makes sense if an +allocation of a requested size would fail. If that is true, the fragmentation +index indicates whether external fragmentation or a lack of memory was the +problem. The value can be used to determine if page reclaim or compaction +should be used. + +Index is between 0 and 1 so return within 3 decimal places + +0 => allocation would fail due to lack of memory +1 => allocation would fail due to fragmentation + +We can see the whole buddy's fragmentation index from /sys/kernel/debug/extfrag/extfrag_index + +The MIN/LOW/HIGH shows the watermarks of the zone, which can also get from +/proc/zoneinfo, and FREE means nr_free_pages (can be found in /proc/zoneinfo too). + + +The -K option prints out kernel stack + +# ./compactsnoop -K -e + +summ 28276 0 ZONE_NORMAL 3 ASYNC 0.528 11043 13803 16564 22654 13.258 partial + kretprobe_trampoline+0x0 + try_to_compact_pages+0x121 + __alloc_pages_direct_compact+0xac + __alloc_pages_slowpath+0x3e9 + __alloc_pages_nodemask+0x404 + alloc_pages_current+0x98 + new_slab+0x2c5 + ___slab_alloc+0x3ac + __slab_alloc+0x40 + kmem_cache_alloc_node+0x8b + copy_process+0x18e + do_fork+0x91 + sys_clone+0x16 + stub_clone+0x44 + +summ 28276 1 ZONE_NORMAL 3 ASYNC -1.000 11284 14105 16926 22074 0.008 partial + kretprobe_trampoline+0x0 + try_to_compact_pages+0x121 + __alloc_pages_direct_compact+0xac + __alloc_pages_slowpath+0x3e9 + __alloc_pages_nodemask+0x404 + alloc_pages_current+0x98 + new_slab+0x2c5 + ___slab_alloc+0x3ac + __slab_alloc+0x40 + kmem_cache_alloc_node+0x8b + copy_process+0x18e + do_fork+0x91 + sys_clone+0x16 + stub_clone+0x44 + +summ 28276 0 ZONE_NORMAL 3 ASYNC 0.527 11043 13803 16564 25653 9.812 partial + kretprobe_trampoline+0x0 + try_to_compact_pages+0x121 + __alloc_pages_direct_compact+0xac + __alloc_pages_slowpath+0x3e9 + __alloc_pages_nodemask+0x404 + alloc_pages_current+0x98 + new_slab+0x2c5 + ___slab_alloc+0x3ac + __slab_alloc+0x40 + kmem_cache_alloc_node+0x8b + copy_process+0x18e + do_fork+0x91 + sys_clone+0x16 + stub_clone+0x44 + +# ./compactsnoop -h +usage: compactsnoop.py [-h] [-T] [-p PID] [-d DURATION] [-K] [-e] + +Trace compact zone + +optional arguments: + -h, --help show this help message and exit + -T, --timestamp include timestamp on output + -p PID, --pid PID trace this PID only + -d DURATION, --duration DURATION + total duration of trace in seconds + -K, --kernel-stack output kernel stack trace + -e, --extended_fields + show system memory state + +examples: + ./compactsnoop # trace all compact stall + ./compactsnoop -T # include timestamps + ./compactsnoop -d 10 # trace for 10 seconds only + ./compactsnoop -K # output kernel stack trace + ./compactsnoop -e # show extended fields diff --git a/tools/old/compactsnoop.py b/tools/old/compactsnoop.py new file mode 100755 index 000000000000..c544041763f4 --- /dev/null +++ b/tools/old/compactsnoop.py @@ -0,0 +1,390 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# compactsnoop Trace compact zone and print details including issuing PID. +# For Linux, uses BCC, eBPF. +# +# This uses in-kernel eBPF maps to cache process details (PID and comm) by +# compact zone begin, as well as a starting timestamp for calculating +# latency. +# +# Copyright (c) 2019 Wenbo Zhang +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 11-NOV-2019 Wenbo Zhang Created this. + +from __future__ import print_function +from bcc import BPF +import argparse +import platform +from datetime import datetime, timedelta + +# arguments +examples = """examples: + ./compactsnoop # trace all compact stall + ./compactsnoop -T # include timestamps + ./compactsnoop -d 10 # trace for 10 seconds only + ./compactsnoop -K # output kernel stack trace + ./compactsnoop -e # show extended fields +""" + +parser = argparse.ArgumentParser( + description="Trace compact zone", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples, +) +parser.add_argument("-T", "--timestamp", action="store_true", + help="include timestamp on output") +parser.add_argument("-p", "--pid", help="trace this PID only") +parser.add_argument("-d", "--duration", + help="total duration of trace in seconds") +parser.add_argument("-K", "--kernel-stack", action="store_true", + help="output kernel stack trace") +parser.add_argument("-e", "--extended_fields", action="store_true", + help="show system memory state") +parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS) +args = parser.parse_args() +debug = 0 +if args.duration: + args.duration = timedelta(seconds=int(args.duration)) + +NO_EXTENDED = """ +#ifdef EXTNEDED_FIELDS +#undef EXTNEDED_FIELDS +#endif +""" + +EXTENDED = """ +#define EXTNEDED_FIELDS 1 +""" + +bpf_text = """ +#include +#include +#include +struct node; +#include + +struct compact_control { + struct list_head freepages; /* List of free pages to migrate to */ + struct list_head migratepages; /* List of pages being migrated */ + unsigned long nr_freepages; /* Number of isolated free pages */ + unsigned long nr_migratepages; /* Number of pages to migrate */ + unsigned long free_pfn; /* isolate_freepages search base */ + unsigned long migrate_pfn; /* isolate_migratepages search base */ + bool sync; /* Synchronous migration */ +}; + +struct val_t { + int nid; + int idx; + int order; + int sync; +#ifdef EXTNEDED_FIELDS + int fragindex; + int low; + int min; + int high; + int free; +#endif + u64 ts; // compaction begin time +}; + +struct data_t { + u32 pid; + u32 tid; + int nid; + int idx; + int order; + u64 delta; + u64 ts; // compaction end time + int sync; +#ifdef EXTNEDED_FIELDS + int fragindex; + int low; + int min; + int high; + int free; +#endif + int status; + int stack_id; + char comm[TASK_COMM_LEN]; +}; + +BPF_HASH(start, u64, struct val_t); +BPF_PERF_OUTPUT(events); +BPF_STACK_TRACE(stack_traces, 2048); + +#ifdef CONFIG_NUMA +static inline int zone_to_nid_(struct zone *zone) +{ + int node; + bpf_probe_read(&node, sizeof(node), &zone->node); + return node; +} +#else +static inline int zone_to_nid_(struct zone *zone) +{ + return 0; +} +#endif + +// #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) +static inline int zone_idx_(struct zone *zone) +{ + struct pglist_data *zone_pgdat = NULL; + bpf_probe_read(&zone_pgdat, sizeof(zone_pgdat), &zone->zone_pgdat); + return zone - zone_pgdat->node_zones; +} + +#ifdef EXTNEDED_FIELDS +static inline void get_all_wmark_pages(struct zone *zone, struct val_t *valp) +{ + u64 watermark[NR_WMARK] = {}; + u64 watermark_boost = 0; + + bpf_probe_read(&watermark, sizeof(watermark), &zone->watermark); + valp->min = watermark[WMARK_MIN]; + valp->low = watermark[WMARK_LOW]; + valp->high = watermark[WMARK_HIGH]; + bpf_probe_read(&valp->free, sizeof(valp->free), + &zone->vm_stat[NR_FREE_PAGES]); +} +#endif + +int trace_compact_zone_entry(struct pt_regs *ctx, struct zone *zone, + struct compact_control *cc) +{ +#ifdef EXTNEDED_FIELDS + struct val_t val = { .fragindex=-1000 }; +#else + struct val_t val = { }; +#endif + u64 id = bpf_get_current_pid_tgid(); + PID_FILTER + val.sync = cc->sync; + start.update(&id, &val); + return 0; +} + +int trace_compaction_suitable_entry(struct pt_regs *ctx, struct zone *zone, + int order) +{ + u64 id = bpf_get_current_pid_tgid(); + struct val_t *valp = start.lookup(&id); + if (valp == NULL) { + // missed entry + return 0; + } + valp->nid = zone_to_nid_(zone); + valp->idx = zone_idx_(zone); + valp->order = order; + +#ifdef EXTNEDED_FIELDS + get_all_wmark_pages(zone, valp); +#endif + + return 0; +} + +int trace_fragmentation_index_return(struct pt_regs *ctx) +{ + int ret = PT_REGS_RC(ctx); + u64 id = bpf_get_current_pid_tgid(); + struct val_t *valp = start.lookup(&id); + if (valp == NULL) { + // missed entry + return 0; + } +#ifdef EXTNEDED_FIELDS + valp->fragindex = ret; +#endif + return 0; +} + +int trace_compaction_suitable_return(struct pt_regs *ctx) +{ + int ret = PT_REGS_RC(ctx); + u64 id = bpf_get_current_pid_tgid(); + struct val_t *valp = start.lookup(&id); + if (valp == NULL) { + // missed entry + return 0; + } + if (ret != COMPACT_CONTINUE) + start.delete(&id); + else + valp->ts = bpf_ktime_get_ns(); + return 0; +} + +int trace_compact_zone_return(struct pt_regs *ctx) +{ + int ret = PT_REGS_RC(ctx); + struct data_t data = {}; + u64 ts = bpf_ktime_get_ns(); + u64 id = bpf_get_current_pid_tgid(); + struct val_t *valp = start.lookup(&id); + if (valp == NULL) { + // missed entry or unsuitable + return 0; + } + + data.delta = ts - valp->ts; + data.ts = ts / 1000; + data.pid = id >> 32; + data.tid = id; + bpf_get_current_comm(&data.comm, sizeof(data.comm)); + data.nid = valp->nid; + data.idx = valp->idx; + data.order = valp->order; + data.sync = valp->sync; + +#ifdef EXTNEDED_FIELDS + data.fragindex = valp->fragindex; + data.min = valp->min; + data.low = valp->low; + data.high = valp->high; + data.free = valp->free; +#endif + + data.status = ret; + data.stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID); + + events.perf_submit(ctx, &data, sizeof(data)); + + start.delete(&id); + return 0; +} +""" + +if platform.machine() != 'x86_64': + print(""" + Currently only support x86_64 servers, if you want to use it on + other platforms, please refer include/linux/mmzone.h to modify + zone_idex_to_str to get the right zone type + """) + exit() + +if args.extended_fields: + bpf_text = EXTENDED + bpf_text +else: + bpf_text = NO_EXTENDED + bpf_text + +if args.pid: + bpf_text = bpf_text.replace( + "PID_FILTER", "if (id >> 32 != %s) { return 0; }" % args.pid) +else: + bpf_text = bpf_text.replace("PID_FILTER", "") +if debug or args.ebpf: + print(bpf_text) + if args.ebpf: + exit() + +# load BPF program +b = BPF(text=bpf_text) +b.attach_kprobe(event="compact_zone", fn_name="trace_compact_zone_entry") +b.attach_kretprobe(event="compact_zone", fn_name="trace_compact_zone_return") +b.attach_kprobe( + event="compaction_suitable", fn_name="trace_compaction_suitable_entry" +) +b.attach_kretprobe( + event="fragmentation_index", fn_name="trace_fragmentation_index_return" +) +b.attach_kretprobe( + event="compaction_suitable", fn_name="trace_compaction_suitable_return" +) + +stack_traces = b.get_table("stack_traces") +initial_ts = 0 + +def zone_idx_to_str(idx): + # from include/linux/mmzone.h + # NOTICE: consider only x86_64 servers + zonetype = { + 0: "ZONE_DMA", + 1: "ZONE_DMA32", + 2: "ZONE_NORMAL", + } + + if idx in zonetype: + return zonetype[idx] + else: + return str(idx) + +def compact_result_to_str(status): + # from include/linux/compaction.h + compact_status = { + # COMPACT_SKIPPED: compaction didn't start as it was not possible + # or direct reclaim was more suitable + 0: "skipped", + # COMPACT_CONTINUE: compaction should continue to another pageblock + 1: "continue", + # COMPACT_PARTIAL: direct compaction partially compacted a zone and + # there are suitable pages + 2: "partial", + # COMPACT_COMPLETE: The full zone was compacted + 3: "complete", + } + + if status in compact_status: + return compact_status[status] + else: + return str(status) + +# header +if args.timestamp: + print("%-14s" % ("TIME(s)"), end=" ") +print( + "%-14s %-6s %-4s %-12s %-5s %-7s" + % ("COMM", "PID", "NODE", "ZONE", "ORDER", "MODE"), + end=" ", +) +if args.extended_fields: + print("%-8s %-8s %-8s %-8s %-8s" % + ("FRAGIDX", "MIN", "LOW", "HIGH", "FREE"), end=" ") +print("%9s %16s" % ("LAT(ms)", "STATUS")) + +# process event +def print_event(cpu, data, size): + event = b["events"].event(data) + + global initial_ts + + if not initial_ts: + initial_ts = event.ts + + if args.timestamp: + delta = event.ts - initial_ts + print("%-14.9f" % (float(delta) / 1000000), end=" ") + + print("%-14.14s %-6s %-4s %-12s %-5s %-7s" % ( + event.comm.decode("utf-8", "replace"), + event.pid, + event.nid, + zone_idx_to_str(event.idx), + event.order, + "SYNC" if event.sync else "ASYNC"), end=" ") + if args.extended_fields: + print("%-8.3f %-8s %-8s %-8s %-8s" % ( + float(event.fragindex) / 1000, + event.min, + event.low, + event.high, + event.free), end=" ") + print("%9.3f %16s" % ( + float(event.delta) / 1000000, compact_result_to_str(event.status))) + if args.kernel_stack: + for addr in stack_traces.walk(event.stack_id): + sym = b.ksym(addr, show_offset=True) + print("\t%s" % sym) + print("") + +# loop with callback to print_event +b["events"].open_perf_buffer(print_event, page_cnt=64) +start_time = datetime.now() +while not args.duration or datetime.now() - start_time < args.duration: + try: + b.perf_buffer_poll() + except KeyboardInterrupt: + exit()