author    We-unite <3205135446@qq.com>  2025-03-08 22:04:20 +0800
committer We-unite <3205135446@qq.com>  2025-03-08 22:04:20 +0800
commit    a07bb8fd1299070229f0e8f3dcb57ffd5ef9870a (patch)
tree      84f21bd0bf7071bc5fc7dd989e77d7ceb5476682 /samples
Initial commit: OpenHarmony-v4.0-Release (tag: OpenHarmony-v4.0-Release)
Diffstat (limited to 'samples')
-rw-r--r-- samples/Kconfig | 242
-rw-r--r-- samples/Makefile | 32
-rw-r--r-- samples/auxdisplay/.gitignore | 2
-rw-r--r-- samples/auxdisplay/Makefile | 2
-rw-r--r-- samples/auxdisplay/cfag12864b-example.c | 267
-rw-r--r-- samples/binderfs/.gitignore | 1
-rw-r--r-- samples/binderfs/Makefile | 4
-rw-r--r-- samples/binderfs/binderfs_example.c | 82
-rw-r--r-- samples/bpf/.gitignore | 54
-rw-r--r-- samples/bpf/Makefile | 329
-rw-r--r-- samples/bpf/Makefile.target | 75
-rw-r--r-- samples/bpf/README.rst | 105
-rw-r--r-- samples/bpf/asm_goto_workaround.h | 28
-rw-r--r-- samples/bpf/bpf_insn.h | 217
-rw-r--r-- samples/bpf/bpf_load.c | 667
-rw-r--r-- samples/bpf/bpf_load.h | 57
-rw-r--r-- samples/bpf/cookie_uid_helper_example.c | 323
-rw-r--r-- samples/bpf/cpustat_kern.c | 281
-rw-r--r-- samples/bpf/cpustat_user.c | 252
-rwxr-xr-x samples/bpf/do_hbm_test.sh | 442
-rw-r--r-- samples/bpf/fds_example.c | 193
-rw-r--r-- samples/bpf/hash_func01.h | 55
-rw-r--r-- samples/bpf/hbm.c | 499
-rw-r--r-- samples/bpf/hbm.h | 38
-rw-r--r-- samples/bpf/hbm_edt_kern.c | 168
-rw-r--r-- samples/bpf/hbm_kern.h | 217
-rw-r--r-- samples/bpf/hbm_out_kern.c | 179
-rw-r--r-- samples/bpf/ibumad_kern.c | 138
-rw-r--r-- samples/bpf/ibumad_user.c | 122
-rw-r--r-- samples/bpf/lathist_kern.c | 99
-rw-r--r-- samples/bpf/lathist_user.c | 130
-rwxr-xr-x samples/bpf/lwt_len_hist.sh | 40
-rw-r--r-- samples/bpf/lwt_len_hist_kern.c | 82
-rw-r--r-- samples/bpf/lwt_len_hist_user.c | 77
-rw-r--r-- samples/bpf/map_perf_test_kern.c | 291
-rw-r--r-- samples/bpf/map_perf_test_user.c | 507
-rw-r--r-- samples/bpf/offwaketime_kern.c | 157
-rw-r--r-- samples/bpf/offwaketime_user.c | 160
-rw-r--r-- samples/bpf/parse_ldabs.c | 43
-rw-r--r-- samples/bpf/parse_simple.c | 49
-rw-r--r-- samples/bpf/parse_varlen.c | 150
-rwxr-xr-x samples/bpf/run_cookie_uid_helper_example.sh | 15
-rw-r--r-- samples/bpf/sampleip_kern.c | 39
-rw-r--r-- samples/bpf/sampleip_user.c | 227
-rw-r--r-- samples/bpf/sock_example.c | 106
-rw-r--r-- samples/bpf/sock_example.h | 35
-rw-r--r-- samples/bpf/sock_flags_kern.c | 49
-rw-r--r-- samples/bpf/sockex1_kern.c | 30
-rw-r--r-- samples/bpf/sockex1_user.c | 54
-rw-r--r-- samples/bpf/sockex2_kern.c | 223
-rw-r--r-- samples/bpf/sockex2_user.c | 57
-rw-r--r-- samples/bpf/sockex3_kern.c | 293
-rw-r--r-- samples/bpf/sockex3_user.c | 106
-rw-r--r-- samples/bpf/spintest_kern.c | 69
-rw-r--r-- samples/bpf/spintest_user.c | 99
-rw-r--r-- samples/bpf/syscall_nrs.c | 19
-rw-r--r-- samples/bpf/syscall_tp_kern.c | 73
-rw-r--r-- samples/bpf/syscall_tp_user.c | 138
-rw-r--r-- samples/bpf/task_fd_query_kern.c | 19
-rw-r--r-- samples/bpf/task_fd_query_user.c | 383
-rwxr-xr-x samples/bpf/tc_l2_redirect.sh | 174
-rw-r--r-- samples/bpf/tc_l2_redirect_kern.c | 237
-rw-r--r-- samples/bpf/tc_l2_redirect_user.c | 70
-rw-r--r-- samples/bpf/tcbpf1_kern.c | 91
-rw-r--r-- samples/bpf/tcp_basertt_kern.c | 71
-rw-r--r-- samples/bpf/tcp_bpf.readme | 28
-rw-r--r-- samples/bpf/tcp_bufs_kern.c | 81
-rw-r--r-- samples/bpf/tcp_clamp_kern.c | 97
-rw-r--r-- samples/bpf/tcp_cong_kern.c | 78
-rw-r--r-- samples/bpf/tcp_dumpstats_kern.c | 68
-rw-r--r-- samples/bpf/tcp_iw_kern.c | 83
-rw-r--r-- samples/bpf/tcp_rwnd_kern.c | 64
-rw-r--r-- samples/bpf/tcp_synrto_kern.c | 64
-rw-r--r-- samples/bpf/tcp_tos_reflect_kern.c | 80
-rw-r--r-- samples/bpf/test_cgrp2_array_pin.c | 106
-rw-r--r-- samples/bpf/test_cgrp2_attach.c | 172
-rw-r--r-- samples/bpf/test_cgrp2_sock.c | 290
-rwxr-xr-x samples/bpf/test_cgrp2_sock.sh | 135
-rw-r--r-- samples/bpf/test_cgrp2_sock2.c | 68
-rwxr-xr-x samples/bpf/test_cgrp2_sock2.sh | 85
-rwxr-xr-x samples/bpf/test_cgrp2_tc.sh | 185
-rw-r--r-- samples/bpf/test_cgrp2_tc_kern.c | 70
-rwxr-xr-x samples/bpf/test_cls_bpf.sh | 38
-rw-r--r-- samples/bpf/test_current_task_under_cgroup_kern.c | 44
-rw-r--r-- samples/bpf/test_current_task_under_cgroup_user.c | 113
-rwxr-xr-x samples/bpf/test_ipip.sh | 179
-rw-r--r-- samples/bpf/test_lru_dist.c | 540
-rw-r--r-- samples/bpf/test_lwt_bpf.c | 253
-rwxr-xr-x samples/bpf/test_lwt_bpf.sh | 400
-rw-r--r-- samples/bpf/test_map_in_map_kern.c | 176
-rw-r--r-- samples/bpf/test_map_in_map_user.c | 173
-rw-r--r-- samples/bpf/test_overhead_kprobe_kern.c | 48
-rw-r--r-- samples/bpf/test_overhead_raw_tp_kern.c | 17
-rw-r--r-- samples/bpf/test_overhead_tp_kern.c | 36
-rw-r--r-- samples/bpf/test_overhead_user.c | 182
-rwxr-xr-x samples/bpf/test_override_return.sh | 16
-rw-r--r-- samples/bpf/test_probe_write_user_kern.c | 56
-rw-r--r-- samples/bpf/test_probe_write_user_user.c | 108
-rw-r--r-- samples/bpf/trace_common.h | 13
-rw-r--r-- samples/bpf/trace_event_kern.c | 80
-rw-r--r-- samples/bpf/trace_event_user.c | 354
-rw-r--r-- samples/bpf/trace_output_kern.c | 31
-rw-r--r-- samples/bpf/trace_output_user.c | 107
-rw-r--r-- samples/bpf/tracex1_kern.c | 54
-rw-r--r-- samples/bpf/tracex1_user.c | 50
-rw-r--r-- samples/bpf/tracex2_kern.c | 102
-rw-r--r-- samples/bpf/tracex2_user.c | 193
-rw-r--r-- samples/bpf/tracex3_kern.c | 90
-rw-r--r-- samples/bpf/tracex3_user.c | 190
-rw-r--r-- samples/bpf/tracex4_kern.c | 55
-rw-r--r-- samples/bpf/tracex4_user.c | 103
-rw-r--r-- samples/bpf/tracex5_kern.c | 93
-rw-r--r-- samples/bpf/tracex5_user.c | 101
-rw-r--r-- samples/bpf/tracex6_kern.c | 69
-rw-r--r-- samples/bpf/tracex6_user.c | 226
-rw-r--r-- samples/bpf/tracex7_kern.c | 16
-rw-r--r-- samples/bpf/tracex7_user.c | 56
-rw-r--r-- samples/bpf/xdp1_kern.c | 93
-rw-r--r-- samples/bpf/xdp1_user.c | 167
-rw-r--r-- samples/bpf/xdp2_kern.c | 114
-rwxr-xr-x samples/bpf/xdp2skb_meta.sh | 220
-rw-r--r-- samples/bpf/xdp2skb_meta_kern.c | 105
-rw-r--r-- samples/bpf/xdp_adjust_tail_kern.c | 155
-rw-r--r-- samples/bpf/xdp_adjust_tail_user.c | 198
-rw-r--r-- samples/bpf/xdp_fwd_kern.c | 158
-rw-r--r-- samples/bpf/xdp_fwd_user.c | 170
-rw-r--r-- samples/bpf/xdp_monitor_kern.c | 257
-rw-r--r-- samples/bpf/xdp_monitor_user.c | 792
-rw-r--r-- samples/bpf/xdp_redirect_cpu_kern.c | 730
-rw-r--r-- samples/bpf/xdp_redirect_cpu_user.c | 983
-rw-r--r-- samples/bpf/xdp_redirect_kern.c | 90
-rw-r--r-- samples/bpf/xdp_redirect_map_kern.c | 92
-rw-r--r-- samples/bpf/xdp_redirect_map_user.c | 222
-rw-r--r-- samples/bpf/xdp_redirect_user.c | 223
-rw-r--r-- samples/bpf/xdp_router_ipv4_kern.c | 186
-rw-r--r-- samples/bpf/xdp_router_ipv4_user.c | 741
-rw-r--r-- samples/bpf/xdp_rxq_info_kern.c | 140
-rw-r--r-- samples/bpf/xdp_rxq_info_user.c | 605
-rw-r--r-- samples/bpf/xdp_sample_pkts_kern.c | 57
-rw-r--r-- samples/bpf/xdp_sample_pkts_user.c | 202
-rw-r--r-- samples/bpf/xdp_tx_iptunnel_common.h | 34
-rw-r--r-- samples/bpf/xdp_tx_iptunnel_kern.c | 237
-rw-r--r-- samples/bpf/xdp_tx_iptunnel_user.c | 314
-rw-r--r-- samples/bpf/xdpsock.h | 11
-rw-r--r-- samples/bpf/xdpsock_kern.c | 24
-rw-r--r-- samples/bpf/xdpsock_user.c | 1550
-rw-r--r-- samples/bpf/xsk_fwd.c | 1085
-rw-r--r-- samples/configfs/Makefile | 3
-rw-r--r-- samples/configfs/configfs_sample.c | 369
-rw-r--r-- samples/connector/.gitignore | 2
-rw-r--r-- samples/connector/Makefile | 6
-rw-r--r-- samples/connector/cn_test.c | 188
-rw-r--r-- samples/connector/ucon.c | 236
-rw-r--r-- samples/ftrace/Makefile | 8
-rw-r--r-- samples/ftrace/ftrace-direct-modify.c | 97
-rw-r--r-- samples/ftrace/ftrace-direct-too.c | 57
-rw-r--r-- samples/ftrace/ftrace-direct.c | 50
-rw-r--r-- samples/ftrace/sample-trace-array.c | 143
-rw-r--r-- samples/ftrace/sample-trace-array.h | 84
-rw-r--r-- samples/hck/Makefile | 6
-rw-r--r-- samples/hck/call.c | 24
-rw-r--r-- samples/hck/register.c | 48
-rw-r--r-- samples/hck/register_one.c | 31
-rw-r--r-- samples/hidraw/.gitignore | 2
-rw-r--r-- samples/hidraw/Makefile | 4
-rw-r--r-- samples/hidraw/hid-example.c | 182
-rw-r--r-- samples/hw_breakpoint/Makefile | 2
-rw-r--r-- samples/hw_breakpoint/data_breakpoint.c | 82
-rw-r--r-- samples/kdb/Makefile | 2
-rw-r--r-- samples/kdb/kdb_hello.c | 60
-rw-r--r-- samples/kfifo/Makefile | 2
-rw-r--r-- samples/kfifo/bytestream-example.c | 195
-rw-r--r-- samples/kfifo/dma-example.c | 141
-rw-r--r-- samples/kfifo/inttype-example.c | 186
-rw-r--r-- samples/kfifo/record-example.c | 202
-rw-r--r-- samples/kmemleak/Makefile | 3
-rw-r--r-- samples/kmemleak/kmemleak-test.c | 99
-rw-r--r-- samples/kobject/Makefile | 2
-rw-r--r-- samples/kobject/kobject-example.c | 144
-rw-r--r-- samples/kobject/kset-example.c | 288
-rw-r--r-- samples/kprobes/Makefile | 6
-rw-r--r-- samples/kprobes/kprobe_example.c | 120
-rw-r--r-- samples/kprobes/kretprobe_example.c | 108
-rw-r--r-- samples/livepatch/Makefile | 8
-rw-r--r-- samples/livepatch/livepatch-callbacks-busymod.c | 60
-rw-r--r-- samples/livepatch/livepatch-callbacks-demo.c | 196
-rw-r--r-- samples/livepatch/livepatch-callbacks-mod.c | 41
-rw-r--r-- samples/livepatch/livepatch-sample.c | 70
-rw-r--r-- samples/livepatch/livepatch-shadow-fix1.c | 173
-rw-r--r-- samples/livepatch/livepatch-shadow-fix2.c | 132
-rw-r--r-- samples/livepatch/livepatch-shadow-mod.c | 217
-rw-r--r-- samples/mei/.gitignore | 2
-rw-r--r-- samples/mei/Makefile | 5
-rw-r--r-- samples/mei/mei-amt-version.c | 479
-rw-r--r-- samples/nitro_enclaves/.gitignore | 2
-rw-r--r-- samples/nitro_enclaves/Makefile | 16
-rw-r--r-- samples/nitro_enclaves/ne_ioctl_sample.c | 883
-rw-r--r-- samples/pidfd/.gitignore | 2
-rw-r--r-- samples/pidfd/Makefile | 4
-rw-r--r-- samples/pidfd/pidfd-metadata.c | 120
-rw-r--r-- samples/pktgen/README.rst | 46
-rw-r--r-- samples/pktgen/functions.sh | 334
-rw-r--r-- samples/pktgen/parameters.sh | 121
-rwxr-xr-x samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh | 105
-rwxr-xr-x samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh | 85
-rwxr-xr-x samples/pktgen/pktgen_sample01_simple.sh | 90
-rwxr-xr-x samples/pktgen/pktgen_sample02_multiqueue.sh | 95
-rwxr-xr-x samples/pktgen/pktgen_sample03_burst_single_flow.sh | 101
-rwxr-xr-x samples/pktgen/pktgen_sample04_many_flows.sh | 115
-rwxr-xr-x samples/pktgen/pktgen_sample05_flow_per_thread.sh | 99
-rwxr-xr-x samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh | 113
-rw-r--r-- samples/qmi/Makefile | 2
-rw-r--r-- samples/qmi/qmi_sample_client.c | 622
-rw-r--r-- samples/rpmsg/Makefile | 2
-rw-r--r-- samples/rpmsg/rpmsg_client_sample.c | 96
-rw-r--r-- samples/seccomp/.gitignore | 5
-rw-r--r-- samples/seccomp/Makefile | 6
-rw-r--r-- samples/seccomp/bpf-direct.c | 191
-rw-r--r-- samples/seccomp/bpf-fancy.c | 105
-rw-r--r-- samples/seccomp/bpf-helper.c | 96
-rw-r--r-- samples/seccomp/bpf-helper.h | 263
-rw-r--r-- samples/seccomp/dropper.c | 72
-rw-r--r-- samples/seccomp/user-trap.c | 375
-rw-r--r-- samples/timers/.gitignore | 2
-rw-r--r-- samples/timers/Makefile | 4
-rw-r--r-- samples/timers/hpet_example.c | 295
-rw-r--r-- samples/trace_events/Makefile | 15
-rw-r--r-- samples/trace_events/trace-events-sample.c | 140
-rw-r--r-- samples/trace_events/trace-events-sample.h | 524
-rw-r--r-- samples/trace_printk/Makefile | 7
-rw-r--r-- samples/trace_printk/trace-printk.c | 58
-rw-r--r-- samples/uhid/.gitignore | 2
-rw-r--r-- samples/uhid/Makefile | 4
-rw-r--r-- samples/uhid/uhid-example.c | 465
-rw-r--r-- samples/v4l/Makefile | 2
-rw-r--r-- samples/v4l/v4l2-pci-skeleton.c | 915
-rw-r--r-- samples/vfio-mdev/Makefile | 5
-rw-r--r-- samples/vfio-mdev/mbochs.c | 1485
-rw-r--r-- samples/vfio-mdev/mdpy-defs.h | 22
-rw-r--r-- samples/vfio-mdev/mdpy-fb.c | 243
-rw-r--r-- samples/vfio-mdev/mdpy.c | 807
-rw-r--r-- samples/vfio-mdev/mtty.c | 1491
-rw-r--r-- samples/vfs/.gitignore | 3
-rw-r--r-- samples/vfs/Makefile | 4
-rw-r--r-- samples/vfs/test-fsmount.c | 129
-rw-r--r-- samples/vfs/test-statx.c | 265
-rw-r--r-- samples/watch_queue/.gitignore | 1
-rw-r--r-- samples/watch_queue/Makefile | 4
-rw-r--r-- samples/watch_queue/watch_test.c | 186
-rw-r--r-- samples/watchdog/.gitignore | 2
-rw-r--r-- samples/watchdog/Makefile | 2
-rw-r--r-- samples/watchdog/watchdog-simple.c | 25
252 files changed, 41442 insertions, 0 deletions
diff --git a/samples/Kconfig b/samples/Kconfig
new file mode 100644
index 000000000..0dbd22e06
--- /dev/null
+++ b/samples/Kconfig
@@ -0,0 +1,242 @@
1# SPDX-License-Identifier: GPL-2.0-only
2menuconfig SAMPLES
3 bool "Sample kernel code"
4 help
5 You can build and test sample kernel code here.
6
7if SAMPLES
8
9config SAMPLE_AUXDISPLAY
10 bool "auxdisplay sample"
11 depends on CC_CAN_LINK
12
13config SAMPLE_TRACE_EVENTS
14 tristate "Build trace_events examples -- loadable modules only"
15 depends on EVENT_TRACING && m
16 help
17 This builds trace event example modules.
18
19config SAMPLE_TRACE_PRINTK
20 tristate "Build trace_printk module - tests various trace_printk formats"
21 depends on EVENT_TRACING && m
22 help
23 This builds a module that calls trace_printk() and can be used to
24 test various trace_printk() calls from a module.
25
26config SAMPLE_FTRACE_DIRECT
27 tristate "Build register_ftrace_direct() example"
28 depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m
29 depends on X86_64 # has x86_64 inlined asm
30 help
31 This builds an ftrace direct function example
32 that hooks to wake_up_process and prints the parameters.
33
34config SAMPLE_TRACE_ARRAY
35 tristate "Build sample module for kernel access to Ftrace instancess"
36 depends on EVENT_TRACING && m
37 help
38 This builds a module that demonstrates the use of various APIs to
39 access Ftrace instances from within the kernel.
40
41config SAMPLE_KOBJECT
42 tristate "Build kobject examples"
43 help
44 This config option will allow you to build a number of
45 different kobject sample modules showing how to use kobjects,
46 ksets, and ktypes properly.
47
48 If in doubt, say "N" here.
49
50config SAMPLE_KPROBES
51 tristate "Build kprobes examples -- loadable modules only"
52 depends on KPROBES && m
53 help
54 This builds several kprobes example modules.
55
56config SAMPLE_KRETPROBES
57 tristate "Build kretprobes example -- loadable modules only"
58 default m
59 depends on SAMPLE_KPROBES && KRETPROBES
60
61config SAMPLE_HW_BREAKPOINT
62 tristate "Build kernel hardware breakpoint examples -- loadable module only"
63 depends on HAVE_HW_BREAKPOINT && m
64 help
65 This builds kernel hardware breakpoint example modules.
66
67config SAMPLE_KFIFO
68 tristate "Build kfifo examples -- loadable modules only"
69 depends on m
70 help
71 This config option will allow you to build a number of
72 different kfifo sample modules showing how to use the
73 generic kfifo API.
74
75 If in doubt, say "N" here.
76
77config SAMPLE_KDB
78 tristate "Build kdb command example -- loadable modules only"
79 depends on KGDB_KDB && m
80 help
81 Build an example of how to dynamically add the hello
82 command to the kdb shell.
83
84config SAMPLE_QMI_CLIENT
85 tristate "Build qmi client sample -- loadable modules only"
86 depends on m
87 depends on ARCH_QCOM
88 depends on NET
89 select QCOM_QMI_HELPERS
90 help
91 Build a QMI client sample driver, which demonstrates how to
92 communicate with a remote QRTR service, using QMI encoded messages.
93
94config SAMPLE_RPMSG_CLIENT
95 tristate "Build rpmsg client sample -- loadable modules only"
96 depends on RPMSG && m
97 help
98 Build an rpmsg client sample driver, which demonstrates how
99 to communicate with an AMP-configured remote processor over
100 the rpmsg bus.
101
102config SAMPLE_LIVEPATCH
103 tristate "Build live patching samples -- loadable modules only"
104 depends on LIVEPATCH && m
105 help
106 Build sample live patch demonstrations.
107
108config SAMPLE_CONFIGFS
109 tristate "Build configfs patching sample -- loadable modules only"
110 depends on CONFIGFS_FS && m
111 help
112 Builds a sample configfs interface.
113
114config SAMPLE_CONNECTOR
115 tristate "Build connector sample -- loadable modules only"
116 depends on CONNECTOR && HEADERS_INSTALL && m
117 help
118 When enabled, this builds both a sample kernel module for
119 the connector interface and a user space tool to communicate
120 with it.
121 See also Documentation/driver-api/connector.rst
122
123config SAMPLE_HIDRAW
124 bool "hidraw sample"
125 depends on CC_CAN_LINK && HEADERS_INSTALL
126
127config SAMPLE_PIDFD
128 bool "pidfd sample"
129 depends on CC_CAN_LINK && HEADERS_INSTALL
130
131config SAMPLE_SECCOMP
132 bool "Build seccomp sample code"
133 depends on SECCOMP_FILTER && CC_CAN_LINK && HEADERS_INSTALL
134 help
135 Build samples of seccomp filters using various methods of
136 BPF filter construction.
137
138config SAMPLE_TIMER
139 bool "Timer sample"
140 depends on CC_CAN_LINK && HEADERS_INSTALL
141
142config SAMPLE_UHID
143 bool "UHID sample"
144 depends on CC_CAN_LINK && HEADERS_INSTALL
145 help
146 Build UHID sample program.
147
148config SAMPLE_VFIO_MDEV_MTTY
149 tristate "Build VFIO mtty example mediated device sample code -- loadable modules only"
150 depends on VFIO_MDEV_DEVICE && m
151 help
152 Build a virtual tty sample driver for use as a VFIO
153 mediated device
154
155config SAMPLE_VFIO_MDEV_MDPY
156 tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only"
157 depends on VFIO_MDEV_DEVICE && m
158 help
159 Build a virtual display sample driver for use as a VFIO
160 mediated device. It is a simple framebuffer and supports
161 the region display interface (VFIO_GFX_PLANE_TYPE_REGION).
162
163config SAMPLE_VFIO_MDEV_MDPY_FB
164 tristate "Build VFIO mdpy example guest fbdev driver -- loadable module only"
165 depends on FB && m
166 select FB_CFB_FILLRECT
167 select FB_CFB_COPYAREA
168 select FB_CFB_IMAGEBLIT
169 help
170 Guest fbdev driver for the virtual display sample driver.
171
172config SAMPLE_VFIO_MDEV_MBOCHS
173 tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only"
174 depends on VFIO_MDEV_DEVICE && m
175 select DMA_SHARED_BUFFER
176 help
177 Build a virtual display sample driver for use as a VFIO
178 mediated device. It supports the region display interface
179 (VFIO_GFX_PLANE_TYPE_DMABUF).
180 Emulate enough of qemu stdvga to make bochs-drm.ko happy.
181 That is basically the vram memory bar and the bochs dispi
182 interface vbe registers in the mmio register bar.
183 Specifically it does *not* include any legacy vga stuff.
184 Device looks a lot like "qemu -device secondary-vga".
185
186config SAMPLE_ANDROID_BINDERFS
187 bool "Build Android binderfs example"
188 depends on CC_CAN_LINK && HEADERS_INSTALL
189 help
190 Builds a sample program to illustrate the use of the Android binderfs
191 filesystem.
192
193config SAMPLE_VFS
194 bool "Build example programs that use new VFS system calls"
195 depends on CC_CAN_LINK && HEADERS_INSTALL
196 help
197 Build example userspace programs that use new VFS system calls such
198 as mount API and statx(). Note that this is restricted to the x86
199 arch whilst it accesses system calls that aren't yet in all arches.
200
201config SAMPLE_INTEL_MEI
202 bool "Build example program working with intel mei driver"
203 depends on INTEL_MEI
204 depends on CC_CAN_LINK && HEADERS_INSTALL
205 help
206 Build a sample program to work with mei device.
207
208config SAMPLE_WATCHDOG
209 bool "watchdog sample"
210 depends on CC_CAN_LINK
211
212config SAMPLE_WATCH_QUEUE
213 bool "Build example watch_queue notification API consumer"
214 depends on CC_CAN_LINK && HEADERS_INSTALL
215 help
216 Build example userspace program to use the new mount_notify(),
217 sb_notify() syscalls and the KEYCTL_WATCH_KEY keyctl() function.
218
219config SAMPLE_HCK
220 bool "HCK sample"
221 help
222 HCK sample
223
224config SAMPLE_HCK_CALL
225 bool "HCK call sample"
226 depends on SAMPLE_HCK
227 help
228 HCK call sample
229
230config SAMPLE_HCK_REGISTER
231 bool "HCK register sample"
232 depends on SAMPLE_HCK
233 help
234 HCK register sample
235
236config SAMPLE_HCK_REGISTER_ONE
237 bool "HCK register one interface sample"
238 depends on SAMPLE_HCK
239 help
240 HCK register sample
241
242endif # SAMPLES
diff --git a/samples/Makefile b/samples/Makefile
new file mode 100644
index 000000000..e002c114a
--- /dev/null
+++ b/samples/Makefile
@@ -0,0 +1,32 @@
1# SPDX-License-Identifier: GPL-2.0
2# Makefile for Linux samples code
3
4subdir-$(CONFIG_SAMPLE_AUXDISPLAY) += auxdisplay
5subdir-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs
6obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs/
7obj-$(CONFIG_SAMPLE_CONNECTOR) += connector/
8subdir-$(CONFIG_SAMPLE_HIDRAW) += hidraw
9obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += hw_breakpoint/
10obj-$(CONFIG_SAMPLE_KDB) += kdb/
11obj-$(CONFIG_SAMPLE_KFIFO) += kfifo/
12obj-$(CONFIG_SAMPLE_KOBJECT) += kobject/
13obj-$(CONFIG_SAMPLE_KPROBES) += kprobes/
14obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch/
15subdir-$(CONFIG_SAMPLE_PIDFD) += pidfd
16obj-$(CONFIG_SAMPLE_QMI_CLIENT) += qmi/
17obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg/
18subdir-$(CONFIG_SAMPLE_SECCOMP) += seccomp
19subdir-$(CONFIG_SAMPLE_TIMER) += timers
20obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace_events/
21obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/
22obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/
23obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += ftrace/
24subdir-$(CONFIG_SAMPLE_UHID) += uhid
25obj-$(CONFIG_VIDEO_PCI_SKELETON) += v4l/
26obj-y += vfio-mdev/
27subdir-$(CONFIG_SAMPLE_VFS) += vfs
28obj-$(CONFIG_SAMPLE_INTEL_MEI) += mei/
29subdir-$(CONFIG_SAMPLE_WATCHDOG) += watchdog
30subdir-$(CONFIG_SAMPLE_WATCH_QUEUE) += watch_queue
31obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak/
32obj-$(CONFIG_SAMPLE_HCK) += hck/
diff --git a/samples/auxdisplay/.gitignore b/samples/auxdisplay/.gitignore
new file mode 100644
index 000000000..2ed744c0e
--- /dev/null
+++ b/samples/auxdisplay/.gitignore
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0-only
2cfag12864b-example
diff --git a/samples/auxdisplay/Makefile b/samples/auxdisplay/Makefile
new file mode 100644
index 000000000..19d556893
--- /dev/null
+++ b/samples/auxdisplay/Makefile
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0
2userprogs-always-y += cfag12864b-example
diff --git a/samples/auxdisplay/cfag12864b-example.c b/samples/auxdisplay/cfag12864b-example.c
new file mode 100644
index 000000000..bfeab44f8
--- /dev/null
+++ b/samples/auxdisplay/cfag12864b-example.c
@@ -0,0 +1,267 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Filename: cfag12864b-example.c
4 * Version: 0.1.0
5 * Description: cfag12864b LCD userspace example program
6 *
7 * Author: Copyright (C) Miguel Ojeda Sandonis
8 * Date: 2006-10-31
9 */
10
11/*
12 * ------------------------
13 * start of cfag12864b code
14 * ------------------------
15 */
16
17#include <string.h>
18#include <fcntl.h>
19#include <unistd.h>
20#include <sys/types.h>
21#include <sys/stat.h>
22#include <sys/mman.h>
23
24#define CFAG12864B_WIDTH (128)
25#define CFAG12864B_HEIGHT (64)
26#define CFAG12864B_SIZE (128 * 64 / 8)
27#define CFAG12864B_BPB (8)
28#define CFAG12864B_ADDRESS(x, y) ((y) * CFAG12864B_WIDTH / \
29 CFAG12864B_BPB + (x) / CFAG12864B_BPB)
30#define CFAG12864B_BIT(n) (((unsigned char) 1) << (n))
31
32#undef CFAG12864B_DOCHECK
33#ifdef CFAG12864B_DOCHECK
34 #define CFAG12864B_CHECK(x, y) ((x) < CFAG12864B_WIDTH && \
35 (y) < CFAG12864B_HEIGHT)
36#else
37 #define CFAG12864B_CHECK(x, y) (1)
38#endif
39
40int cfag12864b_fd;
41unsigned char * cfag12864b_mem;
42unsigned char cfag12864b_buffer[CFAG12864B_SIZE];
43
44/*
45 * init a cfag12864b framebuffer device
46 *
47 * No error: return = 0
48 * Unable to open: return = -1
49 * Unable to mmap: return = -2
50 */
51static int cfag12864b_init(char *path)
52{
53 cfag12864b_fd = open(path, O_RDWR);
54 if (cfag12864b_fd == -1)
55 return -1;
56
57 cfag12864b_mem = mmap(0, CFAG12864B_SIZE, PROT_READ | PROT_WRITE,
58 MAP_SHARED, cfag12864b_fd, 0);
59 if (cfag12864b_mem == MAP_FAILED) {
60 close(cfag12864b_fd);
61 return -2;
62 }
63
64 return 0;
65}
66
67/*
68 * exit a cfag12864b framebuffer device
69 */
70static void cfag12864b_exit(void)
71{
72 munmap(cfag12864b_mem, CFAG12864B_SIZE);
73 close(cfag12864b_fd);
74}
75
76/*
77 * set (x, y) pixel
78 */
79static void cfag12864b_set(unsigned char x, unsigned char y)
80{
81 if (CFAG12864B_CHECK(x, y))
82 cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] |=
83 CFAG12864B_BIT(x % CFAG12864B_BPB);
84}
85
86/*
87 * unset (x, y) pixel
88 */
89static void cfag12864b_unset(unsigned char x, unsigned char y)
90{
91 if (CFAG12864B_CHECK(x, y))
92 cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] &=
93 ~CFAG12864B_BIT(x % CFAG12864B_BPB);
94}
95
96/*
97 * is set (x, y) pixel?
98 *
99 * Pixel off: return = 0
100 * Pixel on: return = 1
101 */
102static unsigned char cfag12864b_isset(unsigned char x, unsigned char y)
103{
104 if (CFAG12864B_CHECK(x, y))
105 if (cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] &
106 CFAG12864B_BIT(x % CFAG12864B_BPB))
107 return 1;
108
109 return 0;
110}
111
112/*
113 * not (x, y) pixel
114 */
115static void cfag12864b_not(unsigned char x, unsigned char y)
116{
117 if (cfag12864b_isset(x, y))
118 cfag12864b_unset(x, y);
119 else
120 cfag12864b_set(x, y);
121}
122
123/*
124 * fill (set all pixels)
125 */
126static void cfag12864b_fill(void)
127{
128 unsigned short i;
129
130 for (i = 0; i < CFAG12864B_SIZE; i++)
131 cfag12864b_buffer[i] = 0xFF;
132}
133
134/*
135 * clear (unset all pixels)
136 */
137static void cfag12864b_clear(void)
138{
139 unsigned short i;
140
141 for (i = 0; i < CFAG12864B_SIZE; i++)
142 cfag12864b_buffer[i] = 0;
143}
144
145/*
146 * format a [128*64] matrix
147 *
148 * Pixel off: src[i] = 0
149 * Pixel on: src[i] > 0
150 */
151static void cfag12864b_format(unsigned char * matrix)
152{
153 unsigned char i, j, n;
154
155 for (i = 0; i < CFAG12864B_HEIGHT; i++)
156 for (j = 0; j < CFAG12864B_WIDTH / CFAG12864B_BPB; j++) {
157 cfag12864b_buffer[i * CFAG12864B_WIDTH / CFAG12864B_BPB +
158 j] = 0;
159 for (n = 0; n < CFAG12864B_BPB; n++)
160 if (matrix[i * CFAG12864B_WIDTH +
161 j * CFAG12864B_BPB + n])
162 cfag12864b_buffer[i * CFAG12864B_WIDTH /
163 CFAG12864B_BPB + j] |=
164 CFAG12864B_BIT(n);
165 }
166}
167
168/*
169 * blit buffer to lcd
170 */
171static void cfag12864b_blit(void)
172{
173 memcpy(cfag12864b_mem, cfag12864b_buffer, CFAG12864B_SIZE);
174}
175
176/*
177 * ----------------------
178 * end of cfag12864b code
179 * ----------------------
180 */
181
182#include <stdio.h>
183
184#define EXAMPLES 6
185
186static void example(unsigned char n)
187{
188 unsigned short i, j;
189 unsigned char matrix[CFAG12864B_WIDTH * CFAG12864B_HEIGHT];
190
191 if (n > EXAMPLES)
192 return;
193
194 printf("Example %i/%i - ", n, EXAMPLES);
195
196 switch (n) {
197 case 1:
198 printf("Draw points setting bits");
199 cfag12864b_clear();
200 for (i = 0; i < CFAG12864B_WIDTH; i += 2)
201 for (j = 0; j < CFAG12864B_HEIGHT; j += 2)
202 cfag12864b_set(i, j);
203 break;
204
205 case 2:
206 printf("Clear the LCD");
207 cfag12864b_clear();
208 break;
209
210 case 3:
211 printf("Draw rows formatting a [128*64] matrix");
212 memset(matrix, 0, CFAG12864B_WIDTH * CFAG12864B_HEIGHT);
213 for (i = 0; i < CFAG12864B_WIDTH; i++)
214 for (j = 0; j < CFAG12864B_HEIGHT; j += 2)
215 matrix[j * CFAG12864B_WIDTH + i] = 1;
216 cfag12864b_format(matrix);
217 break;
218
219 case 4:
220 printf("Fill the lcd");
221 cfag12864b_fill();
222 break;
223
224 case 5:
225 printf("Draw columns unsetting bits");
226 for (i = 0; i < CFAG12864B_WIDTH; i += 2)
227 for (j = 0; j < CFAG12864B_HEIGHT; j++)
228 cfag12864b_unset(i, j);
229 break;
230
231 case 6:
232 printf("Do negative not-ing all bits");
233 for (i = 0; i < CFAG12864B_WIDTH; i++)
234 for (j = 0; j < CFAG12864B_HEIGHT; j++)
235 cfag12864b_not(i, j);
236 break;
237 }
238
239 puts(" - [Press Enter]");
240}
241
242int main(int argc, char *argv[])
243{
244 unsigned char n;
245
246 if (argc != 2) {
247 printf(
248 "Syntax: %s fbdev\n"
249 "Usually: /dev/fb0, /dev/fb1...\n", argv[0]);
250 return -1;
251 }
252
253 if (cfag12864b_init(argv[1])) {
254 printf("Can't init %s fbdev\n", argv[1]);
255 return -2;
256 }
257
258 for (n = 1; n <= EXAMPLES; n++) {
259 example(n);
260 cfag12864b_blit();
261 while (getchar() != '\n');
262 }
263
264 cfag12864b_exit();
265
266 return 0;
267}
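
For reference, a minimal standalone sketch (not part of this commit) of the
addressing math the CFAG12864B_ADDRESS and CFAG12864B_BIT macros above
implement: the 128x64 framebuffer packs 8 horizontal pixels per byte, so
pixel (x, y) lives in byte y * 128/8 + x/8, at bit x % 8. The WIDTH and BPB
names below simply mirror the sample's CFAG12864B_WIDTH and CFAG12864B_BPB
constants.

	#include <stdio.h>

	#define WIDTH 128	/* CFAG12864B_WIDTH */
	#define BPB 8		/* CFAG12864B_BPB: pixels per byte */

	int main(void)
	{
		/* same computation as CFAG12864B_ADDRESS(x, y) / CFAG12864B_BIT() */
		unsigned int x = 10, y = 3;
		unsigned int byte = y * WIDTH / BPB + x / BPB;	/* 3 * 16 + 1 = 49 */
		unsigned int bit = x % BPB;			/* 10 % 8 = 2 */

		printf("pixel (%u,%u) -> buffer[%u], bit %u\n", x, y, byte, bit);
		return 0;
	}

Setting that bit in cfag12864b_buffer[byte] and then blitting is exactly what
cfag12864b_set() followed by cfag12864b_blit() does in the example above.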
diff --git a/samples/binderfs/.gitignore b/samples/binderfs/.gitignore
new file mode 100644
index 000000000..eb60241e8
--- /dev/null
+++ b/samples/binderfs/.gitignore
@@ -0,0 +1 @@
binderfs_example
diff --git a/samples/binderfs/Makefile b/samples/binderfs/Makefile
new file mode 100644
index 000000000..629e43b9b
--- /dev/null
+++ b/samples/binderfs/Makefile
@@ -0,0 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
2userprogs-always-y += binderfs_example
3
4userccflags += -I usr/include
diff --git a/samples/binderfs/binderfs_example.c b/samples/binderfs/binderfs_example.c
new file mode 100644
index 000000000..0fd92cdda
--- /dev/null
+++ b/samples/binderfs/binderfs_example.c
@@ -0,0 +1,82 @@
1// SPDX-License-Identifier: GPL-2.0
2
3#define _GNU_SOURCE
4#include <errno.h>
5#include <fcntl.h>
6#include <sched.h>
7#include <stdio.h>
8#include <stdlib.h>
9#include <string.h>
10#include <sys/ioctl.h>
11#include <sys/mount.h>
12#include <sys/stat.h>
13#include <sys/types.h>
14#include <unistd.h>
15#include <linux/android/binder.h>
16#include <linux/android/binderfs.h>
17
18int main(int argc, char *argv[])
19{
20 int fd, ret, saved_errno;
21 struct binderfs_device device = { 0 };
22
23 ret = unshare(CLONE_NEWNS);
24 if (ret < 0) {
25 fprintf(stderr, "%s - Failed to unshare mount namespace\n",
26 strerror(errno));
27 exit(EXIT_FAILURE);
28 }
29
30 ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0);
31 if (ret < 0) {
32 fprintf(stderr, "%s - Failed to mount / as private\n",
33 strerror(errno));
34 exit(EXIT_FAILURE);
35 }
36
37 ret = mkdir("/dev/binderfs", 0755);
38 if (ret < 0 && errno != EEXIST) {
39 fprintf(stderr, "%s - Failed to create binderfs mountpoint\n",
40 strerror(errno));
41 exit(EXIT_FAILURE);
42 }
43
44 ret = mount(NULL, "/dev/binderfs", "binder", 0, 0);
45 if (ret < 0) {
46 fprintf(stderr, "%s - Failed to mount binderfs\n",
47 strerror(errno));
48 exit(EXIT_FAILURE);
49 }
50
51 memcpy(device.name, "my-binder", strlen("my-binder"));
52
53 fd = open("/dev/binderfs/binder-control", O_RDONLY | O_CLOEXEC);
54 if (fd < 0) {
55 fprintf(stderr, "%s - Failed to open binder-control device\n",
56 strerror(errno));
57 exit(EXIT_FAILURE);
58 }
59
60 ret = ioctl(fd, BINDER_CTL_ADD, &device);
61 saved_errno = errno;
62 close(fd);
63 errno = saved_errno;
64 if (ret < 0) {
65 fprintf(stderr, "%s - Failed to allocate new binder device\n",
66 strerror(errno));
67 exit(EXIT_FAILURE);
68 }
69
70 printf("Allocated new binder device with major %d, minor %d, and name %s\n",
71 device.major, device.minor, device.name);
72
73 ret = unlink("/dev/binderfs/my-binder");
74 if (ret < 0) {
75 fprintf(stderr, "%s - Failed to delete binder device\n",
76 strerror(errno));
77 exit(EXIT_FAILURE);
78 }
79
80 /* Cleanup happens when the mount namespace dies. */
81 exit(EXIT_SUCCESS);
82}
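
A hedged follow-on sketch (not part of this commit): before the unlink() step
above, the freshly allocated device can be sanity-checked by opening it and
querying the driver's protocol version with the BINDER_VERSION ioctl. The
path assumes the "my-binder" device name used by the example.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/android/binder.h>

	int main(void)
	{
		struct binder_version version = { 0 };
		int fd = open("/dev/binderfs/my-binder", O_RDWR | O_CLOEXEC);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (ioctl(fd, BINDER_VERSION, &version) < 0) {
			perror("ioctl");
			close(fd);
			return 1;
		}
		printf("binder protocol version %d\n", version.protocol_version);
		close(fd);
		return 0;
	}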
diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore
new file mode 100644
index 000000000..b2f29bc8d
--- /dev/null
+++ b/samples/bpf/.gitignore
@@ -0,0 +1,54 @@
1# SPDX-License-Identifier: GPL-2.0-only
2cpustat
3fds_example
4hbm
5ibumad
6lathist
7lwt_len_hist
8map_perf_test
9offwaketime
10per_socket_stats_example
11sampleip
12sock_example
13sockex1
14sockex2
15sockex3
16spintest
17syscall_nrs.h
18syscall_tp
19task_fd_query
20tc_l2_redirect
21test_cgrp2_array_pin
22test_cgrp2_attach
23test_cgrp2_attach2
24test_cgrp2_sock
25test_cgrp2_sock2
26test_current_task_under_cgroup
27test_lru_dist
28test_map_in_map
29test_overhead
30test_probe_write_user
31trace_event
32trace_output
33tracex1
34tracex2
35tracex3
36tracex4
37tracex5
38tracex6
39tracex7
40xdp1
41xdp2
42xdp_adjust_tail
43xdp_fwd
44xdp_monitor
45xdp_redirect
46xdp_redirect_cpu
47xdp_redirect_map
48xdp_router_ipv4
49xdp_rxq_info
50xdp_sample_pkts
51xdp_tx_iptunnel
52xdpsock
53xsk_fwd
54testfile.img
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
new file mode 100644
index 000000000..aeebf5d12
--- /dev/null
+++ b/samples/bpf/Makefile
@@ -0,0 +1,329 @@
1# SPDX-License-Identifier: GPL-2.0
2
3BPF_SAMPLES_PATH ?= $(abspath $(srctree)/$(src))
4TOOLS_PATH := $(BPF_SAMPLES_PATH)/../../tools
5
6# List of programs to build
7tprogs-y := test_lru_dist
8tprogs-y += sock_example
9tprogs-y += fds_example
10tprogs-y += sockex1
11tprogs-y += sockex2
12tprogs-y += sockex3
13tprogs-y += tracex1
14tprogs-y += tracex2
15tprogs-y += tracex3
16tprogs-y += tracex4
17tprogs-y += tracex5
18tprogs-y += tracex6
19tprogs-y += tracex7
20tprogs-y += test_probe_write_user
21tprogs-y += trace_output
22tprogs-y += lathist
23tprogs-y += offwaketime
24tprogs-y += spintest
25tprogs-y += map_perf_test
26tprogs-y += test_overhead
27tprogs-y += test_cgrp2_array_pin
28tprogs-y += test_cgrp2_attach
29tprogs-y += test_cgrp2_sock
30tprogs-y += test_cgrp2_sock2
31tprogs-y += xdp1
32tprogs-y += xdp2
33tprogs-y += xdp_router_ipv4
34tprogs-y += test_current_task_under_cgroup
35tprogs-y += trace_event
36tprogs-y += sampleip
37tprogs-y += tc_l2_redirect
38tprogs-y += lwt_len_hist
39tprogs-y += xdp_tx_iptunnel
40tprogs-y += test_map_in_map
41tprogs-y += per_socket_stats_example
42tprogs-y += xdp_redirect
43tprogs-y += xdp_redirect_map
44tprogs-y += xdp_redirect_cpu
45tprogs-y += xdp_monitor
46tprogs-y += xdp_rxq_info
47tprogs-y += syscall_tp
48tprogs-y += cpustat
49tprogs-y += xdp_adjust_tail
50tprogs-y += xdpsock
51tprogs-y += xsk_fwd
52tprogs-y += xdp_fwd
53tprogs-y += task_fd_query
54tprogs-y += xdp_sample_pkts
55tprogs-y += ibumad
56tprogs-y += hbm
57
58# Libbpf dependencies
59LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
60
61CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
62TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
63
64fds_example-objs := fds_example.o
65sockex1-objs := sockex1_user.o
66sockex2-objs := sockex2_user.o
67sockex3-objs := sockex3_user.o
68tracex1-objs := tracex1_user.o $(TRACE_HELPERS)
69tracex2-objs := tracex2_user.o
70tracex3-objs := tracex3_user.o
71tracex4-objs := tracex4_user.o
72tracex5-objs := tracex5_user.o $(TRACE_HELPERS)
73tracex6-objs := tracex6_user.o
74tracex7-objs := tracex7_user.o
75test_probe_write_user-objs := test_probe_write_user_user.o
76trace_output-objs := trace_output_user.o $(TRACE_HELPERS)
77lathist-objs := lathist_user.o
78offwaketime-objs := offwaketime_user.o $(TRACE_HELPERS)
79spintest-objs := spintest_user.o $(TRACE_HELPERS)
80map_perf_test-objs := map_perf_test_user.o
81test_overhead-objs := bpf_load.o test_overhead_user.o
82test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o
83test_cgrp2_attach-objs := test_cgrp2_attach.o
84test_cgrp2_sock-objs := test_cgrp2_sock.o
85test_cgrp2_sock2-objs := bpf_load.o test_cgrp2_sock2.o
86xdp1-objs := xdp1_user.o
87# reuse xdp1 source intentionally
88xdp2-objs := xdp1_user.o
89xdp_router_ipv4-objs := xdp_router_ipv4_user.o
90test_current_task_under_cgroup-objs := $(CGROUP_HELPERS) \
91 test_current_task_under_cgroup_user.o
92trace_event-objs := trace_event_user.o $(TRACE_HELPERS)
93sampleip-objs := sampleip_user.o $(TRACE_HELPERS)
94tc_l2_redirect-objs := bpf_load.o tc_l2_redirect_user.o
95lwt_len_hist-objs := bpf_load.o lwt_len_hist_user.o
96xdp_tx_iptunnel-objs := xdp_tx_iptunnel_user.o
97test_map_in_map-objs := test_map_in_map_user.o
98per_socket_stats_example-objs := cookie_uid_helper_example.o
99xdp_redirect-objs := xdp_redirect_user.o
100xdp_redirect_map-objs := xdp_redirect_map_user.o
101xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o
102xdp_monitor-objs := xdp_monitor_user.o
103xdp_rxq_info-objs := xdp_rxq_info_user.o
104syscall_tp-objs := syscall_tp_user.o
105cpustat-objs := cpustat_user.o
106xdp_adjust_tail-objs := xdp_adjust_tail_user.o
107xdpsock-objs := xdpsock_user.o
108xsk_fwd-objs := xsk_fwd.o
109xdp_fwd-objs := xdp_fwd_user.o
110task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
111xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
112ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS)
113hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
114
115# Tell kbuild to always build the programs
116always-y := $(tprogs-y)
117always-y += sockex1_kern.o
118always-y += sockex2_kern.o
119always-y += sockex3_kern.o
120always-y += tracex1_kern.o
121always-y += tracex2_kern.o
122always-y += tracex3_kern.o
123always-y += tracex4_kern.o
124always-y += tracex5_kern.o
125always-y += tracex6_kern.o
126always-y += tracex7_kern.o
127always-y += sock_flags_kern.o
128always-y += test_probe_write_user_kern.o
129always-y += trace_output_kern.o
130always-y += tcbpf1_kern.o
131always-y += tc_l2_redirect_kern.o
132always-y += lathist_kern.o
133always-y += offwaketime_kern.o
134always-y += spintest_kern.o
135always-y += map_perf_test_kern.o
136always-y += test_overhead_tp_kern.o
137always-y += test_overhead_raw_tp_kern.o
138always-y += test_overhead_kprobe_kern.o
139always-y += parse_varlen.o parse_simple.o parse_ldabs.o
140always-y += test_cgrp2_tc_kern.o
141always-y += xdp1_kern.o
142always-y += xdp2_kern.o
143always-y += xdp_router_ipv4_kern.o
144always-y += test_current_task_under_cgroup_kern.o
145always-y += trace_event_kern.o
146always-y += sampleip_kern.o
147always-y += lwt_len_hist_kern.o
148always-y += xdp_tx_iptunnel_kern.o
149always-y += test_map_in_map_kern.o
150always-y += tcp_synrto_kern.o
151always-y += tcp_rwnd_kern.o
152always-y += tcp_bufs_kern.o
153always-y += tcp_cong_kern.o
154always-y += tcp_iw_kern.o
155always-y += tcp_clamp_kern.o
156always-y += tcp_basertt_kern.o
157always-y += tcp_tos_reflect_kern.o
158always-y += tcp_dumpstats_kern.o
159always-y += xdp_redirect_kern.o
160always-y += xdp_redirect_map_kern.o
161always-y += xdp_redirect_cpu_kern.o
162always-y += xdp_monitor_kern.o
163always-y += xdp_rxq_info_kern.o
164always-y += xdp2skb_meta_kern.o
165always-y += syscall_tp_kern.o
166always-y += cpustat_kern.o
167always-y += xdp_adjust_tail_kern.o
168always-y += xdp_fwd_kern.o
169always-y += task_fd_query_kern.o
170always-y += xdp_sample_pkts_kern.o
171always-y += ibumad_kern.o
172always-y += hbm_out_kern.o
173always-y += hbm_edt_kern.o
174always-y += xdpsock_kern.o
175
176ifeq ($(ARCH), arm)
177# Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux
178# headers when arm instruction set identification is requested.
179ARM_ARCH_SELECTOR := $(filter -D__LINUX_ARM_ARCH__%, $(KBUILD_CFLAGS))
180BPF_EXTRA_CFLAGS := $(ARM_ARCH_SELECTOR)
181TPROGS_CFLAGS += $(ARM_ARCH_SELECTOR)
182endif
183
184TPROGS_CFLAGS += -Wall -O2
185TPROGS_CFLAGS += -Wmissing-prototypes
186TPROGS_CFLAGS += -Wstrict-prototypes
187
188TPROGS_CFLAGS += -I$(objtree)/usr/include
189TPROGS_CFLAGS += -I$(srctree)/tools/testing/selftests/bpf/
190TPROGS_CFLAGS += -I$(srctree)/tools/lib/
191TPROGS_CFLAGS += -I$(srctree)/tools/include
192TPROGS_CFLAGS += -I$(srctree)/tools/perf
193TPROGS_CFLAGS += -DHAVE_ATTR_TEST=0
194
195ifdef SYSROOT
196TPROGS_CFLAGS += --sysroot=$(SYSROOT)
197TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib
198endif
199
200TPROGCFLAGS_bpf_load.o += -Wno-unused-variable
201
202TPROGS_LDLIBS += $(LIBBPF) -lelf -lz
203TPROGLDLIBS_tracex4 += -lrt
204TPROGLDLIBS_trace_output += -lrt
205TPROGLDLIBS_map_perf_test += -lrt
206TPROGLDLIBS_test_overhead += -lrt
207TPROGLDLIBS_xdpsock += -pthread
208TPROGLDLIBS_xsk_fwd += -pthread
209
210# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
211# make M=samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
212LLC ?= llc
213CLANG ?= clang
214OPT ?= opt
215LLVM_DIS ?= llvm-dis
216LLVM_OBJCOPY ?= llvm-objcopy
217BTF_PAHOLE ?= pahole
218
219# Detect that we're cross compiling and use the cross compiler
220ifdef CROSS_COMPILE
221CLANG_ARCH_ARGS = --target=$(notdir $(CROSS_COMPILE:%-=%))
222endif
223
224# Don't evaluate probes and warnings if we need to run make recursively
225ifneq ($(src),)
226HDR_PROBE := $(shell printf "\#include <linux/types.h>\n struct list_head { int a; }; int main() { return 0; }" | \
227 $(CC) $(TPROGS_CFLAGS) $(TPROGS_LDFLAGS) -x c - \
228 -o /dev/null 2>/dev/null && echo okay)
229
230ifeq ($(HDR_PROBE),)
231$(warning WARNING: Detected possible issues with include path.)
232$(warning WARNING: Please install kernel headers locally (make headers_install).)
233endif
234
235BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris)
236BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
237BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm')
238BTF_LLVM_PROBE := $(shell echo "int main() { return 0; }" | \
239 $(CLANG) -target bpf -O2 -g -c -x c - -o ./llvm_btf_verify.o; \
240 readelf -S ./llvm_btf_verify.o | grep BTF; \
241 /bin/rm -f ./llvm_btf_verify.o)
242
243BPF_EXTRA_CFLAGS += -fno-stack-protector
244ifneq ($(BTF_LLVM_PROBE),)
245 BPF_EXTRA_CFLAGS += -g
246else
247ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),)
248 BPF_EXTRA_CFLAGS += -g
249 LLC_FLAGS += -mattr=dwarfris
250 DWARF2BTF = y
251endif
252endif
253endif
254
255# Trick to allow make to be run from this directory
256all:
257 $(MAKE) -C ../../ M=$(CURDIR) BPF_SAMPLES_PATH=$(CURDIR)
258
259clean:
260 $(MAKE) -C ../../ M=$(CURDIR) clean
261 @find $(CURDIR) -type f -name '*~' -delete
262
263$(LIBBPF): FORCE
264# Fix up variables inherited from Kbuild that tools/ build system won't like
265 $(MAKE) -C $(dir $@) RM='rm -rf' EXTRA_CFLAGS="$(TPROGS_CFLAGS)" \
266 LDFLAGS=$(TPROGS_LDFLAGS) srctree=$(BPF_SAMPLES_PATH)/../../ O=
267
268$(obj)/syscall_nrs.h: $(obj)/syscall_nrs.s FORCE
269 $(call filechk,offsets,__SYSCALL_NRS_H__)
270
271targets += syscall_nrs.s
272clean-files += syscall_nrs.h
273
274FORCE:
275
276
277# Verify LLVM compiler tools are available and bpf target is supported by llc
278.PHONY: verify_cmds verify_target_bpf $(CLANG) $(LLC)
279
280verify_cmds: $(CLANG) $(LLC)
281 @for TOOL in $^ ; do \
282 if ! (which -- "$${TOOL}" > /dev/null 2>&1); then \
283 echo "*** ERROR: Cannot find LLVM tool $${TOOL}" ;\
284 exit 1; \
285 else true; fi; \
286 done
287
288verify_target_bpf: verify_cmds
289 @if ! (${LLC} -march=bpf -mattr=help > /dev/null 2>&1); then \
290 echo "*** ERROR: LLVM (${LLC}) does not support 'bpf' target" ;\
291 echo " NOTICE: LLVM version >= 3.7.1 required" ;\
292 exit 2; \
293 else true; fi
294
295$(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF)
296$(src)/*.c: verify_target_bpf $(LIBBPF)
297
298$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
299$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
300$(obj)/hbm.o: $(src)/hbm.h
301$(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
302
303-include $(BPF_SAMPLES_PATH)/Makefile.target
304
305# asm/sysreg.h - inline assembly used by it is incompatible with llvm.
306# But, there is no easy way to fix it, so just exclude it since it is
307# useless for BPF samples.
308# below we use long chain of commands, clang | opt | llvm-dis | llc,
309# to generate final object file. 'clang' compiles the source into IR
310# with native target, e.g., x64, arm64, etc. 'opt' does bpf CORE IR builtin
311# processing (llvm12) and IR optimizations. 'llvm-dis' converts
312# 'opt' output to IR, and finally 'llc' generates bpf byte code.
313$(obj)/%.o: $(src)/%.c
314 @echo " CLANG-bpf " $@
315 $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \
316 -I$(obj) -I$(srctree)/tools/testing/selftests/bpf/ \
317 -I$(srctree)/tools/lib/ \
318 -D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \
319 -D__TARGET_ARCH_$(SRCARCH) -Wno-compare-distinct-pointer-types \
320 -Wno-gnu-variable-sized-type-not-at-end \
321 -Wno-address-of-packed-member -Wno-tautological-compare \
322 -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \
323 -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \
324 -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \
325 $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \
326 $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@
327ifeq ($(DWARF2BTF),y)
328 $(BTF_PAHOLE) -J $@
329endif
diff --git a/samples/bpf/Makefile.target b/samples/bpf/Makefile.target
new file mode 100644
index 000000000..7621f55e2
--- /dev/null
+++ b/samples/bpf/Makefile.target
@@ -0,0 +1,75 @@
1# SPDX-License-Identifier: GPL-2.0
2# ==========================================================================
3# Building binaries on the host system
4# Binaries are not used during the compilation of the kernel; they are intended
5# to be built for the target board (which can of course be the host). Added to
6# build binaries that do not have to run on the host system.
7#
8# Sample syntax
9# tprogs-y := xsk_example
10# Will compile xsk_example.c and create an executable named xsk_example
11#
12# tprogs-y := xdpsock
13# xdpsock-objs := xdpsock_1.o xdpsock_2.o
14# Will compile xdpsock_1.c and xdpsock_2.c, and then link the executable
15# xdpsock, based on xdpsock_1.o and xdpsock_2.o
16#
17# Derived from scripts/Makefile.host
18#
19__tprogs := $(sort $(tprogs-y))
20
21# C code
22# Executables compiled from a single .c file
23tprog-csingle := $(foreach m,$(__tprogs), \
24 $(if $($(m)-objs),,$(m)))
25
26# C executables linked based on several .o files
27tprog-cmulti := $(foreach m,$(__tprogs),\
28 $(if $($(m)-objs),$(m)))
29
30# Object (.o) files compiled from .c files
31tprog-cobjs := $(sort $(foreach m,$(__tprogs),$($(m)-objs)))
32
33tprog-csingle := $(addprefix $(obj)/,$(tprog-csingle))
34tprog-cmulti := $(addprefix $(obj)/,$(tprog-cmulti))
35tprog-cobjs := $(addprefix $(obj)/,$(tprog-cobjs))
36
37#####
38# Handle options to gcc. Support building with separate output directory
39
40_tprogc_flags = $(TPROGS_CFLAGS) \
41 $(TPROGCFLAGS_$(basetarget).o)
42
43# $(objtree)/$(obj) for including generated headers from checkin source files
44ifeq ($(KBUILD_EXTMOD),)
45ifdef building_out_of_srctree
46_tprogc_flags += -I $(objtree)/$(obj)
47endif
48endif
49
50tprogc_flags = -Wp,-MD,$(depfile) $(_tprogc_flags)
51
52# Create executable from a single .c file
53# tprog-csingle -> Executable
54quiet_cmd_tprog-csingle = CC $@
55 cmd_tprog-csingle = $(CC) $(tprogc_flags) $(TPROGS_LDFLAGS) -o $@ $< \
56 $(TPROGS_LDLIBS) $(TPROGLDLIBS_$(@F))
57$(tprog-csingle): $(obj)/%: $(src)/%.c FORCE
58 $(call if_changed_dep,tprog-csingle)
59
60# Link an executable based on list of .o files, all plain c
61# tprog-cmulti -> executable
62quiet_cmd_tprog-cmulti = LD $@
63 cmd_tprog-cmulti = $(CC) $(tprogc_flags) $(TPROGS_LDFLAGS) -o $@ \
64 $(addprefix $(obj)/,$($(@F)-objs)) \
65 $(TPROGS_LDLIBS) $(TPROGLDLIBS_$(@F))
66$(tprog-cmulti): $(tprog-cobjs) FORCE
67 $(call if_changed,tprog-cmulti)
68$(call multi_depend, $(tprog-cmulti), , -objs)
69
70# Create .o file from a single .c file
71# tprog-cobjs -> .o
72quiet_cmd_tprog-cobjs = CC $@
73 cmd_tprog-cobjs = $(CC) $(tprogc_flags) -c -o $@ $<
74$(tprog-cobjs): $(obj)/%.o: $(src)/%.c FORCE
75 $(call if_changed_dep,tprog-cobjs)
diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst
new file mode 100644
index 000000000..dd34b2d26
--- /dev/null
+++ b/samples/bpf/README.rst
@@ -0,0 +1,105 @@
1eBPF sample programs
2====================
3
4This directory contains test stubs, a verifier test-suite and examples
5for using eBPF. The examples use libbpf from tools/lib/bpf.
6
7Build dependencies
8==================
9
10Compiling requires having installed:
11 * clang >= version 3.4.0
12 * llvm >= version 3.7.1
13
14Note that LLVM's tool 'llc' must support target 'bpf', list version
15and supported targets with command: ``llc --version``
16
17Clean and configuration
18-----------------------
19
20It may be necessary to clean tools, samples or the kernel before trying a
21new arch or after some changes (on demand)::
22
23 make -C tools clean
24 make -C samples/bpf clean
25 make clean
26
27Configure kernel, defconfig for instance::
28
29 make defconfig
30
31Kernel headers
32--------------
33
34There are usually dependencies on header files of the current kernel.
35To avoid installing devel kernel headers system wide, as a normal
36user, simply call::
37
38 make headers_install
39
40This will create a local "usr/include" directory in the git/build top
41level directory, which the make system automatically picks up first.
42
43Compiling
44=========
45
46For building the BPF samples, issue the below command from the kernel
47top level directory::
48
49 make M=samples/bpf
50
51It is also possible to call make from this directory. This will just
52hide the invocation of make as above.
53
54Manually compiling LLVM with 'bpf' support
55------------------------------------------
56
57Since version 3.7.0, LLVM adds a proper LLVM backend target for the
58BPF bytecode architecture.
59
60By default llvm will build all non-experimental backends including bpf.
61To generate a smaller llc binary one can use::
62
63 -DLLVM_TARGETS_TO_BUILD="BPF"
64
65Quick snippet for manually compiling LLVM and clang
66(build dependencies are cmake and gcc-c++)::
67
68 $ git clone http://llvm.org/git/llvm.git
69 $ cd llvm/tools
70 $ git clone --depth 1 http://llvm.org/git/clang.git
71 $ cd ..; mkdir build; cd build
72 $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86"
73 $ make -j $(getconf _NPROCESSORS_ONLN)
74
75It is also possible to point make to the newly compiled 'llc' or
76'clang' command via redefining LLC or CLANG on the make command line::
77
78 make M=samples/bpf LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
79
80Cross compiling samples
81-----------------------
82In order to cross-compile, say for arm64 targets, export CROSS_COMPILE and ARCH
83environment variables before calling make. But do this before the clean,
84configuration and header install steps described above. This will direct make to
85build samples for the cross target::
86
87 export ARCH=arm64
88 export CROSS_COMPILE="aarch64-linux-gnu-"
89
90Headers can also be installed on the RFS of the target board if they need to be
91kept in sync (not required; this also creates a local "usr/include" directory)::
92
93 make INSTALL_HDR_PATH=~/some_sysroot/usr headers_install
94
95Pointing to LLC and CLANG is not necessary if they are installed on the HOST
96and support the appropriate arm64 target (usually several arches are supported).
97Build samples::
98
99 make M=samples/bpf
100
101Or build samples with SYSROOT if some header or library, say libelf, is
102absent from the toolchain, by providing the path to a file system containing
103the headers and libs; this can be the RFS of the target board::
104
105 make M=samples/bpf SYSROOT=~/some_sysroot
diff --git a/samples/bpf/asm_goto_workaround.h b/samples/bpf/asm_goto_workaround.h
new file mode 100644
index 000000000..7048bb359
--- /dev/null
+++ b/samples/bpf/asm_goto_workaround.h
@@ -0,0 +1,28 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/* Copyright (c) 2019 Facebook */
3#ifndef __ASM_GOTO_WORKAROUND_H
4#define __ASM_GOTO_WORKAROUND_H
5
6/*
7 * This will bring in asm_volatile_goto and asm_inline macro definitions
8 * if enabled by compiler and config options.
9 */
10#include <linux/types.h>
11
12#ifdef asm_volatile_goto
13#undef asm_volatile_goto
14#define asm_volatile_goto(x...) asm volatile("invalid use of asm_volatile_goto")
15#endif
16
17/*
18 * asm_inline is defined as asm __inline in "include/linux/compiler_types.h"
19 * if supported by the kernel's CC (i.e CONFIG_CC_HAS_ASM_INLINE) which is not
20 * supported by CLANG.
21 */
22#ifdef asm_inline
23#undef asm_inline
24#define asm_inline asm
25#endif
26
27#define volatile(x...) volatile("")
28#endif
diff --git a/samples/bpf/bpf_insn.h b/samples/bpf/bpf_insn.h
new file mode 100644
index 000000000..544237980
--- /dev/null
+++ b/samples/bpf/bpf_insn.h
@@ -0,0 +1,217 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/* eBPF instruction mini library */
3#ifndef __BPF_INSN_H
4#define __BPF_INSN_H
5
6struct bpf_insn;
7
8/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
9
10#define BPF_ALU64_REG(OP, DST, SRC) \
11 ((struct bpf_insn) { \
12 .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \
13 .dst_reg = DST, \
14 .src_reg = SRC, \
15 .off = 0, \
16 .imm = 0 })
17
18#define BPF_ALU32_REG(OP, DST, SRC) \
19 ((struct bpf_insn) { \
20 .code = BPF_ALU | BPF_OP(OP) | BPF_X, \
21 .dst_reg = DST, \
22 .src_reg = SRC, \
23 .off = 0, \
24 .imm = 0 })
25
26/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
27
28#define BPF_ALU64_IMM(OP, DST, IMM) \
29 ((struct bpf_insn) { \
30 .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \
31 .dst_reg = DST, \
32 .src_reg = 0, \
33 .off = 0, \
34 .imm = IMM })
35
36#define BPF_ALU32_IMM(OP, DST, IMM) \
37 ((struct bpf_insn) { \
38 .code = BPF_ALU | BPF_OP(OP) | BPF_K, \
39 .dst_reg = DST, \
40 .src_reg = 0, \
41 .off = 0, \
42 .imm = IMM })
43
44/* Short form of mov, dst_reg = src_reg */
45
46#define BPF_MOV64_REG(DST, SRC) \
47 ((struct bpf_insn) { \
48 .code = BPF_ALU64 | BPF_MOV | BPF_X, \
49 .dst_reg = DST, \
50 .src_reg = SRC, \
51 .off = 0, \
52 .imm = 0 })
53
54#define BPF_MOV32_REG(DST, SRC) \
55 ((struct bpf_insn) { \
56 .code = BPF_ALU | BPF_MOV | BPF_X, \
57 .dst_reg = DST, \
58 .src_reg = SRC, \
59 .off = 0, \
60 .imm = 0 })
61
62/* Short form of mov, dst_reg = imm32 */
63
64#define BPF_MOV64_IMM(DST, IMM) \
65 ((struct bpf_insn) { \
66 .code = BPF_ALU64 | BPF_MOV | BPF_K, \
67 .dst_reg = DST, \
68 .src_reg = 0, \
69 .off = 0, \
70 .imm = IMM })
71
72#define BPF_MOV32_IMM(DST, IMM) \
73 ((struct bpf_insn) { \
74 .code = BPF_ALU | BPF_MOV | BPF_K, \
75 .dst_reg = DST, \
76 .src_reg = 0, \
77 .off = 0, \
78 .imm = IMM })
79
80/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
81#define BPF_LD_IMM64(DST, IMM) \
82 BPF_LD_IMM64_RAW(DST, 0, IMM)
83
84#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
85 ((struct bpf_insn) { \
86 .code = BPF_LD | BPF_DW | BPF_IMM, \
87 .dst_reg = DST, \
88 .src_reg = SRC, \
89 .off = 0, \
90 .imm = (__u32) (IMM) }), \
91 ((struct bpf_insn) { \
92 .code = 0, /* zero is reserved opcode */ \
93 .dst_reg = 0, \
94 .src_reg = 0, \
95 .off = 0, \
96 .imm = ((__u64) (IMM)) >> 32 })
97
98#ifndef BPF_PSEUDO_MAP_FD
99# define BPF_PSEUDO_MAP_FD 1
100#endif
101
102/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
103#define BPF_LD_MAP_FD(DST, MAP_FD) \
104 BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
105
106
107/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */
108
109#define BPF_LD_ABS(SIZE, IMM) \
110 ((struct bpf_insn) { \
111 .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \
112 .dst_reg = 0, \
113 .src_reg = 0, \
114 .off = 0, \
115 .imm = IMM })
116
117/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
118
119#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
120 ((struct bpf_insn) { \
121 .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \
122 .dst_reg = DST, \
123 .src_reg = SRC, \
124 .off = OFF, \
125 .imm = 0 })
126
127/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
128
129#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \
130 ((struct bpf_insn) { \
131 .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \
132 .dst_reg = DST, \
133 .src_reg = SRC, \
134 .off = OFF, \
135 .imm = 0 })
136
137/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */
138
139#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \
140 ((struct bpf_insn) { \
141 .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \
142 .dst_reg = DST, \
143 .src_reg = SRC, \
144 .off = OFF, \
145 .imm = 0 })
146
147/* Memory store, *(uint *) (dst_reg + off16) = imm32 */
148
149#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
150 ((struct bpf_insn) { \
151 .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \
152 .dst_reg = DST, \
153 .src_reg = 0, \
154 .off = OFF, \
155 .imm = IMM })
156
157/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
158
159#define BPF_JMP_REG(OP, DST, SRC, OFF) \
160 ((struct bpf_insn) { \
161 .code = BPF_JMP | BPF_OP(OP) | BPF_X, \
162 .dst_reg = DST, \
163 .src_reg = SRC, \
164 .off = OFF, \
165 .imm = 0 })
166
167/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */
168
169#define BPF_JMP32_REG(OP, DST, SRC, OFF) \
170 ((struct bpf_insn) { \
171 .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \
172 .dst_reg = DST, \
173 .src_reg = SRC, \
174 .off = OFF, \
175 .imm = 0 })
176
177/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
178
179#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
180 ((struct bpf_insn) { \
181 .code = BPF_JMP | BPF_OP(OP) | BPF_K, \
182 .dst_reg = DST, \
183 .src_reg = 0, \
184 .off = OFF, \
185 .imm = IMM })
186
187/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */
188
189#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \
190 ((struct bpf_insn) { \
191 .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \
192 .dst_reg = DST, \
193 .src_reg = 0, \
194 .off = OFF, \
195 .imm = IMM })
196
197/* Raw code statement block */
198
199#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \
200 ((struct bpf_insn) { \
201 .code = CODE, \
202 .dst_reg = DST, \
203 .src_reg = SRC, \
204 .off = OFF, \
205 .imm = IMM })
206
207/* Program exit */
208
209#define BPF_EXIT_INSN() \
210 ((struct bpf_insn) { \
211 .code = BPF_JMP | BPF_EXIT, \
212 .dst_reg = 0, \
213 .src_reg = 0, \
214 .off = 0, \
215 .imm = 0 })
216
217#endif
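
For orientation, a minimal sketch (not part of this commit) of how these
macros compose into a complete eBPF program: the two instructions below
encode "r0 = 0; exit", i.e. a socket filter that tells the kernel to keep 0
bytes of every packet, loaded through the same libbpf bpf_load_program()
helper the samples link against.

	#include <stdio.h>
	#include <unistd.h>
	#include <linux/bpf.h>
	#include <bpf/bpf.h>
	#include "bpf_insn.h"

	int main(void)
	{
		struct bpf_insn prog[] = {
			BPF_MOV64_IMM(BPF_REG_0, 0),	/* r0 = 0: keep 0 bytes */
			BPF_EXIT_INSN(),		/* return r0 */
		};
		int fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog,
					  sizeof(prog) / sizeof(prog[0]),
					  "GPL", 0, NULL, 0);

		if (fd < 0) {
			perror("bpf_load_program");
			return 1;
		}
		printf("loaded, prog fd %d\n", fd);
		close(fd);
		return 0;
	}

Attaching the returned fd to a socket is then a
setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &fd, sizeof(fd)) call, as
sock_example.c in this directory does.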
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
new file mode 100644
index 000000000..c5ad528f0
--- /dev/null
+++ b/samples/bpf/bpf_load.c
@@ -0,0 +1,667 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <stdio.h>
3#include <sys/types.h>
4#include <sys/stat.h>
5#include <fcntl.h>
6#include <libelf.h>
7#include <gelf.h>
8#include <errno.h>
9#include <unistd.h>
10#include <string.h>
11#include <stdbool.h>
12#include <stdlib.h>
13#include <linux/bpf.h>
14#include <linux/filter.h>
15#include <linux/perf_event.h>
16#include <linux/netlink.h>
17#include <linux/rtnetlink.h>
18#include <linux/types.h>
19#include <sys/socket.h>
20#include <sys/syscall.h>
21#include <sys/ioctl.h>
22#include <sys/mman.h>
23#include <poll.h>
24#include <ctype.h>
25#include <assert.h>
26#include <bpf/bpf.h>
27#include "bpf_load.h"
28#include "perf-sys.h"
29
30#define DEBUGFS "/sys/kernel/debug/tracing/"
31
32static char license[128];
33static int kern_version;
34static bool processed_sec[128];
35char bpf_log_buf[BPF_LOG_BUF_SIZE];
36int map_fd[MAX_MAPS];
37int prog_fd[MAX_PROGS];
38int event_fd[MAX_PROGS];
39int prog_cnt;
40int prog_array_fd = -1;
41
42struct bpf_map_data map_data[MAX_MAPS];
43int map_data_count;
44
45static int populate_prog_array(const char *event, int prog_fd)
46{
47 int ind = atoi(event), err;
48
49 err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY);
50 if (err < 0) {
51 printf("failed to store prog_fd in prog_array\n");
52 return -1;
53 }
54 return 0;
55}
56
57static int write_kprobe_events(const char *val)
58{
59 int fd, ret, flags;
60
61 if (val == NULL)
62 return -1;
63 else if (val[0] == '\0')
64 flags = O_WRONLY | O_TRUNC;
65 else
66 flags = O_WRONLY | O_APPEND;
67
68 fd = open(DEBUGFS "kprobe_events", flags);
69
70 ret = write(fd, val, strlen(val));
71 close(fd);
72
73 return ret;
74}
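/* A hedged usage note: strings passed to write_kprobe_events() follow
 * the tracefs kprobe_events syntax that load_and_attach() builds below;
 * the event name here is illustrative:
 *
 *	write_kprobe_events("p:sys_write sys_write");	add a kprobe
 *	write_kprobe_events("");			clear all kprobes
 */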
75
76static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
77{
78 bool is_socket = strncmp(event, "socket", 6) == 0;
79 bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
80 bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
81 bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
82 bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0;
83 bool is_xdp = strncmp(event, "xdp", 3) == 0;
84 bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
85 bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
86 bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
87 bool is_sockops = strncmp(event, "sockops", 7) == 0;
88 bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
89 bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0;
90 size_t insns_cnt = size / sizeof(struct bpf_insn);
91 enum bpf_prog_type prog_type;
92 char buf[256];
93 int fd, efd, err, id;
94 struct perf_event_attr attr = {};
95
96 attr.type = PERF_TYPE_TRACEPOINT;
97 attr.sample_type = PERF_SAMPLE_RAW;
98 attr.sample_period = 1;
99 attr.wakeup_events = 1;
100
101 if (is_socket) {
102 prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
103 } else if (is_kprobe || is_kretprobe) {
104 prog_type = BPF_PROG_TYPE_KPROBE;
105 } else if (is_tracepoint) {
106 prog_type = BPF_PROG_TYPE_TRACEPOINT;
107 } else if (is_raw_tracepoint) {
108 prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT;
109 } else if (is_xdp) {
110 prog_type = BPF_PROG_TYPE_XDP;
111 } else if (is_perf_event) {
112 prog_type = BPF_PROG_TYPE_PERF_EVENT;
113 } else if (is_cgroup_skb) {
114 prog_type = BPF_PROG_TYPE_CGROUP_SKB;
115 } else if (is_cgroup_sk) {
116 prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
117 } else if (is_sockops) {
118 prog_type = BPF_PROG_TYPE_SOCK_OPS;
119 } else if (is_sk_skb) {
120 prog_type = BPF_PROG_TYPE_SK_SKB;
121 } else if (is_sk_msg) {
122 prog_type = BPF_PROG_TYPE_SK_MSG;
123 } else {
124 printf("Unknown event '%s'\n", event);
125 return -1;
126 }
127
128 if (prog_cnt == MAX_PROGS)
129 return -1;
130
131 fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version,
132 bpf_log_buf, BPF_LOG_BUF_SIZE);
133 if (fd < 0) {
134 printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf);
135 return -1;
136 }
137
138 prog_fd[prog_cnt++] = fd;
139
140 if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
141 return 0;
142
143 if (is_socket || is_sockops || is_sk_skb || is_sk_msg) {
144 if (is_socket)
145 event += 6;
146 else
147 event += 7;
148 if (*event != '/')
149 return 0;
150 event++;
151 if (!isdigit(*event)) {
152 printf("invalid prog number\n");
153 return -1;
154 }
155 return populate_prog_array(event, fd);
156 }
157
158 if (is_raw_tracepoint) {
159 efd = bpf_raw_tracepoint_open(event + 15, fd);
160 if (efd < 0) {
161 printf("tracepoint %s %s\n", event + 15, strerror(errno));
162 return -1;
163 }
164 event_fd[prog_cnt - 1] = efd;
165 return 0;
166 }
167
168 if (is_kprobe || is_kretprobe) {
169 bool need_normal_check = true;
170 const char *event_prefix = "";
171
172 if (is_kprobe)
173 event += 7;
174 else
175 event += 10;
176
177 if (*event == 0) {
178 printf("event name cannot be empty\n");
179 return -1;
180 }
181
182 if (isdigit(*event))
183 return populate_prog_array(event, fd);
184
185#ifdef __x86_64__
186 if (strncmp(event, "sys_", 4) == 0) {
187 snprintf(buf, sizeof(buf), "%c:__x64_%s __x64_%s",
188 is_kprobe ? 'p' : 'r', event, event);
189 err = write_kprobe_events(buf);
190 if (err >= 0) {
191 need_normal_check = false;
192 event_prefix = "__x64_";
193 }
194 }
195#endif
196 if (need_normal_check) {
197 snprintf(buf, sizeof(buf), "%c:%s %s",
198 is_kprobe ? 'p' : 'r', event, event);
199 err = write_kprobe_events(buf);
200 if (err < 0) {
201 printf("failed to create kprobe '%s' error '%s'\n",
202 event, strerror(errno));
203 return -1;
204 }
205 }
206
207 strcpy(buf, DEBUGFS);
208 strcat(buf, "events/kprobes/");
209 strcat(buf, event_prefix);
210 strcat(buf, event);
211 strcat(buf, "/id");
212 } else if (is_tracepoint) {
213 event += 11;
214
215 if (*event == 0) {
216 printf("event name cannot be empty\n");
217 return -1;
218 }
219 strcpy(buf, DEBUGFS);
220 strcat(buf, "events/");
221 strcat(buf, event);
222 strcat(buf, "/id");
223 }
224
225 efd = open(buf, O_RDONLY, 0);
226 if (efd < 0) {
227 printf("failed to open event %s\n", event);
228 return -1;
229 }
230
231 err = read(efd, buf, sizeof(buf));
232 if (err < 0 || err >= sizeof(buf)) {
233 printf("read from '%s' failed '%s'\n", event, strerror(errno));
234 return -1;
235 }
236
237 close(efd);
238
239 buf[err] = 0;
240 id = atoi(buf);
241 attr.config = id;
242
243 efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
244 if (efd < 0) {
245 printf("event %d fd %d err %s\n", id, efd, strerror(errno));
246 return -1;
247 }
248 event_fd[prog_cnt - 1] = efd;
249 err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
250 if (err < 0) {
251 printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n",
252 strerror(errno));
253 return -1;
254 }
255 err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
256 if (err < 0) {
257 printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n",
258 strerror(errno));
259 return -1;
260 }
261
262 return 0;
263}
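/* A hedged illustration (program names assumed): load_and_attach()
 * dispatches on the ELF section name, so a matching *_kern.c annotates
 * its programs accordingly, e.g.:
 *
 *	SEC("kprobe/sys_write")
 *	int trace_write(struct pt_regs *ctx) { return 0; }
 *
 *	SEC("tracepoint/syscalls/sys_enter_openat")
 *	int trace_openat(void *ctx) { return 0; }
 */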
264
265static int load_maps(struct bpf_map_data *maps, int nr_maps,
266 fixup_map_cb fixup_map)
267{
268 int i, numa_node;
269
270 for (i = 0; i < nr_maps; i++) {
271 if (fixup_map) {
272 fixup_map(&maps[i], i);
273 /* Allow userspace to assign map FD prior to creation */
274 if (maps[i].fd != -1) {
275 map_fd[i] = maps[i].fd;
276 continue;
277 }
278 }
279
280 numa_node = maps[i].def.map_flags & BPF_F_NUMA_NODE ?
281 maps[i].def.numa_node : -1;
282
283 if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
284 maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) {
285 int inner_map_fd = map_fd[maps[i].def.inner_map_idx];
286
287 map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type,
288 maps[i].name,
289 maps[i].def.key_size,
290 inner_map_fd,
291 maps[i].def.max_entries,
292 maps[i].def.map_flags,
293 numa_node);
294 } else {
295 map_fd[i] = bpf_create_map_node(maps[i].def.type,
296 maps[i].name,
297 maps[i].def.key_size,
298 maps[i].def.value_size,
299 maps[i].def.max_entries,
300 maps[i].def.map_flags,
301 numa_node);
302 }
303 if (map_fd[i] < 0) {
304 printf("failed to create map %d (%s): %d %s\n",
305 i, maps[i].name, errno, strerror(errno));
306 return 1;
307 }
308 maps[i].fd = map_fd[i];
309
310 if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY)
311 prog_array_fd = map_fd[i];
312 }
313 return 0;
314}
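/* A hedged sketch of what load_maps() consumes (names illustrative,
 * and the kernel-side struct name may differ): the object file places
 * entries laid out like struct bpf_load_map_def (see bpf_load.h) in
 * its "maps" ELF section:
 *
 *	struct bpf_load_map_def SEC("maps") my_map = {
 *		.type = BPF_MAP_TYPE_HASH,
 *		.key_size = sizeof(u32),
 *		.value_size = sizeof(u64),
 *		.max_entries = 256,
 *	};
 */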
315
316static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
317 GElf_Shdr *shdr, Elf_Data **data)
318{
319 Elf_Scn *scn;
320
321 scn = elf_getscn(elf, i);
322 if (!scn)
323 return 1;
324
325 if (gelf_getshdr(scn, shdr) != shdr)
326 return 2;
327
328 *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
329 if (!*shname || !shdr->sh_size)
330 return 3;
331
332 *data = elf_getdata(scn, 0);
333 if (!*data || elf_getdata(scn, *data) != NULL)
334 return 4;
335
336 return 0;
337}
338
339static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
340 GElf_Shdr *shdr, struct bpf_insn *insn,
341 struct bpf_map_data *maps, int nr_maps)
342{
343 int i, nrels;
344
345 nrels = shdr->sh_size / shdr->sh_entsize;
346
347 for (i = 0; i < nrels; i++) {
348 GElf_Sym sym;
349 GElf_Rel rel;
350 unsigned int insn_idx;
351 bool match = false;
352 int j, map_idx;
353
354 gelf_getrel(data, i, &rel);
355
356 insn_idx = rel.r_offset / sizeof(struct bpf_insn);
357
358 gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym);
359
360 if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) {
361 printf("invalid relo for insn[%d].code 0x%x\n",
362 insn_idx, insn[insn_idx].code);
363 return 1;
364 }
365 insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
366
367 /* Match FD relocation against recorded map_data[] offset */
368 for (map_idx = 0; map_idx < nr_maps; map_idx++) {
369 if (maps[map_idx].elf_offset == sym.st_value) {
370 match = true;
371 break;
372 }
373 }
374 if (match) {
375 insn[insn_idx].imm = maps[map_idx].fd;
376 } else {
377 printf("invalid relo for insn[%d] no map_data match\n",
378 insn_idx);
379 return 1;
380 }
381 }
382
383 return 0;
384}
385
386static int cmp_symbols(const void *l, const void *r)
387{
388 const GElf_Sym *lsym = (const GElf_Sym *)l;
389 const GElf_Sym *rsym = (const GElf_Sym *)r;
390
391 if (lsym->st_value < rsym->st_value)
392 return -1;
393 else if (lsym->st_value > rsym->st_value)
394 return 1;
395 else
396 return 0;
397}
398
399static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
400 Elf *elf, Elf_Data *symbols, int strtabidx)
401{
402 int map_sz_elf, map_sz_copy;
403 bool validate_zero = false;
404 Elf_Data *data_maps;
405 int i, nr_maps;
406 GElf_Sym *sym;
407 Elf_Scn *scn;
408 int copy_sz;
409
410 if (maps_shndx < 0)
411 return -EINVAL;
412 if (!symbols)
413 return -EINVAL;
414
415 /* Get data for maps section via elf index */
416 scn = elf_getscn(elf, maps_shndx);
417 if (scn)
418 data_maps = elf_getdata(scn, NULL);
419 if (!scn || !data_maps) {
420 printf("Failed to get Elf_Data from maps section %d\n",
421 maps_shndx);
422 return -EINVAL;
423 }
424
425	/* For each map get corresponding symbol table entry */
426 sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym));
427 for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
428 assert(nr_maps < MAX_MAPS+1);
429 if (!gelf_getsym(symbols, i, &sym[nr_maps]))
430 continue;
431 if (sym[nr_maps].st_shndx != maps_shndx)
432 continue;
433		/* Only count symbols from the maps section */
434 nr_maps++;
435 }
436
437 /* Align to map_fd[] order, via sort on offset in sym.st_value */
438 qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols);
439
440 /* Keeping compatible with ELF maps section changes
441 * ------------------------------------------------
442	 * The size of struct bpf_load_map_def is known to the loader
443	 * code, but the struct stored in the ELF file can differ.
444	 *
445	 * Unfortunately sym[i].st_size is zero. To calculate the
446	 * struct size stored in the ELF file, assume all structs have
447	 * the same size, and simply divide by the number of map
448 * symbols.
449 */
450 map_sz_elf = data_maps->d_size / nr_maps;
451 map_sz_copy = sizeof(struct bpf_load_map_def);
452 if (map_sz_elf < map_sz_copy) {
453 /*
454 * Backward compat, loading older ELF file with
455 * smaller struct, keeping remaining bytes zero.
456 */
457 map_sz_copy = map_sz_elf;
458 } else if (map_sz_elf > map_sz_copy) {
459 /*
460 * Forward compat, loading newer ELF file with larger
461 * struct with unknown features. Assume zero means
462 * feature not used. Thus, validate rest of struct
463 * data is zero.
464 */
465 validate_zero = true;
466 }
467
468 /* Memcpy relevant part of ELF maps data to loader maps */
469 for (i = 0; i < nr_maps; i++) {
470 struct bpf_load_map_def *def;
471 unsigned char *addr, *end;
472 const char *map_name;
473 size_t offset;
474
475 map_name = elf_strptr(elf, strtabidx, sym[i].st_name);
476 maps[i].name = strdup(map_name);
477 if (!maps[i].name) {
478 printf("strdup(%s): %s(%d)\n", map_name,
479 strerror(errno), errno);
480 free(sym);
481 return -errno;
482 }
483
484 /* Symbol value is offset into ELF maps section data area */
485 offset = sym[i].st_value;
486 def = (struct bpf_load_map_def *)(data_maps->d_buf + offset);
487 maps[i].elf_offset = offset;
488 memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def));
489 memcpy(&maps[i].def, def, map_sz_copy);
490
491 /* Verify no newer features were requested */
492 if (validate_zero) {
493 addr = (unsigned char *) def + map_sz_copy;
494 end = (unsigned char *) def + map_sz_elf;
495 for (; addr < end; addr++) {
496 if (*addr != 0) {
497 free(sym);
498 return -EFBIG;
499 }
500 }
501 }
502 }
503
504 free(sym);
505 return nr_maps;
506}
507
508static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
509{
510 int fd, i, ret, maps_shndx = -1, strtabidx = -1;
511 Elf *elf;
512 GElf_Ehdr ehdr;
513 GElf_Shdr shdr, shdr_prog;
514 Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL;
515 char *shname, *shname_prog;
516 int nr_maps = 0;
517
518 /* reset global variables */
519 kern_version = 0;
520 memset(license, 0, sizeof(license));
521 memset(processed_sec, 0, sizeof(processed_sec));
522
523 if (elf_version(EV_CURRENT) == EV_NONE)
524 return 1;
525
526 fd = open(path, O_RDONLY, 0);
527 if (fd < 0)
528 return 1;
529
530 elf = elf_begin(fd, ELF_C_READ, NULL);
531
532 if (!elf)
533 return 1;
534
535 if (gelf_getehdr(elf, &ehdr) != &ehdr)
536 return 1;
537
538 /* clear all kprobes */
539 i = write_kprobe_events("");
540
541 /* scan over all elf sections to get license and map info */
542 for (i = 1; i < ehdr.e_shnum; i++) {
543
544 if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
545 continue;
546
547 if (0) /* helpful for llvm debugging */
548 printf("section %d:%s data %p size %zd link %d flags %d\n",
549 i, shname, data->d_buf, data->d_size,
550 shdr.sh_link, (int) shdr.sh_flags);
551
552 if (strcmp(shname, "license") == 0) {
553 processed_sec[i] = true;
554 memcpy(license, data->d_buf, data->d_size);
555 } else if (strcmp(shname, "version") == 0) {
556 processed_sec[i] = true;
557 if (data->d_size != sizeof(int)) {
558 printf("invalid size of version section %zd\n",
559 data->d_size);
560 return 1;
561 }
562 memcpy(&kern_version, data->d_buf, sizeof(int));
563 } else if (strcmp(shname, "maps") == 0) {
564 int j;
565
566 maps_shndx = i;
567 data_maps = data;
568 for (j = 0; j < MAX_MAPS; j++)
569 map_data[j].fd = -1;
570 } else if (shdr.sh_type == SHT_SYMTAB) {
571 strtabidx = shdr.sh_link;
572 symbols = data;
573 }
574 }
575
576 ret = 1;
577
578 if (!symbols) {
579 printf("missing SHT_SYMTAB section\n");
580 goto done;
581 }
582
583 if (data_maps) {
584 nr_maps = load_elf_maps_section(map_data, maps_shndx,
585 elf, symbols, strtabidx);
586 if (nr_maps < 0) {
587 printf("Error: Failed loading ELF maps (errno:%d):%s\n",
588 nr_maps, strerror(-nr_maps));
589 goto done;
590 }
591 if (load_maps(map_data, nr_maps, fixup_map))
592 goto done;
593 map_data_count = nr_maps;
594
595 processed_sec[maps_shndx] = true;
596 }
597
598 /* process all relo sections, and rewrite bpf insns for maps */
599 for (i = 1; i < ehdr.e_shnum; i++) {
600 if (processed_sec[i])
601 continue;
602
603 if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
604 continue;
605
606 if (shdr.sh_type == SHT_REL) {
607 struct bpf_insn *insns;
608
609			/* locate prog sec that needs map fixup (relocations) */
610 if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog,
611 &shdr_prog, &data_prog))
612 continue;
613
614 if (shdr_prog.sh_type != SHT_PROGBITS ||
615 !(shdr_prog.sh_flags & SHF_EXECINSTR))
616 continue;
617
618 insns = (struct bpf_insn *) data_prog->d_buf;
619 processed_sec[i] = true; /* relo section */
620
621 if (parse_relo_and_apply(data, symbols, &shdr, insns,
622 map_data, nr_maps))
623 continue;
624 }
625 }
626
627 /* load programs */
628 for (i = 1; i < ehdr.e_shnum; i++) {
629
630 if (processed_sec[i])
631 continue;
632
633 if (get_sec(elf, i, &ehdr, &shname, &shdr, &data))
634 continue;
635
636 if (memcmp(shname, "kprobe/", 7) == 0 ||
637 memcmp(shname, "kretprobe/", 10) == 0 ||
638 memcmp(shname, "tracepoint/", 11) == 0 ||
639 memcmp(shname, "raw_tracepoint/", 15) == 0 ||
640 memcmp(shname, "xdp", 3) == 0 ||
641 memcmp(shname, "perf_event", 10) == 0 ||
642 memcmp(shname, "socket", 6) == 0 ||
643 memcmp(shname, "cgroup/", 7) == 0 ||
644 memcmp(shname, "sockops", 7) == 0 ||
645 memcmp(shname, "sk_skb", 6) == 0 ||
646 memcmp(shname, "sk_msg", 6) == 0) {
647 ret = load_and_attach(shname, data->d_buf,
648 data->d_size);
649 if (ret != 0)
650 goto done;
651 }
652 }
653
654done:
655 close(fd);
656 return ret;
657}
658
659int load_bpf_file(char *path)
660{
661 return do_load_bpf_file(path, NULL);
662}
663
664int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map)
665{
666 return do_load_bpf_file(path, fixup_map);
667}
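/* A hedged usage sketch (object name assumed): a typical caller loads
 * the object, dumps the shared log buffer on failure, and then uses
 * the global prog_fd[] and map_fd[] arrays this file fills in:
 *
 *	if (load_bpf_file("tracex1_kern.o")) {
 *		printf("%s", bpf_log_buf);
 *		return 1;
 *	}
 *	prog_fd[0] and map_fd[0] are now valid for use
 */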
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
new file mode 100644
index 000000000..4fcd258c6
--- /dev/null
+++ b/samples/bpf/bpf_load.h
@@ -0,0 +1,57 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __BPF_LOAD_H
3#define __BPF_LOAD_H
4
5#include <bpf/bpf.h>
6
7#define MAX_MAPS 32
8#define MAX_PROGS 32
9
10struct bpf_load_map_def {
11 unsigned int type;
12 unsigned int key_size;
13 unsigned int value_size;
14 unsigned int max_entries;
15 unsigned int map_flags;
16 unsigned int inner_map_idx;
17 unsigned int numa_node;
18};
19
20struct bpf_map_data {
21 int fd;
22 char *name;
23 size_t elf_offset;
24 struct bpf_load_map_def def;
25};
26
27typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx);
28
29extern int prog_fd[MAX_PROGS];
30extern int event_fd[MAX_PROGS];
31extern char bpf_log_buf[BPF_LOG_BUF_SIZE];
32extern int prog_cnt;
33
34/* There is a one-to-one mapping between map_fd[] and map_data[].
35 * The map_data[] entries just contain richer info on the given map.
36 */
37extern int map_fd[MAX_MAPS];
38extern struct bpf_map_data map_data[MAX_MAPS];
39extern int map_data_count;
40
41/* parses elf file compiled by llvm .c->.o
42 * . parses 'maps' section and creates maps via BPF syscall
43 * . parses 'license' section and passes it to syscall
44 * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by
45 * storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD
46 * . loads eBPF programs via BPF syscall
47 *
48 * One ELF file can contain multiple BPF programs which will be loaded
49 * and their FDs stored in the prog_fd array
50 *
51 * returns zero on success
52 */
53int load_bpf_file(char *path);
54int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map);
55
56int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
57#endif
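/* A hedged sketch of a fixup_map_cb for load_bpf_file_fixup_map(): the
 * callback runs before each map is created and may rewrite its
 * definition, e.g. to size a map at run time (names assumed):
 *
 *	static void my_fixup(struct bpf_map_data *map, int idx)
 *	{
 *		if (strcmp(map->name, "my_map") == 0)
 *			map->def.max_entries = 4096;
 *	}
 */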
diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c
new file mode 100644
index 000000000..deb0e3e03
--- /dev/null
+++ b/samples/bpf/cookie_uid_helper_example.c
@@ -0,0 +1,323 @@
1/* This test is a demo of using get_socket_uid and get_socket_cookie
2 * helper functions to do per-socket network traffic monitoring.
3 * It requires an iptables version higher than 1.6.1 to load a pinned
4 * eBPF program into the xt_bpf match.
5 *
6 * TEST:
7 * ./run_cookie_uid_helper_example.sh -option
8 * option:
9 * -t: do traffic monitoring test, the program will continuously
10 *     print out network traffic that happens after the program
11 *     starts. A sample output is shown below:
12 *
13 *	cookie: 877, uid: 0x3e8, Packet Count: 20, Bytes Count: 11058
14 *	cookie: 132, uid: 0x0, Packet Count: 2, Bytes Count: 286
15 *	cookie: 812, uid: 0x3e8, Packet Count: 3, Bytes Count: 1726
16 *	cookie: 802, uid: 0x3e8, Packet Count: 2, Bytes Count: 104
17 *	cookie: 877, uid: 0x3e8, Packet Count: 20, Bytes Count: 11058
18 *	cookie: 831, uid: 0x3e8, Packet Count: 2, Bytes Count: 104
19 *	cookie: 0, uid: 0x0, Packet Count: 6, Bytes Count: 712
20 *	cookie: 880, uid: 0xfffe, Packet Count: 1, Bytes Count: 70
21 *
22 * -s: do getsockopt SO_COOKIE test, the program will set up a pair of
23 *     UDP sockets and send packets between them, then read the traffic
24 *     data directly from the eBPF map based on the socket cookie.
25 *
26 * Clean up: if run via the shell script, the script deletes the iptables
27 * rule and unpins the bpf program on exit. Otherwise the iptables rule
28 * needs to be deleted by hand; see run_cookie_uid_helper_example.sh.
29 */
30
31#define _GNU_SOURCE
32
33#define offsetof(type, member) __builtin_offsetof(type, member)
34#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
35
36#include <arpa/inet.h>
37#include <errno.h>
38#include <error.h>
39#include <limits.h>
40#include <linux/bpf.h>
41#include <linux/if_ether.h>
42#include <net/if.h>
43#include <signal.h>
44#include <stdbool.h>
45#include <stdint.h>
46#include <stdio.h>
47#include <stdlib.h>
48#include <string.h>
49#include <sys/socket.h>
50#include <sys/stat.h>
51#include <sys/types.h>
52#include <unistd.h>
53#include <bpf/bpf.h>
54#include "bpf_insn.h"
55
56#define PORT 8888
57
58struct stats {
59 uint32_t uid;
60 uint64_t packets;
61 uint64_t bytes;
62};
63
64static int map_fd, prog_fd;
65
66static bool test_finish;
67
68static void maps_create(void)
69{
70 map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(uint32_t),
71 sizeof(struct stats), 100, 0);
72 if (map_fd < 0)
73 error(1, errno, "map create failed!\n");
74}
75
76static void prog_load(void)
77{
78 static char log_buf[1 << 16];
79
80 struct bpf_insn prog[] = {
81 /*
82		 * Save sk_buff for future usage. Values stored in R6 to R10
83		 * will not be reset after a bpf helper function call.
84 */
85 BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
86 /*
87 * pc1: BPF_FUNC_get_socket_cookie takes one parameter,
88 * R1: sk_buff
89 */
90 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
91 BPF_FUNC_get_socket_cookie),
92		/* pc2-4: save &socketCookie to r7 for future usage */
93 BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
94 BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
95 BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
96 /*
97 * pc5-8: set up the registers for BPF_FUNC_map_lookup_elem,
98 * it takes two parameters (R1: map_fd, R2: &socket_cookie)
99 */
100 BPF_LD_MAP_FD(BPF_REG_1, map_fd),
101 BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
102 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
103 BPF_FUNC_map_lookup_elem),
104 /*
105 * pc9. if r0 != 0x0, go to pc+14, since we have the cookie
106 * stored already
107		 * Otherwise do pc10-22 to set up a new data entry.
108 */
109 BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 14),
110 BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
111 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
112 BPF_FUNC_get_socket_uid),
113 /*
114		 * Place a struct stats on the stack (relative to R10) and
115		 * sequentially store its member values into memory. The packets
116		 * value is set by directly placing an IMM value 1 on the stack.
117 */
118 BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0,
119 -32 + (__s16)offsetof(struct stats, uid)),
120 BPF_ST_MEM(BPF_DW, BPF_REG_10,
121 -32 + (__s16)offsetof(struct stats, packets), 1),
122 /*
123		 * __sk_buff is a special struct used by eBPF programs to
124		 * directly access some sk_buff fields.
125 */
126 BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
127 offsetof(struct __sk_buff, len)),
128 BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1,
129 -32 + (__s16)offsetof(struct stats, bytes)),
130 /*
131 * add new map entry using BPF_FUNC_map_update_elem, it takes
132 * 4 parameters (R1: map_fd, R2: &socket_cookie, R3: &stats,
133 * R4: flags)
134 */
135 BPF_LD_MAP_FD(BPF_REG_1, map_fd),
136 BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
137 BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
138 BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -32),
139 BPF_MOV64_IMM(BPF_REG_4, 0),
140 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
141 BPF_FUNC_map_update_elem),
142 BPF_JMP_IMM(BPF_JA, 0, 0, 5),
143 /*
144		 * pc24-30 update the packet info of an existing data entry; this
145		 * can be done by writing directly through pointers instead of
146		 * using the BPF_FUNC_map_update_elem helper function
147 */
148 BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
149 BPF_MOV64_IMM(BPF_REG_1, 1),
150 BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1,
151 offsetof(struct stats, packets)),
152 BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
153 offsetof(struct __sk_buff, len)),
154 BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1,
155 offsetof(struct stats, bytes)),
156 BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6,
157 offsetof(struct __sk_buff, len)),
158 BPF_EXIT_INSN(),
159 };
160 prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog,
161 ARRAY_SIZE(prog), "GPL", 0,
162 log_buf, sizeof(log_buf));
163 if (prog_fd < 0)
164 error(1, errno, "failed to load prog\n%s\n", log_buf);
165}
166
167static void prog_attach_iptables(char *file)
168{
169 int ret;
170 char rules[100];
171
172 if (bpf_obj_pin(prog_fd, file))
173 error(1, errno, "bpf_obj_pin");
174 if (strlen(file) > 50) {
175 printf("file path too long: %s\n", file);
176 exit(1);
177 }
178 sprintf(rules, "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT",
179 file);
180 ret = system(rules);
181 if (ret < 0) {
182		printf("iptables rule update failed: %d\n", WEXITSTATUS(ret));
183 exit(1);
184 }
185}
186
187static void print_table(void)
188{
189 struct stats curEntry;
190 uint32_t curN = UINT32_MAX;
191 uint32_t nextN;
192 int res;
193
194 while (bpf_map_get_next_key(map_fd, &curN, &nextN) > -1) {
195 curN = nextN;
196 res = bpf_map_lookup_elem(map_fd, &curN, &curEntry);
197 if (res < 0) {
198 error(1, errno, "fail to get entry value of Key: %u\n",
199 curN);
200 } else {
201 printf("cookie: %u, uid: 0x%x, Packet Count: %lu,"
202 " Bytes Count: %lu\n", curN, curEntry.uid,
203 curEntry.packets, curEntry.bytes);
204 }
205 }
206}
207
208static void udp_client(void)
209{
210 struct sockaddr_in si_other = {0};
211 struct sockaddr_in si_me = {0};
212 struct stats dataEntry;
213 int s_rcv, s_send, i, recv_len;
214 char message = 'a';
215 char buf;
216 uint64_t cookie;
217 int res;
218 socklen_t cookie_len = sizeof(cookie);
219 socklen_t slen = sizeof(si_other);
220
221 s_rcv = socket(PF_INET, SOCK_DGRAM, 0);
222 if (s_rcv < 0)
223		error(1, errno, "rcv socket create failed!\n");
224 si_other.sin_family = AF_INET;
225 si_other.sin_port = htons(PORT);
226 if (inet_aton("127.0.0.1", &si_other.sin_addr) == 0)
227 error(1, errno, "inet_aton\n");
228 if (bind(s_rcv, (struct sockaddr *)&si_other, sizeof(si_other)) == -1)
229 error(1, errno, "bind\n");
230 s_send = socket(PF_INET, SOCK_DGRAM, 0);
231 if (s_send < 0)
232		error(1, errno, "send socket create failed!\n");
233 res = getsockopt(s_send, SOL_SOCKET, SO_COOKIE, &cookie, &cookie_len);
234 if (res < 0)
235 printf("get cookie failed: %s\n", strerror(errno));
236 res = bpf_map_lookup_elem(map_fd, &cookie, &dataEntry);
237 if (res != -1)
238 error(1, errno, "socket stat found while flow not active\n");
239 for (i = 0; i < 10; i++) {
240 res = sendto(s_send, &message, sizeof(message), 0,
241 (struct sockaddr *)&si_other, slen);
242 if (res == -1)
243 error(1, errno, "send\n");
244 if (res != sizeof(message))
245 error(1, 0, "%uB != %luB\n", res, sizeof(message));
246 recv_len = recvfrom(s_rcv, &buf, sizeof(buf), 0,
247 (struct sockaddr *)&si_me, &slen);
248 if (recv_len < 0)
249 error(1, errno, "receive\n");
250 res = memcmp(&(si_other.sin_addr), &(si_me.sin_addr),
251 sizeof(si_me.sin_addr));
252 if (res != 0)
253 error(1, EFAULT, "sender addr error: %d\n", res);
254 printf("Message received: %c\n", buf);
255 res = bpf_map_lookup_elem(map_fd, &cookie, &dataEntry);
256 if (res < 0)
257 error(1, errno, "lookup sk stat failed, cookie: %lu\n",
258 cookie);
259 printf("cookie: %lu, uid: 0x%x, Packet Count: %lu,"
260 " Bytes Count: %lu\n\n", cookie, dataEntry.uid,
261 dataEntry.packets, dataEntry.bytes);
262 }
263 close(s_send);
264 close(s_rcv);
265}
266
267static int usage(void)
268{
269 printf("Usage: ./run_cookie_uid_helper_example.sh"
270 " bpfObjName -option\n"
271 " -t traffic monitor test\n"
272 " -s getsockopt cookie test\n");
273 return 1;
274}
275
276static void finish(int ret)
277{
278 test_finish = true;
279}
280
281int main(int argc, char *argv[])
282{
283 int opt;
284 bool cfg_test_traffic = false;
285 bool cfg_test_cookie = false;
286
287 if (argc != 3)
288 return usage();
289 while ((opt = getopt(argc, argv, "ts")) != -1) {
290 switch (opt) {
291 case 't':
292 cfg_test_traffic = true;
293 break;
294 case 's':
295 cfg_test_cookie = true;
296 break;
297
298 default:
299 printf("unknown option %c\n", opt);
300 usage();
301 return -1;
302 }
303 }
304 maps_create();
305 prog_load();
306 prog_attach_iptables(argv[2]);
307 if (cfg_test_traffic) {
308 if (signal(SIGINT, finish) == SIG_ERR)
309 error(1, errno, "register SIGINT handler failed");
310 if (signal(SIGTERM, finish) == SIG_ERR)
311 error(1, errno, "register SIGTERM handler failed");
312 while (!test_finish) {
313 print_table();
314 printf("\n");
315 sleep(1);
316 };
317 } else if (cfg_test_cookie) {
318 udp_client();
319 }
320 close(prog_fd);
321 close(map_fd);
322 return 0;
323}
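/* A hedged cleanup sketch for manual runs: undo what
 * prog_attach_iptables() set up by mirroring the -A rule with -D and
 * removing the pin (<bpfObjName> as passed on the command line):
 *
 *	iptables -D OUTPUT -m bpf --object-pinned <bpfObjName> -j ACCEPT
 *	rm <bpfObjName>
 */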
diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c
new file mode 100644
index 000000000..5aefd19cd
--- /dev/null
+++ b/samples/bpf/cpustat_kern.c
@@ -0,0 +1,281 @@
1// SPDX-License-Identifier: GPL-2.0
2
3#include <linux/version.h>
4#include <linux/ptrace.h>
5#include <uapi/linux/bpf.h>
6#include <bpf/bpf_helpers.h>
7
8/*
9 * The CPU number, cstate number and pstate number are based
10 * on 96boards Hikey with octa CA53 CPUs.
11 *
12 * Every CPU has three idle states for cstate:
13 *   WFI, CPU_OFF, CLUSTER_OFF
14 *
15 * Every CPU has 5 operating points:
16 *   208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
17 *
18 * This code is based on these assumptions; other platforms
19 * need to adjust these definitions.
20 */
21#define MAX_CPU 8
22#define MAX_PSTATE_ENTRIES 5
23#define MAX_CSTATE_ENTRIES 3
24
25static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 };
26
27/*
28 * my_map structure is used to record the cstate and pstate index and
29 * timestamp (Idx, Ts); when a new event comes in we need to update
30 * the combination with the new state index and timestamp (Idx', Ts').
31 *
32 * Based on (Idx, Ts) and (Idx', Ts') we can calculate the time
33 * interval for the previous state: Duration(Idx) = Ts' - Ts.
34 *
35 * Every CPU has one array, shown below, for recording state index and
36 * timestamp, with cstate and pstate recorded separately:
37 *
38 * +--------------------------+
39 * | cstate timestamp |
40 * +--------------------------+
41 * | cstate index |
42 * +--------------------------+
43 * | pstate timestamp |
44 * +--------------------------+
45 * | pstate index |
46 * +--------------------------+
47 */
48#define MAP_OFF_CSTATE_TIME 0
49#define MAP_OFF_CSTATE_IDX 1
50#define MAP_OFF_PSTATE_TIME 2
51#define MAP_OFF_PSTATE_IDX 3
52#define MAP_OFF_NUM 4
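/*
 * A worked example of the layout above: the slot for CPU c and field f
 * lives at key c * MAP_OFF_NUM + f, so CPU 3's pstate timestamp is at
 * key 3 * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME == 14.
 */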
53
54struct {
55 __uint(type, BPF_MAP_TYPE_ARRAY);
56 __type(key, u32);
57 __type(value, u64);
58 __uint(max_entries, MAX_CPU * MAP_OFF_NUM);
59} my_map SEC(".maps");
60
61/* cstate_duration records duration time for every idle state per CPU */
62struct {
63 __uint(type, BPF_MAP_TYPE_ARRAY);
64 __type(key, u32);
65 __type(value, u64);
66 __uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES);
67} cstate_duration SEC(".maps");
68
69/* pstate_duration records duration time for every operating point per CPU */
70struct {
71 __uint(type, BPF_MAP_TYPE_ARRAY);
72 __type(key, u32);
73 __type(value, u64);
74 __uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES);
75} pstate_duration SEC(".maps");
76
77/*
78 * The trace events for cpu_idle and cpu_frequency are taken from:
79 * /sys/kernel/debug/tracing/events/power/cpu_idle/format
80 * /sys/kernel/debug/tracing/events/power/cpu_frequency/format
81 *
82 * These two events have same format, so define one common structure.
83 */
84struct cpu_args {
85 u64 pad;
86 u32 state;
87 u32 cpu_id;
88};
89
90/* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */
91static u32 find_cpu_pstate_idx(u32 frequency)
92{
93 u32 i;
94
95 for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) {
96 if (frequency == cpu_opps[i])
97 return i;
98 }
99
100 return i;
101}
102
103SEC("tracepoint/power/cpu_idle")
104int bpf_prog1(struct cpu_args *ctx)
105{
106 u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta;
107 u32 key, cpu, pstate_idx;
108 u64 *val;
109
110	if (ctx->cpu_id >= MAX_CPU)
111 return 0;
112
113 cpu = ctx->cpu_id;
114
115 key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME;
116 cts = bpf_map_lookup_elem(&my_map, &key);
117 if (!cts)
118 return 0;
119
120 key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
121 cstate = bpf_map_lookup_elem(&my_map, &key);
122 if (!cstate)
123 return 0;
124
125 key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
126 pts = bpf_map_lookup_elem(&my_map, &key);
127 if (!pts)
128 return 0;
129
130 key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
131 pstate = bpf_map_lookup_elem(&my_map, &key);
132 if (!pstate)
133 return 0;
134
135 prev_state = *cstate;
136 *cstate = ctx->state;
137
138 if (!*cts) {
139 *cts = bpf_ktime_get_ns();
140 return 0;
141 }
142
143 cur_ts = bpf_ktime_get_ns();
144 delta = cur_ts - *cts;
145 *cts = cur_ts;
146
147 /*
148	 * When state is not equal to (u32)-1, the cpu will enter
149	 * an idle state; in this case we need to record the interval
150 * for the pstate.
151 *
152 * OPP2
153 * +---------------------+
154 * OPP1 | |
155 * ---------+ |
156 * | Idle state
157 * +---------------
158 *
159 * |<- pstate duration ->|
160 * ^ ^
161 * pts cur_ts
162 */
163 if (ctx->state != (u32)-1) {
164
165 /* record pstate after have first cpu_frequency event */
166 if (!*pts)
167 return 0;
168
169 delta = cur_ts - *pts;
170
171 pstate_idx = find_cpu_pstate_idx(*pstate);
172 if (pstate_idx >= MAX_PSTATE_ENTRIES)
173 return 0;
174
175 key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
176 val = bpf_map_lookup_elem(&pstate_duration, &key);
177 if (val)
178 __sync_fetch_and_add((long *)val, delta);
179
180 /*
181		 * When state is equal to (u32)-1, the cpu has just exited a
182		 * specific idle state; in this case we need to record the
183		 * interval for the cstate.
184 *
185 * OPP2
186 * -----------+
187 * | OPP1
188 * | +-----------
189 * | Idle state |
190 * +---------------------+
191 *
192 * |<- cstate duration ->|
193 * ^ ^
194 * cts cur_ts
195 */
196 } else {
197
198 key = cpu * MAX_CSTATE_ENTRIES + prev_state;
199 val = bpf_map_lookup_elem(&cstate_duration, &key);
200 if (val)
201 __sync_fetch_and_add((long *)val, delta);
202 }
203
204 /* Update timestamp for pstate as new start time */
205 if (*pts)
206 *pts = cur_ts;
207
208 return 0;
209}
210
211SEC("tracepoint/power/cpu_frequency")
212int bpf_prog2(struct cpu_args *ctx)
213{
214 u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta;
215 u32 key, cpu, pstate_idx;
216 u64 *val;
217
218 cpu = ctx->cpu_id;
219
220 key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
221 pts = bpf_map_lookup_elem(&my_map, &key);
222 if (!pts)
223 return 0;
224
225 key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
226 pstate = bpf_map_lookup_elem(&my_map, &key);
227 if (!pstate)
228 return 0;
229
230 key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
231 cstate = bpf_map_lookup_elem(&my_map, &key);
232 if (!cstate)
233 return 0;
234
235 prev_state = *pstate;
236 *pstate = ctx->state;
237
238 if (!*pts) {
239 *pts = bpf_ktime_get_ns();
240 return 0;
241 }
242
243 cur_ts = bpf_ktime_get_ns();
244 delta = cur_ts - *pts;
245 *pts = cur_ts;
246
247 /* When CPU is in idle, bail out to skip pstate statistics */
248 if (*cstate != (u32)(-1))
249 return 0;
250
251 /*
252	 * The cpu changes to a different OPP (in the diagram below, the
253	 * frequency changes from OPP3 to OPP1); we need to record the
254	 * interval for the previous frequency OPP3 and update the
255	 * timestamp as the start time for the new frequency OPP1.
256 *
257 * OPP3
258 * +---------------------+
259 * OPP2 | |
260 * ---------+ |
261 * | OPP1
262 * +---------------
263 *
264 * |<- pstate duration ->|
265 * ^ ^
266 * pts cur_ts
267 */
268 pstate_idx = find_cpu_pstate_idx(*pstate);
269 if (pstate_idx >= MAX_PSTATE_ENTRIES)
270 return 0;
271
272 key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
273 val = bpf_map_lookup_elem(&pstate_duration, &key);
274 if (val)
275 __sync_fetch_and_add((long *)val, delta);
276
277 return 0;
278}
279
280char _license[] SEC("license") = "GPL";
281u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c
new file mode 100644
index 000000000..96675985e
--- /dev/null
+++ b/samples/bpf/cpustat_user.c
@@ -0,0 +1,252 @@
1// SPDX-License-Identifier: GPL-2.0
2
3#define _GNU_SOURCE
4#include <errno.h>
5#include <stdio.h>
6#include <stdlib.h>
7#include <signal.h>
8#include <sched.h>
9#include <string.h>
10#include <unistd.h>
11#include <fcntl.h>
12#include <locale.h>
13#include <sys/types.h>
14#include <sys/stat.h>
15#include <sys/time.h>
16#include <sys/resource.h>
17#include <sys/wait.h>
18
19#include <bpf/bpf.h>
20#include <bpf/libbpf.h>
21
22static int cstate_map_fd, pstate_map_fd;
23
24#define MAX_CPU 8
25#define MAX_PSTATE_ENTRIES 5
26#define MAX_CSTATE_ENTRIES 3
27#define MAX_STARS 40
28
29#define CPUFREQ_MAX_SYSFS_PATH "/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq"
30#define CPUFREQ_LOWEST_FREQ "208000"
31#define CPUFREQ_HIGHEST_FREQ "12000000"
32
33struct cpu_stat_data {
34 unsigned long cstate[MAX_CSTATE_ENTRIES];
35 unsigned long pstate[MAX_PSTATE_ENTRIES];
36};
37
38static struct cpu_stat_data stat_data[MAX_CPU];
39
40static void cpu_stat_print(void)
41{
42 int i, j;
43 char state_str[sizeof("cstate-9")];
44 struct cpu_stat_data *data;
45
46 /* Clear screen */
47 printf("\033[2J");
48
49 /* Header */
50 printf("\nCPU states statistics:\n");
51 printf("%-10s ", "state(ms)");
52
53 for (i = 0; i < MAX_CSTATE_ENTRIES; i++) {
54 sprintf(state_str, "cstate-%d", i);
55 printf("%-11s ", state_str);
56 }
57
58 for (i = 0; i < MAX_PSTATE_ENTRIES; i++) {
59 sprintf(state_str, "pstate-%d", i);
60 printf("%-11s ", state_str);
61 }
62
63 printf("\n");
64
65 for (j = 0; j < MAX_CPU; j++) {
66 data = &stat_data[j];
67
68 printf("CPU-%-6d ", j);
69 for (i = 0; i < MAX_CSTATE_ENTRIES; i++)
70 printf("%-11ld ", data->cstate[i] / 1000000);
71
72 for (i = 0; i < MAX_PSTATE_ENTRIES; i++)
73 printf("%-11ld ", data->pstate[i] / 1000000);
74
75 printf("\n");
76 }
77}
78
79static void cpu_stat_update(int cstate_fd, int pstate_fd)
80{
81 unsigned long key, value;
82 int c, i;
83
84 for (c = 0; c < MAX_CPU; c++) {
85 for (i = 0; i < MAX_CSTATE_ENTRIES; i++) {
86 key = c * MAX_CSTATE_ENTRIES + i;
87 bpf_map_lookup_elem(cstate_fd, &key, &value);
88 stat_data[c].cstate[i] = value;
89 }
90
91 for (i = 0; i < MAX_PSTATE_ENTRIES; i++) {
92 key = c * MAX_PSTATE_ENTRIES + i;
93 bpf_map_lookup_elem(pstate_fd, &key, &value);
94 stat_data[c].pstate[i] = value;
95 }
96 }
97}
98
99/*
100 * This function is copied from 'idlestat' tool function
101 * idlestat_wake_all() in idlestate.c.
102 *
103 * It sets the running task's affinity to each cpu in turn so as to wake
104 * up that specific CPU for scheduling; as a result, all cpus are woken
105 * up once and produce the ftrace event 'trace_cpu_idle'.
106 */
107static int cpu_stat_inject_cpu_idle_event(void)
108{
109 int rcpu, i, ret;
110 cpu_set_t cpumask;
111 cpu_set_t original_cpumask;
112
113 ret = sysconf(_SC_NPROCESSORS_CONF);
114 if (ret < 0)
115 return -1;
116
117 rcpu = sched_getcpu();
118 if (rcpu < 0)
119 return -1;
120
121 /* Keep track of the CPUs we will run on */
122 sched_getaffinity(0, sizeof(original_cpumask), &original_cpumask);
123
124 for (i = 0; i < ret; i++) {
125
126 /* Pointless to wake up ourself */
127 if (i == rcpu)
128 continue;
129
130 /* Pointless to wake CPUs we will not run on */
131 if (!CPU_ISSET(i, &original_cpumask))
132 continue;
133
134 CPU_ZERO(&cpumask);
135 CPU_SET(i, &cpumask);
136
137 sched_setaffinity(0, sizeof(cpumask), &cpumask);
138 }
139
140 /* Enable all the CPUs of the original mask */
141 sched_setaffinity(0, sizeof(original_cpumask), &original_cpumask);
142 return 0;
143}
144
145/*
146 * It's possible for the frequency not to change for a long time, so no
147 * ftrace event 'trace_cpu_frequency' is seen for a long period; this
148 * introduces a big deviation in the pstate statistics.
149 *
150 * To solve this issue, the code below forces 'scaling_max_freq' down to
151 * 208MHz to trigger the ftrace event 'trace_cpu_frequency' and then
152 * restores the maximum frequency value of 1.2GHz.
153 */
154static int cpu_stat_inject_cpu_frequency_event(void)
155{
156 int len, fd;
157
158 fd = open(CPUFREQ_MAX_SYSFS_PATH, O_WRONLY);
159 if (fd < 0) {
160 printf("failed to open scaling_max_freq, errno=%d\n", errno);
161 return fd;
162 }
163
164 len = write(fd, CPUFREQ_LOWEST_FREQ, strlen(CPUFREQ_LOWEST_FREQ));
165 if (len < 0) {
166		printf("failed to write scaling_max_freq, errno=%d\n", errno);
167 goto err;
168 }
169
170 len = write(fd, CPUFREQ_HIGHEST_FREQ, strlen(CPUFREQ_HIGHEST_FREQ));
171 if (len < 0) {
172		printf("failed to write scaling_max_freq, errno=%d\n", errno);
173 goto err;
174 }
175
176err:
177 close(fd);
178 return len;
179}
180
181static void int_exit(int sig)
182{
183 cpu_stat_inject_cpu_idle_event();
184 cpu_stat_inject_cpu_frequency_event();
185 cpu_stat_update(cstate_map_fd, pstate_map_fd);
186 cpu_stat_print();
187 exit(0);
188}
189
190int main(int argc, char **argv)
191{
192 struct bpf_link *link = NULL;
193 struct bpf_program *prog;
194 struct bpf_object *obj;
195 char filename[256];
196 int ret;
197
198 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
199 obj = bpf_object__open_file(filename, NULL);
200 if (libbpf_get_error(obj)) {
201 fprintf(stderr, "ERROR: opening BPF object file failed\n");
202 return 0;
203 }
204
205 prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
206 if (!prog) {
207 printf("finding a prog in obj file failed\n");
208 goto cleanup;
209 }
210
211 /* load BPF program */
212 if (bpf_object__load(obj)) {
213 fprintf(stderr, "ERROR: loading BPF object file failed\n");
214 goto cleanup;
215 }
216
217 cstate_map_fd = bpf_object__find_map_fd_by_name(obj, "cstate_duration");
218 pstate_map_fd = bpf_object__find_map_fd_by_name(obj, "pstate_duration");
219 if (cstate_map_fd < 0 || pstate_map_fd < 0) {
220 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
221 goto cleanup;
222 }
223
224 link = bpf_program__attach(prog);
225 if (libbpf_get_error(link)) {
226 fprintf(stderr, "ERROR: bpf_program__attach failed\n");
227 link = NULL;
228 goto cleanup;
229 }
230
231 ret = cpu_stat_inject_cpu_idle_event();
232 if (ret < 0)
233 return 1;
234
235 ret = cpu_stat_inject_cpu_frequency_event();
236 if (ret < 0)
237 return 1;
238
239 signal(SIGINT, int_exit);
240 signal(SIGTERM, int_exit);
241
242 while (1) {
243 cpu_stat_update(cstate_map_fd, pstate_map_fd);
244 cpu_stat_print();
245 sleep(5);
246 }
247
248cleanup:
249 bpf_link__destroy(link);
250 bpf_object__close(obj);
251 return 0;
252}
diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh
new file mode 100755
index 000000000..ffe4c0607
--- /dev/null
+++ b/samples/bpf/do_hbm_test.sh
@@ -0,0 +1,442 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3#
4# Copyright (c) 2019 Facebook
5#
6# This program is free software; you can redistribute it and/or
7# modify it under the terms of version 2 of the GNU General Public
8# License as published by the Free Software Foundation.
9
10Usage() {
11 echo "Script for testing HBM (Host Bandwidth Manager) framework."
12	echo "It creates a cgroup to use for testing and loads a BPF program to limit"
13	echo "egress or ingress bandwidth. It then uses iperf3 or netperf to create"
14 echo "loads. The output is the goodput in Mbps (unless -D was used)."
15 echo ""
16 echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]"
17 echo " [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E] [--edt]"
18	echo "       [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id>]"
19 echo " [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]"
20	echo "       [-q=<qdisc>] [-R] [-s=<server>|--server=<server>]"
21	echo "       [-S|--stats] [-t=<time>|--time=<time>] [-w] [cubic|dctcp]"
22 echo " Where:"
23 echo " out egress (default)"
24 echo " -b or --bpf BPF program filename to load and attach."
25 echo " Default is hbm_out_kern.o for egress,"
26	echo "    -c or --cc      TCP congestion control (cubic or dctcp)"
27 echo " --debug print BPF trace buffer"
28 echo " -d or --delay add a delay in ms using netem"
29 echo " -D In addition to the goodput in Mbps, it also outputs"
30 echo " other detailed information. This information is"
31 echo " test dependent (i.e. iperf3 or netperf)."
32 echo " -E enable ECN (not required for dctcp)"
33 echo " --edt use fq's Earliest Departure Time (requires fq)"
34 echo " -f or --flows number of concurrent flows (default=1)"
35 echo " -i or --id cgroup id (an integer, default is 1)"
36 echo " -N use netperf instead of iperf3"
37 echo " --no_cn Do not return CN notifications"
38 echo " -l do not limit flows using loopback"
39 echo " -h Help"
40 echo " -p or --port iperf3 port (default is 5201)"
41 echo " -P use an iperf3 instance for each flow"
42 echo " -q use the specified qdisc"
43	echo "    -r or --rate    rate in Mbps (default is 1Gbps)"
44 echo " -R Use TCP_RR for netperf. 1st flow has req"
45 echo " size of 10KB, rest of 1MB. Reply in all"
46 echo " cases is 1 byte."
47 echo " More detailed output for each flow can be found"
48 echo " in the files netperf.<cg>.<flow>, where <cg> is the"
49 echo " cgroup id as specified with the -i flag, and <flow>"
50	echo "                    is the flow id starting at 1 and increasing by 1 for"
51	echo "                    each flow (as specified by -f)."
52 echo " -s or --server hostname of netperf server. Used to create netperf"
53	echo "                    test traffic between two hosts (default is within host)"
54 echo " netserver must be running on the host."
55 echo " -S or --stats whether to update hbm stats (default is yes)."
56 echo " -t or --time duration of iperf3 in seconds (default=5)"
57 echo " -w Work conserving flag. cgroup can increase its"
58 echo " bandwidth beyond the rate limit specified"
59 echo " while there is available bandwidth. Current"
60 echo " implementation assumes there is only one NIC"
61 echo " (eth0), but can be extended to support multiple"
62 echo " NICs."
63 echo " cubic or dctcp specify which TCP CC to use"
64 echo " "
65 exit
66}
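# Hedged example invocations (flag values illustrative):
#   ./do_hbm_test.sh -t=20 -f=4 -r=500 dctcp    4 dctcp flows at 500 Mbps for 20s
#   ./do_hbm_test.sh -N -R -f=2 cubic           netperf TCP_RR with 2 cubic flows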
67
68#set -x
69
70debug_flag=0
71args="$@"
72name="$0"
73netem=0
74cc=x
75dir="-o"
76dir_name="out"
77dur=5
78flows=1
79id=1
80prog=""
81port=5201
82rate=1000
83multi_iperf=0
84flow_cnt=1
85use_netperf=0
86rr=0
87ecn=0
88details=0
89server=""
90qdisc=""
91flags=""
92do_stats=0
93
94function start_hbm () {
95 rm -f hbm.out
96 echo "./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog" > hbm.out
97 echo " " >> hbm.out
98 ./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog >> hbm.out 2>&1 &
99 echo $!
100}
101
102processArgs () {
103 for i in $args ; do
104 case $i in
105	# Support for upcoming ingress rate limiting
106 #in) # support for upcoming ingress rate limiting
107 # dir="-i"
108 # dir_name="in"
109 # ;;
110 out)
111 dir="-o"
112 dir_name="out"
113 ;;
114 -b=*|--bpf=*)
115 prog="${i#*=}"
116 ;;
117 -c=*|--cc=*)
118 cc="${i#*=}"
119 ;;
120 --no_cn)
121 flags="$flags --no_cn"
122 ;;
123 --debug)
124 flags="$flags -d"
125 debug_flag=1
126 ;;
127 -d=*|--delay=*)
128 netem="${i#*=}"
129 ;;
130 -D)
131 details=1
132 ;;
133 -E)
134 ecn=1
135 ;;
136 --edt)
137 flags="$flags --edt"
138 qdisc="fq"
139 ;;
140 -f=*|--flows=*)
141 flows="${i#*=}"
142 ;;
143 -i=*|--id=*)
144 id="${i#*=}"
145 ;;
146 -l)
147 flags="$flags -l"
148 ;;
149 -N)
150 use_netperf=1
151 ;;
152 -p=*|--port=*)
153 port="${i#*=}"
154 ;;
155 -P)
156 multi_iperf=1
157 ;;
158 -q=*)
159 qdisc="${i#*=}"
160 ;;
161 -r=*|--rate=*)
162 rate="${i#*=}"
163 ;;
164 -R)
165 rr=1
166 ;;
167 -s=*|--server=*)
168 server="${i#*=}"
169 ;;
170 -S|--stats)
171 flags="$flags -s"
172 do_stats=1
173 ;;
174 -t=*|--time=*)
175 dur="${i#*=}"
176 ;;
177 -w)
178 flags="$flags -w"
179 ;;
180 cubic)
181 cc=cubic
182 ;;
183 dctcp)
184 cc=dctcp
185 ;;
186 *)
187 echo "Unknown arg:$i"
188 Usage
189 ;;
190 esac
191 done
192}
193
194processArgs
195
196if [ $debug_flag -eq 1 ] ; then
197 rm -f hbm_out.log
198fi
199
200hbm_pid=$(start_hbm)
201usleep 100000
202
203host=`hostname`
204cg_base_dir=/sys/fs/cgroup
205cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id"
206
207echo $$ >> $cg_dir/cgroup.procs
208
209ulimit -l unlimited
210
211rm -f ss.out
212rm -f hbm.[0-9]*.$dir_name
213if [ $ecn -ne 0 ] ; then
214 sysctl -w -q -n net.ipv4.tcp_ecn=1
215fi
216
217if [ $use_netperf -eq 0 ] ; then
218 cur_cc=`sysctl -n net.ipv4.tcp_congestion_control`
219 if [ "$cc" != "x" ] ; then
220 sysctl -w -q -n net.ipv4.tcp_congestion_control=$cc
221 fi
222fi
223
224if [ "$netem" -ne "0" ] ; then
225 if [ "$qdisc" != "" ] ; then
226		echo "WARNING: Ignoring -q option because -d option used"
227 fi
228 tc qdisc del dev lo root > /dev/null 2>&1
229 tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1
230elif [ "$qdisc" != "" ] ; then
231 tc qdisc del dev eth0 root > /dev/null 2>&1
232 tc qdisc add dev eth0 root $qdisc > /dev/null 2>&1
233fi
234
235n=0
236m=$[$dur * 5]
237hn="::1"
238if [ $use_netperf -ne 0 ] ; then
239 if [ "$server" != "" ] ; then
240 hn=$server
241 fi
242fi
243
244( ping6 -i 0.2 -c $m $hn > ping.out 2>&1 ) &
245
246if [ $use_netperf -ne 0 ] ; then
247 begNetserverPid=`ps ax | grep netserver | grep --invert-match "grep" | \
248 awk '{ print $1 }'`
249 if [ "$begNetserverPid" == "" ] ; then
250 if [ "$server" == "" ] ; then
251 ( ./netserver > /dev/null 2>&1) &
252 usleep 100000
253 fi
254 fi
255 flow_cnt=1
256 if [ "$server" == "" ] ; then
257 np_server=$host
258 else
259 np_server=$server
260 fi
261 if [ "$cc" == "x" ] ; then
262 np_cc=""
263 else
264 np_cc="-K $cc,$cc"
265 fi
266 replySize=1
267 while [ $flow_cnt -le $flows ] ; do
268 if [ $rr -ne 0 ] ; then
269 reqSize=1M
270 if [ $flow_cnt -eq 1 ] ; then
271 reqSize=10K
272 fi
273 if [ "$dir" == "-i" ] ; then
274 replySize=$reqSize
275 reqSize=1
276 fi
277			( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r $reqSize,$replySize $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,REMOTE_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,LOCAL_RECV_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
278 else
279 if [ "$dir" == "-i" ] ; then
280 ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r 1,10M $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REMOTE_TRANSPORT_RETRANS,REMOTE_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
281 else
282			( ./netperf -H $np_server -l $dur -f m -j -t TCP_STREAM -- $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
283 fi
284 fi
285 flow_cnt=$[flow_cnt+1]
286 done
287
288# sleep for duration of test (plus some buffer)
289 n=$[dur+2]
290 sleep $n
291
292# force graceful termination of netperf
293 pids=`pgrep netperf`
294 for p in $pids ; do
295 kill -SIGALRM $p
296 done
297
298 flow_cnt=1
299 rate=0
300 if [ $details -ne 0 ] ; then
301 echo ""
302 echo "Details for HBM in cgroup $id"
303 if [ $do_stats -eq 1 ] ; then
304 if [ -e hbm.$id.$dir_name ] ; then
305 cat hbm.$id.$dir_name
306 fi
307 fi
308 fi
309 while [ $flow_cnt -le $flows ] ; do
310 if [ "$dir" == "-i" ] ; then
311 r=`cat netperf.$id.$flow_cnt | grep -o "REMOTE_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"`
312 else
313 r=`cat netperf.$id.$flow_cnt | grep -o "LOCAL_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"`
314 fi
315 echo "rate for flow $flow_cnt: $r"
316 rate=$[rate+r]
317 if [ $details -ne 0 ] ; then
318 echo "-----"
319 echo "Details for cgroup $id, flow $flow_cnt"
320 cat netperf.$id.$flow_cnt
321 fi
322 flow_cnt=$[flow_cnt+1]
323 done
324 if [ $details -ne 0 ] ; then
325 echo ""
326 delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
327 echo "PING AVG DELAY:$delay"
328 echo "AGGREGATE_GOODPUT:$rate"
329 else
330 echo $rate
331 fi
332elif [ $multi_iperf -eq 0 ] ; then
333 (iperf3 -s -p $port -1 > /dev/null 2>&1) &
334 usleep 100000
335 iperf3 -c $host -p $port -i 0 -P $flows -f m -t $dur > iperf.$id
336 rates=`grep receiver iperf.$id | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*"`
337 rate=`echo $rates | grep -o "[0-9]*$"`
338
339 if [ $details -ne 0 ] ; then
340 echo ""
341 echo "Details for HBM in cgroup $id"
342 if [ $do_stats -eq 1 ] ; then
343 if [ -e hbm.$id.$dir_name ] ; then
344 cat hbm.$id.$dir_name
345 fi
346 fi
347 delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
348 echo "PING AVG DELAY:$delay"
349 echo "AGGREGATE_GOODPUT:$rate"
350 else
351 echo $rate
352 fi
353else
354 flow_cnt=1
355 while [ $flow_cnt -le $flows ] ; do
356 (iperf3 -s -p $port -1 > /dev/null 2>&1) &
357 ( iperf3 -c $host -p $port -i 0 -P 1 -f m -t $dur | grep receiver | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*" | grep -o "[0-9]*$" > iperf3.$id.$flow_cnt ) &
358 port=$[port+1]
359 flow_cnt=$[flow_cnt+1]
360 done
361 n=$[dur+1]
362 sleep $n
363 flow_cnt=1
364 rate=0
365 if [ $details -ne 0 ] ; then
366 echo ""
367 echo "Details for HBM in cgroup $id"
368 if [ $do_stats -eq 1 ] ; then
369 if [ -e hbm.$id.$dir_name ] ; then
370 cat hbm.$id.$dir_name
371 fi
372 fi
373 fi
374
375 while [ $flow_cnt -le $flows ] ; do
376 r=`cat iperf3.$id.$flow_cnt`
377# echo "rate for flow $flow_cnt: $r"
378 if [ $details -ne 0 ] ; then
379 echo "Rate for cgroup $id, flow $flow_cnt LOCAL_SEND_THROUGHPUT=$r"
380 fi
381 rate=$[rate+r]
382 flow_cnt=$[flow_cnt+1]
383 done
384 if [ $details -ne 0 ] ; then
385 delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
386 echo "PING AVG DELAY:$delay"
387 echo "AGGREGATE_GOODPUT:$rate"
388 else
389 echo $rate
390 fi
391fi
392
393if [ $use_netperf -eq 0 ] ; then
394 sysctl -w -q -n net.ipv4.tcp_congestion_control=$cur_cc
395fi
396if [ $ecn -ne 0 ] ; then
397 sysctl -w -q -n net.ipv4.tcp_ecn=0
398fi
399if [ "$netem" -ne "0" ] ; then
400 tc qdisc del dev lo root > /dev/null 2>&1
401fi
402if [ "$qdisc" != "" ] ; then
403 tc qdisc del dev eth0 root > /dev/null 2>&1
404fi
405sleep 2
406
407hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'`
408if [ "$hbmPid" == "$hbm_pid" ] ; then
409 kill $hbm_pid
410fi
411
412sleep 1
413
414# Detach any BPF programs that may have lingered
415ttx=`bpftool cgroup tree | grep hbm`
416v=2
417for x in $ttx ; do
418 if [ "${x:0:36}" == "/sys/fs/cgroup/cgroup-test-work-dir/" ] ; then
419 cg=$x ; v=0
420 else
421 if [ $v -eq 0 ] ; then
422 id=$x ; v=1
423 else
424 if [ $v -eq 1 ] ; then
425 type=$x ; bpftool cgroup detach $cg $type id $id
426 v=0
427 fi
428 fi
429 fi
430done
431
432if [ $use_netperf -ne 0 ] ; then
433 if [ "$server" == "" ] ; then
434 if [ "$begNetserverPid" == "" ] ; then
435 netserverPid=`ps ax | grep netserver | grep --invert-match "grep" | awk '{ print $1 }'`
436 if [ "$netserverPid" != "" ] ; then
437 kill $netserverPid
438 fi
439 fi
440 fi
441fi
442exit
diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c
new file mode 100644
index 000000000..59f45fef5
--- /dev/null
+++ b/samples/bpf/fds_example.c
@@ -0,0 +1,193 @@
1#include <linux/unistd.h>
2#include <linux/bpf.h>
3
4#include <stdio.h>
5#include <stdlib.h>
6#include <stdint.h>
7#include <unistd.h>
8#include <string.h>
9#include <assert.h>
10#include <errno.h>
11
12#include <sys/types.h>
13#include <sys/socket.h>
14
15#include <bpf/bpf.h>
16
17#include <bpf/libbpf.h>
18#include "bpf_insn.h"
19#include "sock_example.h"
20
21#define BPF_F_PIN (1 << 0)
22#define BPF_F_GET (1 << 1)
23#define BPF_F_PIN_GET (BPF_F_PIN | BPF_F_GET)
24
25#define BPF_F_KEY (1 << 2)
26#define BPF_F_VAL (1 << 3)
27#define BPF_F_KEY_VAL (BPF_F_KEY | BPF_F_VAL)
28
29#define BPF_M_UNSPEC 0
30#define BPF_M_MAP 1
31#define BPF_M_PROG 2
32
33char bpf_log_buf[BPF_LOG_BUF_SIZE];
34
35static void usage(void)
36{
37 printf("Usage: fds_example [...]\n");
38 printf(" -F <file> File to pin/get object\n");
39 printf(" -P |- pin object\n");
40 printf(" -G `- get object\n");
41 printf(" -m eBPF map mode\n");
42 printf(" -k <key> |- map key\n");
43 printf(" -v <value> `- map value\n");
44 printf(" -p eBPF prog mode\n");
45 printf(" -o <object> `- object file\n");
46 printf(" -h Display this help.\n");
47}
48
49static int bpf_map_create(void)
50{
51 return bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(uint32_t),
52 sizeof(uint32_t), 1024, 0);
53}
54
55static int bpf_prog_create(const char *object)
56{
57 static struct bpf_insn insns[] = {
58 BPF_MOV64_IMM(BPF_REG_0, 1),
59 BPF_EXIT_INSN(),
60 };
61 size_t insns_cnt = sizeof(insns) / sizeof(struct bpf_insn);
62 struct bpf_object *obj;
63 int prog_fd;
64
65 if (object) {
66 assert(!bpf_prog_load(object, BPF_PROG_TYPE_UNSPEC,
67 &obj, &prog_fd));
68 return prog_fd;
69 } else {
70 return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER,
71 insns, insns_cnt, "GPL", 0,
72 bpf_log_buf, BPF_LOG_BUF_SIZE);
73 }
74}
75
76static int bpf_do_map(const char *file, uint32_t flags, uint32_t key,
77 uint32_t value)
78{
79 int fd, ret;
80
81 if (flags & BPF_F_PIN) {
82 fd = bpf_map_create();
83 printf("bpf: map fd:%d (%s)\n", fd, strerror(errno));
84 assert(fd > 0);
85
86 ret = bpf_obj_pin(fd, file);
87 printf("bpf: pin ret:(%d,%s)\n", ret, strerror(errno));
88 assert(ret == 0);
89 } else {
90 fd = bpf_obj_get(file);
91 printf("bpf: get fd:%d (%s)\n", fd, strerror(errno));
92 assert(fd > 0);
93 }
94
95 if ((flags & BPF_F_KEY_VAL) == BPF_F_KEY_VAL) {
96 ret = bpf_map_update_elem(fd, &key, &value, 0);
97 printf("bpf: fd:%d u->(%u:%u) ret:(%d,%s)\n", fd, key, value,
98 ret, strerror(errno));
99 assert(ret == 0);
100 } else if (flags & BPF_F_KEY) {
101 ret = bpf_map_lookup_elem(fd, &key, &value);
102 printf("bpf: fd:%d l->(%u):%u ret:(%d,%s)\n", fd, key, value,
103 ret, strerror(errno));
104 assert(ret == 0);
105 }
106
107 return 0;
108}
109
110static int bpf_do_prog(const char *file, uint32_t flags, const char *object)
111{
112 int fd, sock, ret;
113
114 if (flags & BPF_F_PIN) {
115 fd = bpf_prog_create(object);
116 printf("bpf: prog fd:%d (%s)\n", fd, strerror(errno));
117 assert(fd > 0);
118
119 ret = bpf_obj_pin(fd, file);
120 printf("bpf: pin ret:(%d,%s)\n", ret, strerror(errno));
121 assert(ret == 0);
122 } else {
123 fd = bpf_obj_get(file);
124 printf("bpf: get fd:%d (%s)\n", fd, strerror(errno));
125 assert(fd > 0);
126 }
127
128 sock = open_raw_sock("lo");
129 assert(sock > 0);
130
131 ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &fd, sizeof(fd));
132 printf("bpf: sock:%d <- fd:%d attached ret:(%d,%s)\n", sock, fd,
133 ret, strerror(errno));
134 assert(ret == 0);
135
136 return 0;
137}
138
139int main(int argc, char **argv)
140{
141 const char *file = NULL, *object = NULL;
142 uint32_t key = 0, value = 0, flags = 0;
143 int opt, mode = BPF_M_UNSPEC;
144
145 while ((opt = getopt(argc, argv, "F:PGmk:v:po:")) != -1) {
146 switch (opt) {
147 /* General args */
148 case 'F':
149 file = optarg;
150 break;
151 case 'P':
152 flags |= BPF_F_PIN;
153 break;
154 case 'G':
155 flags |= BPF_F_GET;
156 break;
157 /* Map-related args */
158 case 'm':
159 mode = BPF_M_MAP;
160 break;
161 case 'k':
162 key = strtoul(optarg, NULL, 0);
163 flags |= BPF_F_KEY;
164 break;
165 case 'v':
166 value = strtoul(optarg, NULL, 0);
167 flags |= BPF_F_VAL;
168 break;
169 /* Prog-related args */
170 case 'p':
171 mode = BPF_M_PROG;
172 break;
173 case 'o':
174 object = optarg;
175 break;
176 default:
177 goto out;
178 }
179 }
180
181 if (!(flags & BPF_F_PIN_GET) || !file)
182 goto out;
183
184 switch (mode) {
185 case BPF_M_MAP:
186 return bpf_do_map(file, flags, key, value);
187 case BPF_M_PROG:
188 return bpf_do_prog(file, flags, object);
189 }
190out:
191 usage();
192 return -1;
193}
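The sample above exercises the two primitives behind BPF object persistence: bpf_obj_pin() binds a map or program fd to a path on a bpffs mount, and bpf_obj_get() later recovers a fresh fd from that path. A minimal round-trip using the same legacy libbpf calls as the sample (a sketch; the pin path is hypothetical and assumes bpffs is mounted at /sys/fs/bpf):

	#include <assert.h>
	#include <stdint.h>
	#include <bpf/bpf.h>

	static int pin_and_reopen(void)
	{
		const char *path = "/sys/fs/bpf/example_map"; /* hypothetical path */
		int fd, fd2;

		/* same call fds_example.c uses for its array map */
		fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(uint32_t),
				    sizeof(uint32_t), 1, 0);
		assert(fd > 0);
		assert(bpf_obj_pin(fd, path) == 0); /* creates the pinned node */
		fd2 = bpf_obj_get(path);            /* new fd to the same map */
		assert(fd2 > 0);
		return fd2;
	}

The pinned node survives the creating process and stays until it is unlinked, which is what makes the separate -P (pin) and -G (get) runs of the sample possible.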
diff --git a/samples/bpf/hash_func01.h b/samples/bpf/hash_func01.h
new file mode 100644
index 000000000..38255812e
--- /dev/null
+++ b/samples/bpf/hash_func01.h
@@ -0,0 +1,55 @@
1/* SPDX-License-Identifier: LGPL-2.1
2 *
3 * Based on Paul Hsieh's (LGPL 2.1) hash function
4 * From: http://www.azillionmonkeys.com/qed/hash.html
5 */
6
7#define get16bits(d) (*((const __u16 *) (d)))
8
9static __always_inline
10__u32 SuperFastHash (const char *data, int len, __u32 initval) {
11 __u32 hash = initval;
12 __u32 tmp;
13 int rem;
14
15 if (len <= 0 || data == NULL) return 0;
16
17 rem = len & 3;
18 len >>= 2;
19
20 /* Main loop */
21#pragma clang loop unroll(full)
22 for (;len > 0; len--) {
23 hash += get16bits (data);
24 tmp = (get16bits (data+2) << 11) ^ hash;
25 hash = (hash << 16) ^ tmp;
26 data += 2*sizeof (__u16);
27 hash += hash >> 11;
28 }
29
30 /* Handle end cases */
31 switch (rem) {
32 case 3: hash += get16bits (data);
33 hash ^= hash << 16;
34 hash ^= ((signed char)data[sizeof (__u16)]) << 18;
35 hash += hash >> 11;
36 break;
37 case 2: hash += get16bits (data);
38 hash ^= hash << 11;
39 hash += hash >> 17;
40 break;
41 case 1: hash += (signed char)*data;
42 hash ^= hash << 10;
43 hash += hash >> 1;
44 }
45
46 /* Force "avalanching" of final 127 bits */
47 hash ^= hash << 3;
48 hash += hash >> 5;
49 hash ^= hash << 4;
50 hash += hash >> 17;
51 hash ^= hash << 25;
52 hash += hash >> 6;
53
54 return hash;
55}
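Since the header is plain C apart from the clang unroll pragma, the interface can be smoke-tested from user space. A hypothetical harness (assumptions: __u16/__u32 come from linux/types.h, and __always_inline must be supplied outside BPF):

	#include <linux/types.h>
	#ifndef __always_inline
	#define __always_inline inline __attribute__((always_inline))
	#endif
	#include <stdio.h>
	#include "hash_func01.h"

	int main(void)
	{
		/* 11-byte key; initval seeds the hash */
		__u32 h = SuperFastHash("example key", 11, 0xdeadbeef);

		printf("hash = 0x%08x, bucket = %u\n", h, h % 1024);
		return 0;
	}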
diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c
new file mode 100644
index 000000000..ff4c533df
--- /dev/null
+++ b/samples/bpf/hbm.c
@@ -0,0 +1,499 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (c) 2019 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * Example program for Host Bandwidth Management
9 *
10 * This program loads a cgroup skb BPF program to enforce cgroup output
11 * (egress) or input (ingress) bandwidth limits.
12 *
13 * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog]
14 * Where:
15 * -d Print BPF trace debug buffer
16 * -l Also limit flows doing loopback
17 * -n <#> To create cgroup \"/hbm#\" and attach prog
18 * Default is /hbm1
19 * --no_cn Do not return cn notifications
20 * -r <rate> Rate limit in Mbps
21 * -s Get HBM stats (marked, dropped, etc.)
22 * -t <time> Exit after specified seconds (default is 1)
23 * -w Work conserving flag. cgroup can increase its bandwidth
24 * beyond the rate limit specified while there is available
25 * bandwidth. Current implementation assumes there is only one
26 * NIC (eth0), but can be extended to support multiple NICs.
27 * Currently only supported for egress.
28 * -h Print this info
29 * prog BPF program file name. Name defaults to hbm_out_kern.o
30 */
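/* Example invocation (a sketch using the flags above):
 *
 *	./hbm -n 2 -r 500 -t 30 -s
 *
 * creates cgroup /hbm2, attaches the default hbm_out_kern.o to limit
 * its egress to roughly 500 Mbps for 30 seconds, and leaves the
 * collected stats in hbm.2.out.
 */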
31
32#define _GNU_SOURCE
33
34#include <stdio.h>
35#include <stdlib.h>
36#include <assert.h>
37#include <sys/resource.h>
38#include <sys/time.h>
39#include <unistd.h>
40#include <errno.h>
41#include <fcntl.h>
42#include <linux/unistd.h>
43#include <linux/compiler.h>
44
45#include <linux/bpf.h>
46#include <bpf/bpf.h>
47#include <getopt.h>
48
49#include "bpf_load.h"
50#include "bpf_rlimit.h"
51#include "cgroup_helpers.h"
52#include "hbm.h"
53#include "bpf_util.h"
54#include <bpf/bpf.h>
55#include <bpf/libbpf.h>
56
57bool outFlag = true;
58int minRate = 1000; /* cgroup rate limit in Mbps */
59int rate = 1000; /* can grow if rate conserving is enabled */
60int dur = 1;
61bool stats_flag;
62bool loopback_flag;
63bool debugFlag;
64bool work_conserving_flag;
65bool no_cn_flag;
66bool edt_flag;
67
68static void Usage(void);
69static void read_trace_pipe2(void);
70static void do_error(char *msg, bool errno_flag);
71
72#define DEBUGFS "/sys/kernel/debug/tracing/"
73
74struct bpf_object *obj;
75int bpfprog_fd;
76int cgroup_storage_fd;
77
78static void read_trace_pipe2(void)
79{
80 int trace_fd;
81 FILE *outf;
82 char *outFname = "hbm_out.log";
83
84 trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
85 if (trace_fd < 0) {
86 printf("Error opening trace_pipe\n");
87 return;
88 }
89
90// Future support of ingress
91// if (!outFlag)
92// outFname = "hbm_in.log";
93 outf = fopen(outFname, "w");
94
95 if (outf == NULL)
96 printf("Error creating %s\n", outFname);
97
98 while (1) {
99 static char buf[4097];
100 ssize_t sz;
101
102 sz = read(trace_fd, buf, sizeof(buf) - 1);
103 if (sz > 0) {
104 buf[sz] = 0;
105 puts(buf);
106 if (outf != NULL) {
107 fprintf(outf, "%s\n", buf);
108 fflush(outf);
109 }
110 }
111 }
112}
113
114static void do_error(char *msg, bool errno_flag)
115{
116 if (errno_flag)
117 printf("ERROR: %s, errno: %d\n", msg, errno);
118 else
119 printf("ERROR: %s\n", msg);
120 exit(1);
121}
122
123static int prog_load(char *prog)
124{
125 struct bpf_prog_load_attr prog_load_attr = {
126 .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
127 .file = prog,
128 .expected_attach_type = BPF_CGROUP_INET_EGRESS,
129 };
130 int map_fd;
131 struct bpf_map *map;
132
133 int ret = 0;
134
135 if (access(prog, R_OK) < 0) {
136 printf("Error accessing file %s: %s\n", prog, strerror(errno));
137 return 1;
138 }
139 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd))
140 ret = 1;
141 if (!ret) {
142 map = bpf_object__find_map_by_name(obj, "queue_stats");
143 map_fd = bpf_map__fd(map);
144 if (map_fd < 0) {
145 printf("Map not found: %s\n", strerror(map_fd));
146 ret = 1;
147 }
148 }
149
150 if (ret) {
151 printf("ERROR: bpf_prog_load_xattr failed for: %s\n", prog);
152 printf(" Output from verifier:\n%s\n------\n", bpf_log_buf);
153 ret = -1;
154 } else {
155 ret = map_fd;
156 }
157
158 return ret;
159}
160
161static int run_bpf_prog(char *prog, int cg_id)
162{
163 int map_fd;
164 int rc = 0;
165 int key = 0;
166 int cg1 = 0;
167 int type = BPF_CGROUP_INET_EGRESS;
168 char cg_dir[100];
169 struct hbm_queue_stats qstats = {0};
170
171 sprintf(cg_dir, "/hbm%d", cg_id);
172 map_fd = prog_load(prog);
173 if (map_fd == -1)
174 return 1;
175
176 if (setup_cgroup_environment()) {
177 printf("ERROR: setting cgroup environment\n");
178 goto err;
179 }
180 cg1 = create_and_get_cgroup(cg_dir);
181 if (!cg1) {
182 printf("ERROR: create_and_get_cgroup\n");
183 goto err;
184 }
185 if (join_cgroup(cg_dir)) {
186 printf("ERROR: join_cgroup\n");
187 goto err;
188 }
189
190 qstats.rate = rate;
191 qstats.stats = stats_flag ? 1 : 0;
192 qstats.loopback = loopback_flag ? 1 : 0;
193 qstats.no_cn = no_cn_flag ? 1 : 0;
194 if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) {
195 printf("ERROR: Could not update map element\n");
196 goto err;
197 }
198
199 if (!outFlag)
200 type = BPF_CGROUP_INET_INGRESS;
201 if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) {
202 printf("ERROR: bpf_prog_attach fails!\n");
203 log_err("Attaching prog");
204 goto err;
205 }
206
207 if (work_conserving_flag) {
208 struct timeval t0, t_last, t_new;
209 FILE *fin;
210 unsigned long long last_eth_tx_bytes, new_eth_tx_bytes;
211 signed long long last_cg_tx_bytes, new_cg_tx_bytes;
212 signed long long delta_time, delta_bytes, delta_rate;
213 int delta_ms;
214#define DELTA_RATE_CHECK 10000 /* in us */
215#define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */
216
217 bpf_map_lookup_elem(map_fd, &key, &qstats);
218 if (gettimeofday(&t0, NULL) < 0)
219 do_error("gettimeofday failed", true);
220 t_last = t0;
221 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r");
222 if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1)
223 do_error("fscanf fails", false);
224 fclose(fin);
225 last_cg_tx_bytes = qstats.bytes_total;
226 while (true) {
227 usleep(DELTA_RATE_CHECK);
228 if (gettimeofday(&t_new, NULL) < 0)
229 do_error("gettimeofday failed", true);
230 delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 +
231 (t_new.tv_usec - t0.tv_usec)/1000;
232 if (delta_ms > dur * 1000)
233 break;
234 delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 +
235 (t_new.tv_usec - t_last.tv_usec);
236 if (delta_time == 0)
237 continue;
238 t_last = t_new;
239 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes",
240 "r");
241 if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1)
242 do_error("fscanf fails", false);
243 fclose(fin);
244 printf(" new_eth_tx_bytes:%llu\n",
245 new_eth_tx_bytes);
246 bpf_map_lookup_elem(map_fd, &key, &qstats);
247 new_cg_tx_bytes = qstats.bytes_total;
248 delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes;
249 last_eth_tx_bytes = new_eth_tx_bytes;
250 delta_rate = (delta_bytes * 8000000) / delta_time;
251 printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps",
252 delta_ms, delta_rate/1000000000.0,
253 rate/1000.0);
254 if (delta_rate < RATE_THRESHOLD) {
255 /* can increase cgroup rate limit, but first
256 * check if we are using the current limit.
257 * Currently increasing by 6.25%, unknown
258 * if that is the optimal rate.
259 */
260 int rate_diff100;
261
262 delta_bytes = new_cg_tx_bytes -
263 last_cg_tx_bytes;
264 last_cg_tx_bytes = new_cg_tx_bytes;
265 delta_rate = (delta_bytes * 8000000) /
266 delta_time;
267 printf(" rate:%.3fGbps",
268 delta_rate/1000000000.0);
269 rate_diff100 = (((long long)rate)*1000000 -
270 delta_rate) * 100 /
271 (((long long) rate) * 1000000);
272 printf(" rdiff:%d", rate_diff100);
273 if (rate_diff100 <= 3) {
274 rate += (rate >> 4);
275 if (rate > RATE_THRESHOLD / 1000000)
276 rate = RATE_THRESHOLD / 1000000;
277 qstats.rate = rate;
278 printf(" INC\n");
279 } else {
280 printf("\n");
281 }
282 } else {
283 /* Need to decrease cgroup rate limit.
284 * Currently decreasing by 12.5%, unknown
285 * if that is optimal
286 */
287 printf(" DEC\n");
288 rate -= (rate >> 3);
289 if (rate < minRate)
290 rate = minRate;
291 qstats.rate = rate;
292 }
293 if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY))
294 do_error("update map element fails", false);
295 }
296 } else {
297 sleep(dur);
298 }
299 // Get stats!
300 if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) {
301 char fname[100];
302 FILE *fout;
303
304 if (!outFlag)
305 sprintf(fname, "hbm.%d.in", cg_id);
306 else
307 sprintf(fname, "hbm.%d.out", cg_id);
308 fout = fopen(fname, "w");
309 fprintf(fout, "id:%d\n", cg_id);
310 fprintf(fout, "ERROR: Could not lookup queue_stats\n");
311 } else if (stats_flag && qstats.lastPacketTime >
312 qstats.firstPacketTime) {
313 long long delta_us = (qstats.lastPacketTime -
314 qstats.firstPacketTime)/1000;
315 unsigned int rate_mbps = ((qstats.bytes_total -
316 qstats.bytes_dropped) * 8 /
317 delta_us);
318 double percent_pkts, percent_bytes;
319 char fname[100];
320 FILE *fout;
321 int k;
322 static const char *returnValNames[] = {
323 "DROP_PKT",
324 "ALLOW_PKT",
325 "DROP_PKT_CWR",
326 "ALLOW_PKT_CWR"
327 };
328#define RET_VAL_COUNT 4
329
330// Future support of ingress
331// if (!outFlag)
332// sprintf(fname, "hbm.%d.in", cg_id);
333// else
334 sprintf(fname, "hbm.%d.out", cg_id);
335 fout = fopen(fname, "w");
336 fprintf(fout, "id:%d\n", cg_id);
337 fprintf(fout, "rate_mbps:%d\n", rate_mbps);
338 fprintf(fout, "duration:%.1f secs\n",
339 (qstats.lastPacketTime - qstats.firstPacketTime) /
340 1000000000.0);
341 fprintf(fout, "packets:%d\n", (int)qstats.pkts_total);
342 fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total /
343 1000000));
344 fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped);
345 fprintf(fout, "bytes_dropped_MB:%d\n",
346 (int)(qstats.bytes_dropped /
347 1000000));
348 // Marked Pkts and Bytes
349 percent_pkts = (qstats.pkts_marked * 100.0) /
350 (qstats.pkts_total + 1);
351 percent_bytes = (qstats.bytes_marked * 100.0) /
352 (qstats.bytes_total + 1);
353 fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts);
354 fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes);
355
356 // Dropped Pkts and Bytes
357 percent_pkts = (qstats.pkts_dropped * 100.0) /
358 (qstats.pkts_total + 1);
359 percent_bytes = (qstats.bytes_dropped * 100.0) /
360 (qstats.bytes_total + 1);
361 fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts);
362 fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes);
363
364 // ECN CE markings
365 percent_pkts = (qstats.pkts_ecn_ce * 100.0) /
366 (qstats.pkts_total + 1);
367 fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts,
368 (int)qstats.pkts_ecn_ce);
369
370 // Average cwnd
371 fprintf(fout, "avg cwnd:%d\n",
372 (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1)));
373 // Average rtt
374 fprintf(fout, "avg rtt:%d\n",
375 (int)(qstats.sum_rtt / (qstats.pkts_total + 1)));
376 // Average credit
377 if (edt_flag)
378 fprintf(fout, "avg credit_ms:%.03f\n",
379 (qstats.sum_credit /
380 (qstats.pkts_total + 1.0)) / 1000000.0);
381 else
382 fprintf(fout, "avg credit:%d\n",
383 (int)(qstats.sum_credit /
384 (1500 * ((int)qstats.pkts_total ) + 1)));
385
386 // Return values stats
387 for (k = 0; k < RET_VAL_COUNT; k++) {
388 percent_pkts = (qstats.returnValCount[k] * 100.0) /
389 (qstats.pkts_total + 1);
390 fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k],
391 percent_pkts, (int)qstats.returnValCount[k]);
392 }
393 fclose(fout);
394 }
395
396 if (debugFlag)
397 read_trace_pipe2();
398 return rc;
399err:
400 rc = 1;
401
402 if (cg1)
403 close(cg1);
404 cleanup_cgroup_environment();
405
406 return rc;
407}
408
409static void Usage(void)
410{
411 printf("This program loads a cgroup skb BPF program to enforce\n"
412 "cgroup output (egress) bandwidth limits.\n\n"
413 "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n"
414 " [-s] [-t <secs>] [-w] [-h] [prog]\n"
415 " Where:\n"
416 " -o indicates egress direction (default)\n"
417 " -d print BPF trace debug buffer\n"
418 " --edt use fq's Earliest Departure Time\n"
419 " -l also limit flows using loopback\n"
420 " -n <#> to create cgroup \"/hbm#\" and attach prog\n"
421 " Default is /hbm1\n"
422 " --no_cn disable CN notifications\n"
423 " -r <rate> Rate in Mbps\n"
424 " -s Update HBM stats\n"
425 " -t <time> Exit after specified seconds (default is 0)\n"
426 " -w Work conserving flag. cgroup can increase\n"
427 " bandwidth beyond the rate limit specified\n"
428 " while there is available bandwidth. Current\n"
429 " implementation assumes there is only eth0\n"
430 " but can be extended to support multiple NICs\n"
431 " -h print this info\n"
432 " prog BPF program file name. Name defaults to\n"
433 " hbm_out_kern.o\n");
434}
435
436int main(int argc, char **argv)
437{
438 char *prog = "hbm_out_kern.o";
439 int k;
440 int cg_id = 1;
441 char *optstring = "iodln:r:st:wh";
442 struct option loptions[] = {
443 {"no_cn", 0, NULL, 1},
444 {"edt", 0, NULL, 2},
445 {NULL, 0, NULL, 0}
446 };
447
448 while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) {
449 switch (k) {
450 case 1:
451 no_cn_flag = true;
452 break;
453 case 2:
454 prog = "hbm_edt_kern.o";
455 edt_flag = true;
456 break;
457 case 'o':
458 break;
459 case 'd':
460 debugFlag = true;
461 break;
462 case 'l':
463 loopback_flag = true;
464 break;
465 case 'n':
466 cg_id = atoi(optarg);
467 break;
468 case 'r':
469 minRate = atoi(optarg) * 1.024;
470 rate = minRate;
471 break;
472 case 's':
473 stats_flag = true;
474 break;
475 case 't':
476 dur = atoi(optarg);
477 break;
478 case 'w':
479 work_conserving_flag = true;
480 break;
481 case '?':
482 if (optopt == 'n' || optopt == 'r' || optopt == 't')
483 fprintf(stderr,
484 "Option -%c requires an argument.\n\n",
485 optopt);
486 case 'h':
487 __fallthrough;
488 default:
489 Usage();
490 return 0;
491 }
492 }
493
494 if (optind < argc)
495 prog = argv[optind];
496 printf("HBM prog: %s\n", prog != NULL ? prog : "NULL");
497
498 return run_bpf_prog(prog, cg_id);
499}
diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h
new file mode 100644
index 000000000..f0963ed6a
--- /dev/null
+++ b/samples/bpf/hbm.h
@@ -0,0 +1,38 @@
1/* SPDX-License-Identifier: GPL-2.0
2 *
3 * Copyright (c) 2019 Facebook
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of version 2 of the GNU General Public
7 * License as published by the Free Software Foundation.
8 *
9 * Include file for Host Bandwidth Management (HBM) programs
10 */
11struct hbm_vqueue {
12 struct bpf_spin_lock lock;
13 /* 4 byte hole */
14 unsigned long long lasttime; /* In ns */
15 int credit; /* In bytes */
16 unsigned int rate; /* In bytes per NS << 20 */
17};
18
19struct hbm_queue_stats {
20 unsigned long rate; /* in Mbps*/
21 unsigned long stats:1, /* get HBM stats (marked, dropped,..) */
22 loopback:1, /* also limit flows using loopback */
23 no_cn:1; /* do not use cn flags */
24 unsigned long long pkts_marked;
25 unsigned long long bytes_marked;
26 unsigned long long pkts_dropped;
27 unsigned long long bytes_dropped;
28 unsigned long long pkts_total;
29 unsigned long long bytes_total;
30 unsigned long long firstPacketTime;
31 unsigned long long lastPacketTime;
32 unsigned long long pkts_ecn_ce;
33 unsigned long long returnValCount[4];
34 unsigned long long sum_cwnd;
35 unsigned long long sum_rtt;
36 unsigned long long sum_cwnd_cnt;
37 long long sum_credit;
38};
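Both sides share this layout through the one-entry "queue_stats" array map. A minimal sketch of the user-space read path, mirroring what hbm.c does (assumption: map_fd was obtained via bpf_object__find_map_by_name()/bpf_map__fd() as in prog_load()):

	#include <stdio.h>
	#include <bpf/bpf.h>
	#include "hbm.h"

	static void print_hbm_stats(int map_fd)
	{
		struct hbm_queue_stats qstats = {0};
		int key = 0;	/* the map has a single entry */

		if (bpf_map_lookup_elem(map_fd, &key, &qstats) == 0)
			printf("bytes_total:%llu pkts_dropped:%llu\n",
			       qstats.bytes_total, qstats.pkts_dropped);
	}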
diff --git a/samples/bpf/hbm_edt_kern.c b/samples/bpf/hbm_edt_kern.c
new file mode 100644
index 000000000..a65b677ac
--- /dev/null
+++ b/samples/bpf/hbm_edt_kern.c
@@ -0,0 +1,168 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (c) 2019 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * Sample Host Bandwidth Manager (HBM) BPF program.
9 *
10 * A cgroup skb BPF egress program to limit cgroup output bandwidth.
11 * It uses a modified virtual token bucket queue to limit average
12 * egress bandwidth. The implementation uses credits instead of tokens.
13 * Negative credits imply that queueing would have happened (this is
14 * a virtual queue, so no queueing is done by it). However, queueing may
15 * occur at the actual qdisc (which is not used for rate limiting).
16 *
17 * This implementation uses 3 thresholds, one to start marking packets and
18 * the other two to drop packets:
19 * CREDIT
20 * - <--------------------------|------------------------> +
21 * | | | 0
22 * | Large pkt |
23 * | drop thresh |
24 * Small pkt drop Mark threshold
25 * thresh
26 *
27 * The effect of marking depends on the type of packet:
28 * a) If the packet is ECN enabled and it is a TCP packet, then the packet
29 * is ECN marked.
30 * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
31 * to reduce the congestion window. The current implementation uses a linear
32 * distribution (0% probability at marking threshold, 100% probability
33 * at drop threshold).
34 * c) If the packet is not a TCP packet, then it is dropped.
35 *
36 * If the credit is below the drop threshold, the packet is dropped. If it
37 * is a TCP packet, then it also calls tcp_cwr since packets dropped
38 * by a cgroup skb BPF program do not automatically trigger a call to
39 * tcp_cwr in the current kernel code.
40 *
41 * This BPF program actually uses 2 drop thresholds, one threshold
42 * for larger packets (>= 120 bytes) and another for smaller packets. This
43 * protects smaller packets such as SYNs, ACKs, etc.
44 *
45 * The default bandwidth limit is set at 1Gbps but this can be changed by
46 * a user program through a shared BPF map. In addition, by default this BPF
47 * program does not limit connections using loopback. This behavior can be
48 * overwritten by the user program. There is also an option to calculate
49 * some statistics, such as percent of packets marked or dropped, which
50 * a user program, such as hbm, can access.
51 */
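/* Worked example with the constants from hbm_kern.h: each packet
 * advances the virtual clock by BYTES_TO_NS(len, rate), so at 1 Gbps
 * a 1500 byte packet adds 12 us, and skb->tstamp tells fq when to
 * release it.  Packets larger than LARGE_PKT_THRESH (120 bytes) are
 * dropped once the clock runs 480 us (LARGE_PKT_DROP_THRESH_NS) ahead
 * of now, everything is dropped at 500 us (DROP_THRESH_NS), and
 * between 50 us (MARK_THRESH_NS) and the drop threshold packets are
 * marked/cwr'd with linearly increasing probability.
 */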
52
53#include "hbm_kern.h"
54
55SEC("cgroup_skb/egress")
56int _hbm_out_cg(struct __sk_buff *skb)
57{
58 long long delta = 0, delta_send;
59 unsigned long long curtime, sendtime;
60 struct hbm_queue_stats *qsp = NULL;
61 unsigned int queue_index = 0;
62 bool congestion_flag = false;
63 bool ecn_ce_flag = false;
64 struct hbm_pkt_info pkti = {};
65 struct hbm_vqueue *qdp;
66 bool drop_flag = false;
67 bool cwr_flag = false;
68 int len = skb->len;
69 int rv = ALLOW_PKT;
70
71 qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
72
73 // Check if we should ignore loopback traffic
74 if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
75 return ALLOW_PKT;
76
77 hbm_get_pkt_info(skb, &pkti);
78
79 // We may want to account for the length of headers in len
80 // calculation, like ETH header + overhead, specially if it
81 // is a gso packet. But I am not doing it right now.
82
83 qdp = bpf_get_local_storage(&queue_state, 0);
84 if (!qdp)
85 return ALLOW_PKT;
86 if (qdp->lasttime == 0)
87 hbm_init_edt_vqueue(qdp, 1024);
88
89 curtime = bpf_ktime_get_ns();
90
91 // Begin critical section
92 bpf_spin_lock(&qdp->lock);
93 delta = qdp->lasttime - curtime;
94 // bound bursts to 100us
95 if (delta < -BURST_SIZE_NS) {
96 // negative delta is a credit that allows bursts
97 qdp->lasttime = curtime - BURST_SIZE_NS;
98 delta = -BURST_SIZE_NS;
99 }
100 sendtime = qdp->lasttime;
101 delta_send = BYTES_TO_NS(len, qdp->rate);
102 __sync_add_and_fetch(&(qdp->lasttime), delta_send);
103 bpf_spin_unlock(&qdp->lock);
104 // End critical section
105
106 // Set EDT of packet
107 skb->tstamp = sendtime;
108
109 // Check if we should update rate
110 if (qsp != NULL && (qsp->rate * 128) != qdp->rate)
111 qdp->rate = qsp->rate * 128;
112
113 // Set flags (drop, congestion, cwr)
114 // last packet will be sent in the future, bound latency
115 if (delta > DROP_THRESH_NS || (delta > LARGE_PKT_DROP_THRESH_NS &&
116 len > LARGE_PKT_THRESH)) {
117 drop_flag = true;
118 if (pkti.is_tcp && pkti.ecn == 0)
119 cwr_flag = true;
120 } else if (delta > MARK_THRESH_NS) {
121 if (pkti.is_tcp)
122 congestion_flag = true;
123 else
124 drop_flag = true;
125 }
126
127 if (congestion_flag) {
128 if (bpf_skb_ecn_set_ce(skb)) {
129 ecn_ce_flag = true;
130 } else {
131 if (pkti.is_tcp) {
132 unsigned int rand = bpf_get_prandom_u32();
133
134 if (delta >= MARK_THRESH_NS +
135 (rand % MARK_REGION_SIZE_NS)) {
136 // Do congestion control
137 cwr_flag = true;
138 }
139 } else if (len > LARGE_PKT_THRESH) {
140 // Problem if too many small packets?
141 drop_flag = true;
142 congestion_flag = false;
143 }
144 }
145 }
146
147 if (pkti.is_tcp && drop_flag && pkti.packets_out <= 1) {
148 drop_flag = false;
149 cwr_flag = true;
150 congestion_flag = false;
151 }
152
153 if (qsp != NULL && qsp->no_cn)
154 cwr_flag = false;
155
156 hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag,
157 cwr_flag, ecn_ce_flag, &pkti, (int) delta);
158
159 if (drop_flag) {
160 __sync_add_and_fetch(&(qdp->lasttime), -delta_send);
161 rv = DROP_PKT;
162 }
163
164 if (cwr_flag)
165 rv |= CWR;
166 return rv;
167}
168char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h
new file mode 100644
index 000000000..e00f26f6a
--- /dev/null
+++ b/samples/bpf/hbm_kern.h
@@ -0,0 +1,217 @@
1/* SPDX-License-Identifier: GPL-2.0
2 *
3 * Copyright (c) 2019 Facebook
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of version 2 of the GNU General Public
7 * License as published by the Free Software Foundation.
8 *
9 * Include file for sample Host Bandwidth Manager (HBM) BPF programs
10 */
11#define KBUILD_MODNAME "foo"
12#include <stddef.h>
13#include <stdbool.h>
14#include <uapi/linux/bpf.h>
15#include <uapi/linux/if_ether.h>
16#include <uapi/linux/if_packet.h>
17#include <uapi/linux/ip.h>
18#include <uapi/linux/ipv6.h>
19#include <uapi/linux/in.h>
20#include <uapi/linux/tcp.h>
21#include <uapi/linux/filter.h>
22#include <uapi/linux/pkt_cls.h>
23#include <net/ipv6.h>
24#include <net/inet_ecn.h>
25#include <bpf/bpf_endian.h>
26#include <bpf/bpf_helpers.h>
27#include "hbm.h"
28
29#define DROP_PKT 0
30#define ALLOW_PKT 1
31#define TCP_ECN_OK 1
32#define CWR 2
33
34#ifndef HBM_DEBUG // Define HBM_DEBUG to enable debugging
35#undef bpf_printk
36#define bpf_printk(fmt, ...)
37#endif
38
39#define INITIAL_CREDIT_PACKETS 100
40#define MAX_BYTES_PER_PACKET 1500
41#define MARK_THRESH (40 * MAX_BYTES_PER_PACKET)
42#define DROP_THRESH (80 * 5 * MAX_BYTES_PER_PACKET)
43#define LARGE_PKT_DROP_THRESH (DROP_THRESH - (15 * MAX_BYTES_PER_PACKET))
44#define MARK_REGION_SIZE (LARGE_PKT_DROP_THRESH - MARK_THRESH)
45#define LARGE_PKT_THRESH 120
46#define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET)
47#define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET)
48
49// Time base accounting for fq's EDT
50#define BURST_SIZE_NS 100000 // 100us
51#define MARK_THRESH_NS 50000 // 50us
52#define DROP_THRESH_NS 500000 // 500us
53// Reserve 20us of queuing for small packets (less than 120 bytes)
54#define LARGE_PKT_DROP_THRESH_NS (DROP_THRESH_NS - 20000)
55#define MARK_REGION_SIZE_NS (LARGE_PKT_DROP_THRESH_NS - MARK_THRESH_NS)
56
57// rate in bytes per ns << 20
58#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20)
59#define BYTES_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20)
60#define BYTES_TO_NS(bytes, rate) div64_u64(((u64)(bytes)) << 20, (u64)(rate))
61
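/* Worked example of the encoding: the programs store qsp->rate * 128,
 * and user space (hbm.c) pre-multiplies the Mbps value by 1.024, so a
 * nominal 1000 Mbps becomes 1000 * 1.024 * 128 = 131072.  Then
 * CREDIT_PER_NS(1000, 131072) = (1000 * 131072) >> 20 = 125 bytes per
 * 1000 ns, i.e. exactly 1 Gbps (125 bytes/us); the 1.024 factor
 * cancels the 2^20-versus-10^6 mismatch.
 */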
62struct {
63 __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
64 __type(key, struct bpf_cgroup_storage_key);
65 __type(value, struct hbm_vqueue);
66} queue_state SEC(".maps");
67
68struct {
69 __uint(type, BPF_MAP_TYPE_ARRAY);
70 __uint(max_entries, 1);
71 __type(key, u32);
72 __type(value, struct hbm_queue_stats);
73} queue_stats SEC(".maps");
74
75struct hbm_pkt_info {
76 int cwnd;
77 int rtt;
78 int packets_out;
79 bool is_ip;
80 bool is_tcp;
81 short ecn;
82};
83
84static int get_tcp_info(struct __sk_buff *skb, struct hbm_pkt_info *pkti)
85{
86 struct bpf_sock *sk;
87 struct bpf_tcp_sock *tp;
88
89 sk = skb->sk;
90 if (sk) {
91 sk = bpf_sk_fullsock(sk);
92 if (sk) {
93 if (sk->protocol == IPPROTO_TCP) {
94 tp = bpf_tcp_sock(sk);
95 if (tp) {
96 pkti->cwnd = tp->snd_cwnd;
97 pkti->rtt = tp->srtt_us >> 3;
98 pkti->packets_out = tp->packets_out;
99 return 0;
100 }
101 }
102 }
103 }
104 pkti->cwnd = 0;
105 pkti->rtt = 0;
106 pkti->packets_out = 0;
107 return 1;
108}
109
110static void hbm_get_pkt_info(struct __sk_buff *skb,
111 struct hbm_pkt_info *pkti)
112{
113 struct iphdr iph;
114 struct ipv6hdr *ip6h;
115
116 pkti->cwnd = 0;
117 pkti->rtt = 0;
118 bpf_skb_load_bytes(skb, 0, &iph, 12);
119 if (iph.version == 6) {
120 ip6h = (struct ipv6hdr *)&iph;
121 pkti->is_ip = true;
122 pkti->is_tcp = (ip6h->nexthdr == 6);
123 pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK;
124 } else if (iph.version == 4) {
125 pkti->is_ip = true;
126 pkti->is_tcp = (iph.protocol == 6);
127 pkti->ecn = iph.tos & INET_ECN_MASK;
128 } else {
129 pkti->is_ip = false;
130 pkti->is_tcp = false;
131 pkti->ecn = 0;
132 }
133 if (pkti->is_tcp)
134 get_tcp_info(skb, pkti);
135}
136
137static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate)
138{
139 bpf_printk("Initializing queue_state, rate:%d\n", rate * 128);
140 qdp->lasttime = bpf_ktime_get_ns();
141 qdp->credit = INIT_CREDIT;
142 qdp->rate = rate * 128;
143}
144
145static __always_inline void hbm_init_edt_vqueue(struct hbm_vqueue *qdp,
146 int rate)
147{
148 unsigned long long curtime;
149
150 curtime = bpf_ktime_get_ns();
151 bpf_printk("Initializing queue_state, rate:%d\n", rate * 128);
152 qdp->lasttime = curtime - BURST_SIZE_NS; // support initial burst
153 qdp->credit = 0; // not used
154 qdp->rate = rate * 128;
155}
156
157static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
158 int len,
159 unsigned long long curtime,
160 bool congestion_flag,
161 bool drop_flag,
162 bool cwr_flag,
163 bool ecn_ce_flag,
164 struct hbm_pkt_info *pkti,
165 int credit)
166{
167 int rv = ALLOW_PKT;
168
169 if (qsp != NULL) {
170 // Following is needed for work conserving
171 __sync_add_and_fetch(&(qsp->bytes_total), len);
172 if (qsp->stats) {
173 // Optionally update statistics
174 if (qsp->firstPacketTime == 0)
175 qsp->firstPacketTime = curtime;
176 qsp->lastPacketTime = curtime;
177 __sync_add_and_fetch(&(qsp->pkts_total), 1);
178 if (congestion_flag) {
179 __sync_add_and_fetch(&(qsp->pkts_marked), 1);
180 __sync_add_and_fetch(&(qsp->bytes_marked), len);
181 }
182 if (drop_flag) {
183 __sync_add_and_fetch(&(qsp->pkts_dropped), 1);
184 __sync_add_and_fetch(&(qsp->bytes_dropped),
185 len);
186 }
187 if (ecn_ce_flag)
188 __sync_add_and_fetch(&(qsp->pkts_ecn_ce), 1);
189 if (pkti->cwnd) {
190 __sync_add_and_fetch(&(qsp->sum_cwnd),
191 pkti->cwnd);
192 __sync_add_and_fetch(&(qsp->sum_cwnd_cnt), 1);
193 }
194 if (pkti->rtt)
195 __sync_add_and_fetch(&(qsp->sum_rtt),
196 pkti->rtt);
197 __sync_add_and_fetch(&(qsp->sum_credit), credit);
198
199 if (drop_flag)
200 rv = DROP_PKT;
201 if (cwr_flag)
202 rv |= 2;
203 if (rv == DROP_PKT)
204 __sync_add_and_fetch(&(qsp->returnValCount[0]),
205 1);
206 else if (rv == ALLOW_PKT)
207 __sync_add_and_fetch(&(qsp->returnValCount[1]),
208 1);
209 else if (rv == 2)
210 __sync_add_and_fetch(&(qsp->returnValCount[2]),
211 1);
212 else if (rv == 3)
213 __sync_add_and_fetch(&(qsp->returnValCount[3]),
214 1);
215 }
216 }
217}
diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c
new file mode 100644
index 000000000..829934bd4
--- /dev/null
+++ b/samples/bpf/hbm_out_kern.c
@@ -0,0 +1,179 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (c) 2019 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * Sample Host Bandwidth Manager (HBM) BPF program.
9 *
10 * A cgroup skb BPF egress program to limit cgroup output bandwidth.
11 * It uses a modified virtual token bucket queue to limit average
12 * egress bandwidth. The implementation uses credits instead of tokens.
13 * Negative credits imply that queueing would have happened (this is
14 * a virtual queue, so no queueing is done by it). However, queueing may
15 * occur at the actual qdisc (which is not used for rate limiting).
16 *
17 * This implementation uses 3 thresholds, one to start marking packets and
18 * the other two to drop packets:
19 * CREDIT
20 * - <--------------------------|------------------------> +
21 * | | | 0
22 * | Large pkt |
23 * | drop thresh |
24 * Small pkt drop Mark threshold
25 * thresh
26 *
27 * The effect of marking depends on the type of packet:
28 * a) If the packet is ECN enabled and it is a TCP packet, then the packet
29 * is ECN marked.
30 * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
31 * to reduce the congestion window. The current implementation uses a linear
32 * distribution (0% probability at marking threshold, 100% probability
33 * at drop threshold).
34 * c) If the packet is not a TCP packet, then it is dropped.
35 *
36 * If the credit is below the drop threshold, the packet is dropped. If it
37 * is a TCP packet, then it also calls tcp_cwr since packets dropped
38 * by a cgroup skb BPF program do not automatically trigger a call to
39 * tcp_cwr in the current kernel code.
40 *
41 * This BPF program actually uses 2 drop thresholds, one threshold
42 * for larger packets (>= 120 bytes) and another for smaller packets. This
43 * protects smaller packets such as SYNs, ACKs, etc.
44 *
45 * The default bandwidth limit is set at 1Gbps but this can be changed by
46 * a user program through a shared BPF map. In addition, by default this BPF
47 * program does not limit connections using loopback. This behavior can be
48 * overwritten by the user program. There is also an option to calculate
49 * some statistics, such as percent of packets marked or dropped, which
50 * the user program can access.
51 *
52 * A later patch provides such a program (hbm.c)
53 */
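/* The "linear distribution" above falls out of the cwr test used
 * below: with rand uniform over [0, MARK_REGION_SIZE), the condition
 *	-credit >= MARK_THRESH + (rand % MARK_REGION_SIZE)
 * fires with probability (-credit - MARK_THRESH) / MARK_REGION_SIZE,
 * i.e. 0% when the credit sits at the mark threshold and 100% at the
 * large-packet drop threshold.
 */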
54
55#include "hbm_kern.h"
56
57SEC("cgroup_skb/egress")
58int _hbm_out_cg(struct __sk_buff *skb)
59{
60 struct hbm_pkt_info pkti;
61 int len = skb->len;
62 unsigned int queue_index = 0;
63 unsigned long long curtime;
64 int credit;
65 signed long long delta = 0, new_credit;
66 int max_credit = MAX_CREDIT;
67 bool congestion_flag = false;
68 bool drop_flag = false;
69 bool cwr_flag = false;
70 bool ecn_ce_flag = false;
71 struct hbm_vqueue *qdp;
72 struct hbm_queue_stats *qsp = NULL;
73 int rv = ALLOW_PKT;
74
75 qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
76 if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
77 return ALLOW_PKT;
78
79 hbm_get_pkt_info(skb, &pkti);
80
81 // We may want to account for the length of headers in len
82 // calculation, like ETH header + overhead, specially if it
83 // is a gso packet. But I am not doing it right now.
84
85 qdp = bpf_get_local_storage(&queue_state, 0);
86 if (!qdp)
87 return ALLOW_PKT;
88 else if (qdp->lasttime == 0)
89 hbm_init_vqueue(qdp, 1024);
90
91 curtime = bpf_ktime_get_ns();
92
93 // Begin critical section
94 bpf_spin_lock(&qdp->lock);
95 credit = qdp->credit;
96 delta = curtime - qdp->lasttime;
97 /* delta < 0 implies that another process with a curtime greater
98 * than ours beat us to the critical section and already added
99 * the new credit, so we should not add it ourselves
100 */
101 if (delta > 0) {
102 qdp->lasttime = curtime;
103 new_credit = credit + CREDIT_PER_NS(delta, qdp->rate);
104 if (new_credit > MAX_CREDIT)
105 credit = MAX_CREDIT;
106 else
107 credit = new_credit;
108 }
109 credit -= len;
110 qdp->credit = credit;
111 bpf_spin_unlock(&qdp->lock);
112 // End critical section
113
114 // Check if we should update rate
115 if (qsp != NULL && (qsp->rate * 128) != qdp->rate) {
116 qdp->rate = qsp->rate * 128;
117 bpf_printk("Updating rate: %d (1sec:%llu bits)\n",
118 (int)qdp->rate,
119 CREDIT_PER_NS(1000000000, qdp->rate) * 8);
120 }
121
122 // Set flags (drop, congestion, cwr)
123 // Dropping => we are congested, so ignore congestion flag
124 if (credit < -DROP_THRESH ||
125 (len > LARGE_PKT_THRESH && credit < -LARGE_PKT_DROP_THRESH)) {
126 // Very congested, set drop packet
127 drop_flag = true;
128 if (pkti.ecn)
129 congestion_flag = true;
130 else if (pkti.is_tcp)
131 cwr_flag = true;
132 } else if (credit < 0) {
133 // Congested, set congestion flag
134 if (pkti.ecn || pkti.is_tcp) {
135 if (credit < -MARK_THRESH)
136 congestion_flag = true;
137 else
138 congestion_flag = false;
139 } else {
140 congestion_flag = true;
141 }
142 }
143
144 if (congestion_flag) {
145 if (bpf_skb_ecn_set_ce(skb)) {
146 ecn_ce_flag = true;
147 } else {
148 if (pkti.is_tcp) {
149 unsigned int rand = bpf_get_prandom_u32();
150
151 if (-credit >= MARK_THRESH +
152 (rand % MARK_REGION_SIZE)) {
153 // Do congestion control
154 cwr_flag = true;
155 }
156 } else if (len > LARGE_PKT_THRESH) {
157 // Problem if too many small packets?
158 drop_flag = true;
159 }
160 }
161 }
162
163 if (qsp != NULL)
164 if (qsp->no_cn)
165 cwr_flag = false;
166
167 hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag,
168 cwr_flag, ecn_ce_flag, &pkti, credit);
169
170 if (drop_flag) {
171 __sync_add_and_fetch(&(qdp->credit), len);
172 rv = DROP_PKT;
173 }
174
175 if (cwr_flag)
176 rv |= 2;
177 return rv;
178}
179char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c
new file mode 100644
index 000000000..3a91b4c19
--- /dev/null
+++ b/samples/bpf/ibumad_kern.c
@@ -0,0 +1,138 @@
1// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2
3/**
4 * ibumad BPF sample kernel side
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of version 2 of the GNU General Public
8 * License as published by the Free Software Foundation.
9 *
10 * Copyright(c) 2018 Ira Weiny, Intel Corporation
11 */
12
13#define KBUILD_MODNAME "ibumad_count_pkts_by_class"
14#include <uapi/linux/bpf.h>
15
16#include <bpf/bpf_helpers.h>
17
18
19struct bpf_map_def SEC("maps") read_count = {
20 .type = BPF_MAP_TYPE_ARRAY,
21 .key_size = sizeof(u32), /* class; u32 required */
22 .value_size = sizeof(u64), /* count of mads read */
23 .max_entries = 256, /* Room for all Classes */
24};
25
26struct bpf_map_def SEC("maps") write_count = {
27 .type = BPF_MAP_TYPE_ARRAY,
28 .key_size = sizeof(u32), /* class; u32 required */
29 .value_size = sizeof(u64), /* count of mads written */
30 .max_entries = 256, /* Room for all Classes */
31};
32
33#undef DEBUG
34#ifndef DEBUG
35#undef bpf_printk
36#define bpf_printk(fmt, ...)
37#endif
38
39/* Taken from the current format defined in
40 * include/trace/events/ib_umad.h
41 * and
42 * /sys/kernel/debug/tracing/events/ib_umad/ib_umad_read/format
43 * /sys/kernel/debug/tracing/events/ib_umad/ib_umad_write/format
44 */
45struct ib_umad_rw_args {
46 u64 pad;
47 u8 port_num;
48 u8 sl;
49 u8 path_bits;
50 u8 grh_present;
51 u32 id;
52 u32 status;
53 u32 timeout_ms;
54 u32 retries;
55 u32 length;
56 u32 qpn;
57 u32 qkey;
58 u8 gid_index;
59 u8 hop_limit;
60 u16 lid;
61 u16 attr_id;
62 u16 pkey_index;
63 u8 base_version;
64 u8 mgmt_class;
65 u8 class_version;
66 u8 method;
67 u32 flow_label;
68 u16 mad_status;
69 u16 class_specific;
70 u32 attr_mod;
71 u64 tid;
72 u8 gid[16];
73 u32 dev_index;
74 u8 traffic_class;
75};
76
77SEC("tracepoint/ib_umad/ib_umad_read_recv")
78int on_ib_umad_read_recv(struct ib_umad_rw_args *ctx)
79{
80 u64 zero = 0, *val;
81 u8 class = ctx->mgmt_class;
82
83 bpf_printk("ib_umad read recv : class 0x%x\n", class);
84
85 val = bpf_map_lookup_elem(&read_count, &class);
86 if (!val) {
87 bpf_map_update_elem(&read_count, &class, &zero, BPF_NOEXIST);
88 val = bpf_map_lookup_elem(&read_count, &class);
89 if (!val)
90 return 0;
91 }
92
93 (*val) += 1;
94
95 return 0;
96}
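/* The lookup/update/lookup sequence above is the usual lazy-init
 * idiom for BPF maps: on a miss, insert a zeroed value with
 * BPF_NOEXIST and look the key up again.  BPF_NOEXIST keeps the
 * insert race-safe across CPUs: if another CPU created the entry
 * first, this update fails harmlessly and the second lookup returns
 * that CPU's entry.  The same pattern repeats in the two handlers
 * below.
 */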
97SEC("tracepoint/ib_umad/ib_umad_read_send")
98int on_ib_umad_read_send(struct ib_umad_rw_args *ctx)
99{
100 u64 zero = 0, *val;
101 u8 class = ctx->mgmt_class;
102
103 bpf_printk("ib_umad read send : class 0x%x\n", class);
104
105 val = bpf_map_lookup_elem(&read_count, &class);
106 if (!val) {
107 bpf_map_update_elem(&read_count, &class, &zero, BPF_NOEXIST);
108 val = bpf_map_lookup_elem(&read_count, &class);
109 if (!val)
110 return 0;
111 }
112
113 (*val) += 1;
114
115 return 0;
116}
117SEC("tracepoint/ib_umad/ib_umad_write")
118int on_ib_umad_write(struct ib_umad_rw_args *ctx)
119{
120 u64 zero = 0, *val;
121 u8 class = ctx->mgmt_class;
122
123 bpf_printk("ib_umad write : class 0x%x\n", class);
124
125 val = bpf_map_lookup_elem(&write_count, &class);
126 if (!val) {
127 bpf_map_update_elem(&write_count, &class, &zero, BPF_NOEXIST);
128 val = bpf_map_lookup_elem(&write_count, &class);
129 if (!val)
130 return 0;
131 }
132
133 (*val) += 1;
134
135 return 0;
136}
137
138char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/ibumad_user.c b/samples/bpf/ibumad_user.c
new file mode 100644
index 000000000..fa06eef31
--- /dev/null
+++ b/samples/bpf/ibumad_user.c
@@ -0,0 +1,122 @@
1// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2
3/**
4 * ibumad BPF sample user side
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of version 2 of the GNU General Public
8 * License as published by the Free Software Foundation.
9 *
10 * Copyright(c) 2018 Ira Weiny, Intel Corporation
11 */
12
13#include <linux/bpf.h>
14#include <signal.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <unistd.h>
19#include <sys/types.h>
20#include <limits.h>
21
22#include <sys/resource.h>
23#include <getopt.h>
24#include <net/if.h>
25
26#include "bpf_load.h"
27#include "bpf_util.h"
28#include <bpf/libbpf.h>
29
30static void dump_counts(int fd)
31{
32 __u32 key;
33 __u64 value;
34
35 for (key = 0; key < 256; key++) {
36 if (bpf_map_lookup_elem(fd, &key, &value)) {
37 printf("failed to read key %u\n", key);
38 continue;
39 }
40 if (value)
41 printf("0x%02x : %llu\n", key, value);
42 }
43}
44
45static void dump_all_counts(void)
46{
47 printf("Read 'Class : count'\n");
48 dump_counts(map_fd[0]);
49 printf("Write 'Class : count'\n");
50 dump_counts(map_fd[1]);
51}
52
53static void dump_exit(int sig)
54{
55 dump_all_counts();
56 exit(0);
57}
58
59static const struct option long_options[] = {
60 {"help", no_argument, NULL, 'h'},
61 {"delay", required_argument, NULL, 'd'},
	{NULL, 0, NULL, 0}, /* getopt_long needs a zeroed terminator */
62};
63
64static void usage(char *cmd)
65{
66 printf("eBPF test program to count packets from various IP addresses\n"
67 "Usage: %s <options>\n"
68 " --help, -h this menu\n"
69 " --delay, -d <delay> wait <delay> sec between prints [1 - 1000000]\n"
70 , cmd
71 );
72}
73
74int main(int argc, char **argv)
75{
76 unsigned long delay = 5;
77 int longindex = 0;
78 int opt;
79 char bpf_file[256];
80
81 /* Create the eBPF kernel code path name.
82 * This follows the pattern of all of the other bpf samples
83 */
84 snprintf(bpf_file, sizeof(bpf_file), "%s_kern.o", argv[0]);
85
86 /* Do one final dump when exiting */
87 signal(SIGINT, dump_exit);
88 signal(SIGTERM, dump_exit);
89
90 while ((opt = getopt_long(argc, argv, "hd:rSw",
91 long_options, &longindex)) != -1) {
92 switch (opt) {
93 case 'd':
94 delay = strtoul(optarg, NULL, 0);
95 if (delay == ULONG_MAX ||
96 delay > 1000000) {
97 fprintf(stderr, "ERROR: invalid delay : %s\n",
98 optarg);
99 usage(argv[0]);
100 return 1;
101 }
102 break;
103 default:
104 case 'h':
105 usage(argv[0]);
106 return 1;
107 }
108 }
109
110 if (load_bpf_file(bpf_file)) {
111 fprintf(stderr, "ERROR: failed to load eBPF from file : %s\n",
112 bpf_file);
113 return 1;
114 }
115
116 while (1) {
117 sleep(delay);
118 dump_all_counts();
119 }
120
121 return 0;
122}
diff --git a/samples/bpf/lathist_kern.c b/samples/bpf/lathist_kern.c
new file mode 100644
index 000000000..4adfcbbe6
--- /dev/null
+++ b/samples/bpf/lathist_kern.c
@@ -0,0 +1,99 @@
1/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
2 * Copyright (c) 2015 BMW Car IT GmbH
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 */
8#include <linux/version.h>
9#include <linux/ptrace.h>
10#include <uapi/linux/bpf.h>
11#include <bpf/bpf_helpers.h>
12
13#define MAX_ENTRIES 20
14#define MAX_CPU 4
15
16/* We need to stick to statically allocated memory (an array instead
17 * of a hash table) because managing dynamic memory from the
18 * trace_preempt_[on|off] tracepoint hooks is not supported.
19 */
20
21struct {
22 __uint(type, BPF_MAP_TYPE_ARRAY);
23 __type(key, int);
24 __type(value, u64);
25 __uint(max_entries, MAX_CPU);
26} my_map SEC(".maps");
27
28SEC("kprobe/trace_preempt_off")
29int bpf_prog1(struct pt_regs *ctx)
30{
31 int cpu = bpf_get_smp_processor_id();
32 u64 *ts = bpf_map_lookup_elem(&my_map, &cpu);
33
34 if (ts)
35 *ts = bpf_ktime_get_ns();
36
37 return 0;
38}
39
40static unsigned int log2(unsigned int v)
41{
42 unsigned int r;
43 unsigned int shift;
44
45 r = (v > 0xFFFF) << 4; v >>= r;
46 shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
47 shift = (v > 0xF) << 2; v >>= shift; r |= shift;
48 shift = (v > 0x3) << 1; v >>= shift; r |= shift;
49 r |= (v >> 1);
50
51 return r;
52}
53
54static unsigned int log2l(unsigned long v)
55{
56 unsigned int hi = v >> 32;
57
58 if (hi)
59 return log2(hi) + 32;
60 else
61 return log2(v);
62}
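/* Worked example: log2(6000) = 12.  6000 <= 0xFFFF, so r starts at 0;
 * 6000 > 0xFF shifts by 8 (v = 23, r = 8); 23 > 0xF shifts by 4
 * (v = 1, r = 12); the remaining steps add nothing.  Indeed
 * 2^12 = 4096 <= 6000 < 8192, and bpf_prog2() below files the sample
 * under histogram slot cpu * MAX_ENTRIES + 12.
 */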
63
64struct {
65 __uint(type, BPF_MAP_TYPE_ARRAY);
66 __type(key, int);
67 __type(value, long);
68 __uint(max_entries, MAX_CPU * MAX_ENTRIES);
69} my_lat SEC(".maps");
70
71SEC("kprobe/trace_preempt_on")
72int bpf_prog2(struct pt_regs *ctx)
73{
74 u64 *ts, cur_ts, delta;
75 int key, cpu;
76 long *val;
77
78 cpu = bpf_get_smp_processor_id();
79 ts = bpf_map_lookup_elem(&my_map, &cpu);
80 if (!ts)
81 return 0;
82
83 cur_ts = bpf_ktime_get_ns();
84 delta = log2l(cur_ts - *ts);
85
86 if (delta > MAX_ENTRIES - 1)
87 delta = MAX_ENTRIES - 1;
88
89 key = cpu * MAX_ENTRIES + delta;
90 val = bpf_map_lookup_elem(&my_lat, &key);
91 if (val)
92 __sync_fetch_and_add((long *)val, 1);
93
94 return 0;
95
96}
97
98char _license[] SEC("license") = "GPL";
99u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/lathist_user.c b/samples/bpf/lathist_user.c
new file mode 100644
index 000000000..7d8ff2418
--- /dev/null
+++ b/samples/bpf/lathist_user.c
@@ -0,0 +1,130 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
3 * Copyright (c) 2015 BMW Car IT GmbH
4 */
5#include <stdio.h>
6#include <unistd.h>
7#include <stdlib.h>
8#include <signal.h>
9#include <bpf/libbpf.h>
10#include <bpf/bpf.h>
11
12#define MAX_ENTRIES 20
13#define MAX_CPU 4
14#define MAX_STARS 40
15
16struct cpu_hist {
17 long data[MAX_ENTRIES];
18 long max;
19};
20
21static struct cpu_hist cpu_hist[MAX_CPU];
22
23static void stars(char *str, long val, long max, int width)
24{
25 int i;
26
27 for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
28 str[i] = '*';
29 if (val > max)
30 str[i - 1] = '+';
31 str[i] = '\0';
32}
33
34static void print_hist(void)
35{
36 char starstr[MAX_STARS];
37 struct cpu_hist *hist;
38 int i, j;
39
40 /* clear screen */
41 printf("\033[2J");
42
43 for (j = 0; j < MAX_CPU; j++) {
44 hist = &cpu_hist[j];
45
46 /* ignore CPUs without data (maybe offline?) */
47 if (hist->max == 0)
48 continue;
49
50 printf("CPU %d\n", j);
51 printf(" latency : count distribution\n");
52 for (i = 1; i <= MAX_ENTRIES; i++) {
53 stars(starstr, hist->data[i - 1], hist->max, MAX_STARS);
54 printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
55 (1l << i) >> 1, (1l << i) - 1,
56 hist->data[i - 1], MAX_STARS, starstr);
57 }
58 }
59}
60
61static void get_data(int fd)
62{
63 long key, value;
64 int c, i;
65
66 for (i = 0; i < MAX_CPU; i++)
67 cpu_hist[i].max = 0;
68
69 for (c = 0; c < MAX_CPU; c++) {
70 for (i = 0; i < MAX_ENTRIES; i++) {
71 key = c * MAX_ENTRIES + i;
72 bpf_map_lookup_elem(fd, &key, &value);
73
74 cpu_hist[c].data[i] = value;
75 if (value > cpu_hist[c].max)
76 cpu_hist[c].max = value;
77 }
78 }
79}
80
81int main(int argc, char **argv)
82{
83 struct bpf_link *links[2];
84 struct bpf_program *prog;
85 struct bpf_object *obj;
86 char filename[256];
87 int map_fd, i = 0;
88
89 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
90 obj = bpf_object__open_file(filename, NULL);
91 if (libbpf_get_error(obj)) {
92 fprintf(stderr, "ERROR: opening BPF object file failed\n");
93 return 0;
94 }
95
96 /* load BPF program */
97 if (bpf_object__load(obj)) {
98 fprintf(stderr, "ERROR: loading BPF object file failed\n");
99 goto cleanup;
100 }
101
102 map_fd = bpf_object__find_map_fd_by_name(obj, "my_lat");
103 if (map_fd < 0) {
104 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
105 goto cleanup;
106 }
107
108 bpf_object__for_each_program(prog, obj) {
109 links[i] = bpf_program__attach(prog);
110 if (libbpf_get_error(links[i])) {
111 fprintf(stderr, "ERROR: bpf_program__attach failed\n");
112 links[i] = NULL;
113 goto cleanup;
114 }
115 i++;
116 }
117
118 while (1) {
119 get_data(map_fd);
120 print_hist();
121 sleep(5);
122 }
123
124cleanup:
125 for (i--; i >= 0; i--)
126 bpf_link__destroy(links[i]);
127
128 bpf_object__close(obj);
129 return 0;
130}
diff --git a/samples/bpf/lwt_len_hist.sh b/samples/bpf/lwt_len_hist.sh
new file mode 100755
index 000000000..0eda9754f
--- /dev/null
+++ b/samples/bpf/lwt_len_hist.sh
@@ -0,0 +1,40 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3
4NS1=lwt_ns1
5VETH0=tst_lwt1a
6VETH1=tst_lwt1b
7
8TRACE_ROOT=/sys/kernel/debug/tracing
9
10function cleanup {
11 # To reset saved histogram, remove pinned map
12 rm /sys/fs/bpf/tc/globals/lwt_len_hist_map
13 ip route del 192.168.253.2/32 dev $VETH0 2> /dev/null
14 ip link del $VETH0 2> /dev/null
15 ip link del $VETH1 2> /dev/null
16 ip netns exec $NS1 killall netserver
17 ip netns delete $NS1 2> /dev/null
18}
19
20cleanup
21
22ip netns add $NS1
23ip link add $VETH0 type veth peer name $VETH1
24ip link set dev $VETH0 up
25ip addr add 192.168.253.1/24 dev $VETH0
26ip link set $VETH1 netns $NS1
27ip netns exec $NS1 ip link set dev $VETH1 up
28ip netns exec $NS1 ip addr add 192.168.253.2/24 dev $VETH1
29ip netns exec $NS1 netserver
30
31echo 1 > ${TRACE_ROOT}/tracing_on
32cp /dev/null ${TRACE_ROOT}/trace
33ip route add 192.168.253.2/32 encap bpf out obj lwt_len_hist_kern.o section len_hist dev $VETH0
34netperf -H 192.168.253.2 -t TCP_STREAM
35cat ${TRACE_ROOT}/trace | grep -v '^#'
36./lwt_len_hist
37cleanup
38echo 0 > ${TRACE_ROOT}/tracing_on
39
40exit 0
diff --git a/samples/bpf/lwt_len_hist_kern.c b/samples/bpf/lwt_len_hist_kern.c
new file mode 100644
index 000000000..9ed63e10e
--- /dev/null
+++ b/samples/bpf/lwt_len_hist_kern.c
@@ -0,0 +1,82 @@
1/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12
13#include <uapi/linux/bpf.h>
14#include <uapi/linux/if_ether.h>
15#include <uapi/linux/ip.h>
16#include <uapi/linux/in.h>
17#include <bpf/bpf_helpers.h>
18
19# define printk(fmt, ...) \
20 ({ \
21 char ____fmt[] = fmt; \
22 bpf_trace_printk(____fmt, sizeof(____fmt), \
23 ##__VA_ARGS__); \
24 })
25
26struct bpf_elf_map {
27 __u32 type;
28 __u32 size_key;
29 __u32 size_value;
30 __u32 max_elem;
31 __u32 flags;
32 __u32 id;
33 __u32 pinning;
34};
35
36struct bpf_elf_map SEC("maps") lwt_len_hist_map = {
37 .type = BPF_MAP_TYPE_PERCPU_HASH,
38 .size_key = sizeof(__u64),
39 .size_value = sizeof(__u64),
40 .pinning = 2,
41 .max_elem = 1024,
42};
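/* pinning = 2 is iproute2's PIN_GLOBAL_NS: when the object is loaded
 * via ip/tc, the map is pinned as
 * /sys/fs/bpf/tc/globals/lwt_len_hist_map, which is exactly the path
 * lwt_len_hist_user.c opens with bpf_obj_get() and the path
 * lwt_len_hist.sh removes to reset the histogram.
 */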
43
44static unsigned int log2(unsigned int v)
45{
46 unsigned int r;
47 unsigned int shift;
48
49 r = (v > 0xFFFF) << 4; v >>= r;
50 shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
51 shift = (v > 0xF) << 2; v >>= shift; r |= shift;
52 shift = (v > 0x3) << 1; v >>= shift; r |= shift;
53 r |= (v >> 1);
54 return r;
55}
56
57static unsigned int log2l(unsigned long v)
58{
59 unsigned int hi = v >> 32;
60 if (hi)
61 return log2(hi) + 32;
62 else
63 return log2(v);
64}
65
66SEC("len_hist")
67int do_len_hist(struct __sk_buff *skb)
68{
69 __u64 *value, key, init_val = 1;
70
71 key = log2l(skb->len);
72
73 value = bpf_map_lookup_elem(&lwt_len_hist_map, &key);
74 if (value)
75 __sync_fetch_and_add(value, 1);
76 else
77 bpf_map_update_elem(&lwt_len_hist_map, &key, &init_val, BPF_ANY);
78
79 return BPF_OK;
80}
81
82char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/lwt_len_hist_user.c b/samples/bpf/lwt_len_hist_user.c
new file mode 100644
index 000000000..430a4b7e3
--- /dev/null
+++ b/samples/bpf/lwt_len_hist_user.c
@@ -0,0 +1,77 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <linux/unistd.h>
3#include <linux/bpf.h>
4
5#include <stdlib.h>
6#include <stdio.h>
7#include <unistd.h>
8#include <string.h>
9#include <errno.h>
10#include <arpa/inet.h>
11
12#include <bpf/bpf.h>
13#include "bpf_util.h"
14
15#define MAX_INDEX 64
16#define MAX_STARS 38
17
18static void stars(char *str, long val, long max, int width)
19{
20 int i;
21
22 for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
23 str[i] = '*';
24 if (val > max)
25 str[i - 1] = '+';
26 str[i] = '\0';
27}
28
29int main(int argc, char **argv)
30{
31 unsigned int nr_cpus = bpf_num_possible_cpus();
32 const char *map_filename = "/sys/fs/bpf/tc/globals/lwt_len_hist_map";
33 uint64_t values[nr_cpus], sum, max_value = 0, data[MAX_INDEX] = {};
34 uint64_t key = 0, next_key, max_key = 0;
35 char starstr[MAX_STARS];
36 int i, map_fd;
37
38 map_fd = bpf_obj_get(map_filename);
39 if (map_fd < 0) {
40 fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n",
41 map_filename, strerror(errno), errno);
42 return -1;
43 }
44
45 while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) {
46 if (next_key >= MAX_INDEX) {
47 fprintf(stderr, "Key %lu out of bounds\n", next_key);
48 continue;
49 }
50
51 bpf_map_lookup_elem(map_fd, &next_key, values);
52
53 sum = 0;
54 for (i = 0; i < nr_cpus; i++)
55 sum += values[i];
56
57 data[next_key] = sum;
58 if (sum && next_key > max_key)
59 max_key = next_key;
60
61 if (sum > max_value)
62 max_value = sum;
63
64 key = next_key;
65 }
66
67 for (i = 1; i <= max_key + 1; i++) {
68 stars(starstr, data[i - 1], max_value, MAX_STARS);
69 printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
70 (1l << i) >> 1, (1l << i) - 1, data[i - 1],
71 MAX_STARS, starstr);
72 }
73
74 close(map_fd);
75
76 return 0;
77}
diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c
new file mode 100644
index 000000000..8773f22b6
--- /dev/null
+++ b/samples/bpf/map_perf_test_kern.c
@@ -0,0 +1,291 @@
1/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/skbuff.h>
8#include <linux/netdevice.h>
9#include <linux/version.h>
10#include <uapi/linux/bpf.h>
11#include <bpf/bpf_helpers.h>
12#include <bpf/bpf_tracing.h>
13#include <bpf/bpf_core_read.h>
14#include "trace_common.h"
15
16#define MAX_ENTRIES 1000
17#define MAX_NR_CPUS 1024
18
19struct {
20 __uint(type, BPF_MAP_TYPE_HASH);
21 __type(key, u32);
22 __type(value, long);
23 __uint(max_entries, MAX_ENTRIES);
24} hash_map SEC(".maps");
25
26struct {
27 __uint(type, BPF_MAP_TYPE_LRU_HASH);
28 __type(key, u32);
29 __type(value, long);
30 __uint(max_entries, 10000);
31} lru_hash_map SEC(".maps");
32
33struct {
34 __uint(type, BPF_MAP_TYPE_LRU_HASH);
35 __type(key, u32);
36 __type(value, long);
37 __uint(max_entries, 10000);
38 __uint(map_flags, BPF_F_NO_COMMON_LRU);
39} nocommon_lru_hash_map SEC(".maps");
40
41struct inner_lru {
42 __uint(type, BPF_MAP_TYPE_LRU_HASH);
43 __type(key, u32);
44 __type(value, long);
45 __uint(max_entries, MAX_ENTRIES);
46 __uint(map_flags, BPF_F_NUMA_NODE);
47 __uint(numa_node, 0);
48} inner_lru_hash_map SEC(".maps");
49
50struct {
51 __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
52 __uint(max_entries, MAX_NR_CPUS);
53 __uint(key_size, sizeof(u32));
54 __array(values, struct inner_lru); /* use inner_lru as inner map */
55} array_of_lru_hashs SEC(".maps") = {
56 /* statically initialize the first element */
57 .values = { &inner_lru_hash_map },
58};
59
60struct {
61 __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
62 __uint(key_size, sizeof(u32));
63 __uint(value_size, sizeof(long));
64 __uint(max_entries, MAX_ENTRIES);
65} percpu_hash_map SEC(".maps");
66
67struct {
68 __uint(type, BPF_MAP_TYPE_HASH);
69 __type(key, u32);
70 __type(value, long);
71 __uint(max_entries, MAX_ENTRIES);
72 __uint(map_flags, BPF_F_NO_PREALLOC);
73} hash_map_alloc SEC(".maps");
74
75struct {
76 __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
77 __uint(key_size, sizeof(u32));
78 __uint(value_size, sizeof(long));
79 __uint(max_entries, MAX_ENTRIES);
80 __uint(map_flags, BPF_F_NO_PREALLOC);
81} percpu_hash_map_alloc SEC(".maps");
82
83struct {
84 __uint(type, BPF_MAP_TYPE_LPM_TRIE);
85 __uint(key_size, 8);
86 __uint(value_size, sizeof(long));
87 __uint(max_entries, 10000);
88 __uint(map_flags, BPF_F_NO_PREALLOC);
89} lpm_trie_map_alloc SEC(".maps");
90
91struct {
92 __uint(type, BPF_MAP_TYPE_ARRAY);
93 __type(key, u32);
94 __type(value, long);
95 __uint(max_entries, MAX_ENTRIES);
96} array_map SEC(".maps");
97
98struct {
99 __uint(type, BPF_MAP_TYPE_LRU_HASH);
100 __type(key, u32);
101 __type(value, long);
102 __uint(max_entries, MAX_ENTRIES);
103} lru_hash_lookup_map SEC(".maps");
104
105SEC("kprobe/" SYSCALL(sys_getuid))
106int stress_hmap(struct pt_regs *ctx)
107{
108 u32 key = bpf_get_current_pid_tgid();
109 long init_val = 1;
110 long *value;
111
112 bpf_map_update_elem(&hash_map, &key, &init_val, BPF_ANY);
113 value = bpf_map_lookup_elem(&hash_map, &key);
114 if (value)
115 bpf_map_delete_elem(&hash_map, &key);
116
117 return 0;
118}
119
120SEC("kprobe/" SYSCALL(sys_geteuid))
121int stress_percpu_hmap(struct pt_regs *ctx)
122{
123 u32 key = bpf_get_current_pid_tgid();
124 long init_val = 1;
125 long *value;
126
127 bpf_map_update_elem(&percpu_hash_map, &key, &init_val, BPF_ANY);
128 value = bpf_map_lookup_elem(&percpu_hash_map, &key);
129 if (value)
130 bpf_map_delete_elem(&percpu_hash_map, &key);
131 return 0;
132}
133
134SEC("kprobe/" SYSCALL(sys_getgid))
135int stress_hmap_alloc(struct pt_regs *ctx)
136{
137 u32 key = bpf_get_current_pid_tgid();
138 long init_val = 1;
139 long *value;
140
141 bpf_map_update_elem(&hash_map_alloc, &key, &init_val, BPF_ANY);
142 value = bpf_map_lookup_elem(&hash_map_alloc, &key);
143 if (value)
144 bpf_map_delete_elem(&hash_map_alloc, &key);
145 return 0;
146}
147
148SEC("kprobe/" SYSCALL(sys_getegid))
149int stress_percpu_hmap_alloc(struct pt_regs *ctx)
150{
151 u32 key = bpf_get_current_pid_tgid();
152 long init_val = 1;
153 long *value;
154
155 bpf_map_update_elem(&percpu_hash_map_alloc, &key, &init_val, BPF_ANY);
156 value = bpf_map_lookup_elem(&percpu_hash_map_alloc, &key);
157 if (value)
158 bpf_map_delete_elem(&percpu_hash_map_alloc, &key);
159 return 0;
160}
161
162SEC("kprobe/" SYSCALL(sys_connect))
163int stress_lru_hmap_alloc(struct pt_regs *ctx)
164{
165 struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx);
166	char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%d\n";
167 union {
168 u16 dst6[8];
169 struct {
170 u16 magic0;
171 u16 magic1;
172 u16 tcase;
173 u16 unused16;
174 u32 unused32;
175 u32 key;
176 };
177 } test_params;
178 struct sockaddr_in6 *in6;
179 u16 test_case;
180 int addrlen, ret;
181 long val = 1;
182 u32 key = 0;
183
184 in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(real_regs);
185 addrlen = (int)PT_REGS_PARM3_CORE(real_regs);
186
187 if (addrlen != sizeof(*in6))
188 return 0;
189
190 ret = bpf_probe_read_user(test_params.dst6, sizeof(test_params.dst6),
191 &in6->sin6_addr);
192 if (ret)
193 goto done;
194
195 if (test_params.magic0 != 0xdead ||
196 test_params.magic1 != 0xbeef)
197 return 0;
198
199 test_case = test_params.tcase;
200 if (test_case != 3)
201 key = bpf_get_prandom_u32();
202
203 if (test_case == 0) {
204 ret = bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY);
205 } else if (test_case == 1) {
206 ret = bpf_map_update_elem(&nocommon_lru_hash_map, &key, &val,
207 BPF_ANY);
208 } else if (test_case == 2) {
209 void *nolocal_lru_map;
210 int cpu = bpf_get_smp_processor_id();
211
212 nolocal_lru_map = bpf_map_lookup_elem(&array_of_lru_hashs,
213 &cpu);
214 if (!nolocal_lru_map) {
215 ret = -ENOENT;
216 goto done;
217 }
218
219 ret = bpf_map_update_elem(nolocal_lru_map, &key, &val,
220 BPF_ANY);
221 } else if (test_case == 3) {
222 u32 i;
223
224 key = test_params.key;
225
226#pragma clang loop unroll(full)
227 for (i = 0; i < 32; i++) {
228 bpf_map_lookup_elem(&lru_hash_lookup_map, &key);
229 key++;
230 }
231 } else {
232 ret = -EINVAL;
233 }
234
235done:
236 if (ret)
237 bpf_trace_printk(fmt, sizeof(fmt), ret);
238
239 return 0;
240}
241
242SEC("kprobe/" SYSCALL(sys_gettid))
243int stress_lpm_trie_map_alloc(struct pt_regs *ctx)
244{
245 union {
246 u32 b32[2];
247 u8 b8[8];
248 } key;
249 unsigned int i;
250
251 key.b32[0] = 32;
252 key.b8[4] = 192;
253 key.b8[5] = 168;
254 key.b8[6] = 0;
255 key.b8[7] = 1;
256
257#pragma clang loop unroll(full)
258 for (i = 0; i < 32; ++i)
259 bpf_map_lookup_elem(&lpm_trie_map_alloc, &key);
260
261 return 0;
262}
263
264SEC("kprobe/" SYSCALL(sys_getpgid))
265int stress_hash_map_lookup(struct pt_regs *ctx)
266{
267 u32 key = 1, i;
268 long *value;
269
270#pragma clang loop unroll(full)
271 for (i = 0; i < 64; ++i)
272 value = bpf_map_lookup_elem(&hash_map, &key);
273
274 return 0;
275}
276
277SEC("kprobe/" SYSCALL(sys_getppid))
278int stress_array_map_lookup(struct pt_regs *ctx)
279{
280 u32 key = 1, i;
281 long *value;
282
283#pragma clang loop unroll(full)
284 for (i = 0; i < 64; ++i)
285 value = bpf_map_lookup_elem(&array_map, &key);
286
287 return 0;
288}
289
290char _license[] SEC("license") = "GPL";
291u32 _version SEC("version") = LINUX_VERSION_CODE;
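
The union in stress_lru_hmap_alloc() above defines a small wire protocol between kernel and user space: the user side (map_perf_test_user.c, next) encodes two magic values, a test-case selector, and an optional start key into the IPv6 destination address of a connect() call that is expected to fail. A minimal sketch of that encoding; encode_test_params() is an illustrative helper, and s6_addr16 is a glibc extension enabled by _GNU_SOURCE:

	#define _GNU_SOURCE
	#include <string.h>
	#include <netinet/in.h>

	/* dst6[0] = magic0, dst6[1] = magic1, dst6[2] = tcase,
	 * dst6[6..7] (s6_addr32[3]) = key, per the union above */
	static void encode_test_params(struct sockaddr_in6 *in6, unsigned short tcase)
	{
		memset(in6, 0, sizeof(*in6));
		in6->sin6_family = AF_INET6;
		in6->sin6_addr.s6_addr16[0] = 0xdead;	/* magic0 */
		in6->sin6_addr.s6_addr16[1] = 0xbeef;	/* magic1 */
		in6->sin6_addr.s6_addr16[2] = tcase;	/* selects the map under test */
	}
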
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
new file mode 100644
index 000000000..8b13230b4
--- /dev/null
+++ b/samples/bpf/map_perf_test_user.c
@@ -0,0 +1,507 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2016 Facebook
3 */
4#define _GNU_SOURCE
5#include <sched.h>
6#include <stdio.h>
7#include <sys/types.h>
8#include <asm/unistd.h>
9#include <unistd.h>
10#include <assert.h>
11#include <sys/wait.h>
12#include <stdlib.h>
13#include <signal.h>
14#include <string.h>
15#include <time.h>
16#include <sys/resource.h>
17#include <arpa/inet.h>
18#include <errno.h>
19
20#include <bpf/bpf.h>
21#include <bpf/libbpf.h>
22
23#define TEST_BIT(t) (1U << (t))
24#define MAX_NR_CPUS 1024
25
26static __u64 time_get_ns(void)
27{
28 struct timespec ts;
29
30 clock_gettime(CLOCK_MONOTONIC, &ts);
31 return ts.tv_sec * 1000000000ull + ts.tv_nsec;
32}
33
34enum test_type {
35 HASH_PREALLOC,
36 PERCPU_HASH_PREALLOC,
37 HASH_KMALLOC,
38 PERCPU_HASH_KMALLOC,
39 LRU_HASH_PREALLOC,
40 NOCOMMON_LRU_HASH_PREALLOC,
41 LPM_KMALLOC,
42 HASH_LOOKUP,
43 ARRAY_LOOKUP,
44 INNER_LRU_HASH_PREALLOC,
45 LRU_HASH_LOOKUP,
46 NR_TESTS,
47};
48
49const char *test_map_names[NR_TESTS] = {
50 [HASH_PREALLOC] = "hash_map",
51 [PERCPU_HASH_PREALLOC] = "percpu_hash_map",
52 [HASH_KMALLOC] = "hash_map_alloc",
53 [PERCPU_HASH_KMALLOC] = "percpu_hash_map_alloc",
54 [LRU_HASH_PREALLOC] = "lru_hash_map",
55 [NOCOMMON_LRU_HASH_PREALLOC] = "nocommon_lru_hash_map",
56 [LPM_KMALLOC] = "lpm_trie_map_alloc",
57 [HASH_LOOKUP] = "hash_map",
58 [ARRAY_LOOKUP] = "array_map",
59 [INNER_LRU_HASH_PREALLOC] = "inner_lru_hash_map",
60 [LRU_HASH_LOOKUP] = "lru_hash_lookup_map",
61};
62
63enum map_idx {
64 array_of_lru_hashs_idx,
65 hash_map_alloc_idx,
66 lru_hash_lookup_idx,
67 NR_IDXES,
68};
69
70static int map_fd[NR_IDXES];
71
72static int test_flags = ~0;
73static uint32_t num_map_entries;
74static uint32_t inner_lru_hash_size;
75static int lru_hash_lookup_test_entries = 32;
76static uint32_t max_cnt = 1000000;
77
78static int check_test_flags(enum test_type t)
79{
80 return test_flags & TEST_BIT(t);
81}
82
83static void test_hash_prealloc(int cpu)
84{
85 __u64 start_time;
86 int i;
87
88 start_time = time_get_ns();
89 for (i = 0; i < max_cnt; i++)
90 syscall(__NR_getuid);
91 printf("%d:hash_map_perf pre-alloc %lld events per sec\n",
92 cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time));
93}
94
95static int pre_test_lru_hash_lookup(int tasks)
96{
97 int fd = map_fd[lru_hash_lookup_idx];
98 uint32_t key;
99 long val = 1;
100 int ret;
101
102 if (num_map_entries > lru_hash_lookup_test_entries)
103 lru_hash_lookup_test_entries = num_map_entries;
104
105	/* Populate the lru_hash_map for the LRU_HASH_LOOKUP perf test.
106	 *
107	 * It is fine if the user requests a map with
108	 * num_map_entries < 32; some of the later LRU hash lookups
109	 * will simply miss. We are not interested in the
110	 * performance of such a small LRU map.
111	 */
112 for (key = 0; key < lru_hash_lookup_test_entries; key++) {
113 ret = bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST);
114 if (ret)
115 return ret;
116 }
117
118 return 0;
119}
120
121static void do_test_lru(enum test_type test, int cpu)
122{
123 static int inner_lru_map_fds[MAX_NR_CPUS];
124
125 struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 };
126 const char *test_name;
127 __u64 start_time;
128 int i, ret;
129
130 if (test == INNER_LRU_HASH_PREALLOC && cpu) {
131		/* If the CPU is not 0, create an inner_lru hash map and insert
132		 * its fd into the array_of_lru_hashs map. For CPU 0,
133		 * 'inner_lru_hash_map' was statically inserted at map init.
134		 */
135 int outer_fd = map_fd[array_of_lru_hashs_idx];
136 unsigned int mycpu, mynode;
137
138 assert(cpu < MAX_NR_CPUS);
139
140 ret = syscall(__NR_getcpu, &mycpu, &mynode, NULL);
141 assert(!ret);
142
143 inner_lru_map_fds[cpu] =
144 bpf_create_map_node(BPF_MAP_TYPE_LRU_HASH,
145 test_map_names[INNER_LRU_HASH_PREALLOC],
146 sizeof(uint32_t),
147 sizeof(long),
148 inner_lru_hash_size, 0,
149 mynode);
150 if (inner_lru_map_fds[cpu] == -1) {
151 printf("cannot create BPF_MAP_TYPE_LRU_HASH %s(%d)\n",
152 strerror(errno), errno);
153 exit(1);
154 }
155
156 ret = bpf_map_update_elem(outer_fd, &cpu,
157 &inner_lru_map_fds[cpu],
158 BPF_ANY);
159 if (ret) {
160 printf("cannot update ARRAY_OF_LRU_HASHS with key:%u. %s(%d)\n",
161 cpu, strerror(errno), errno);
162 exit(1);
163 }
164 }
165
166 in6.sin6_addr.s6_addr16[0] = 0xdead;
167 in6.sin6_addr.s6_addr16[1] = 0xbeef;
168
169 if (test == LRU_HASH_PREALLOC) {
170 test_name = "lru_hash_map_perf";
171 in6.sin6_addr.s6_addr16[2] = 0;
172 } else if (test == NOCOMMON_LRU_HASH_PREALLOC) {
173 test_name = "nocommon_lru_hash_map_perf";
174 in6.sin6_addr.s6_addr16[2] = 1;
175 } else if (test == INNER_LRU_HASH_PREALLOC) {
176 test_name = "inner_lru_hash_map_perf";
177 in6.sin6_addr.s6_addr16[2] = 2;
178 } else if (test == LRU_HASH_LOOKUP) {
179 test_name = "lru_hash_lookup_perf";
180 in6.sin6_addr.s6_addr16[2] = 3;
181 in6.sin6_addr.s6_addr32[3] = 0;
182 } else {
183 assert(0);
184 }
185
186 start_time = time_get_ns();
187 for (i = 0; i < max_cnt; i++) {
188 ret = connect(-1, (const struct sockaddr *)&in6, sizeof(in6));
189 assert(ret == -1 && errno == EBADF);
190 if (in6.sin6_addr.s6_addr32[3] <
191 lru_hash_lookup_test_entries - 32)
192 in6.sin6_addr.s6_addr32[3] += 32;
193 else
194 in6.sin6_addr.s6_addr32[3] = 0;
195 }
196 printf("%d:%s pre-alloc %lld events per sec\n",
197 cpu, test_name,
198 max_cnt * 1000000000ll / (time_get_ns() - start_time));
199}
200
201static void test_lru_hash_prealloc(int cpu)
202{
203 do_test_lru(LRU_HASH_PREALLOC, cpu);
204}
205
206static void test_nocommon_lru_hash_prealloc(int cpu)
207{
208 do_test_lru(NOCOMMON_LRU_HASH_PREALLOC, cpu);
209}
210
211static void test_inner_lru_hash_prealloc(int cpu)
212{
213 do_test_lru(INNER_LRU_HASH_PREALLOC, cpu);
214}
215
216static void test_lru_hash_lookup(int cpu)
217{
218 do_test_lru(LRU_HASH_LOOKUP, cpu);
219}
220
221static void test_percpu_hash_prealloc(int cpu)
222{
223 __u64 start_time;
224 int i;
225
226 start_time = time_get_ns();
227 for (i = 0; i < max_cnt; i++)
228 syscall(__NR_geteuid);
229 printf("%d:percpu_hash_map_perf pre-alloc %lld events per sec\n",
230 cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time));
231}
232
233static void test_hash_kmalloc(int cpu)
234{
235 __u64 start_time;
236 int i;
237
238 start_time = time_get_ns();
239 for (i = 0; i < max_cnt; i++)
240 syscall(__NR_getgid);
241 printf("%d:hash_map_perf kmalloc %lld events per sec\n",
242 cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time));
243}
244
245static void test_percpu_hash_kmalloc(int cpu)
246{
247 __u64 start_time;
248 int i;
249
250 start_time = time_get_ns();
251 for (i = 0; i < max_cnt; i++)
252 syscall(__NR_getegid);
253 printf("%d:percpu_hash_map_perf kmalloc %lld events per sec\n",
254 cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time));
255}
256
257static void test_lpm_kmalloc(int cpu)
258{
259 __u64 start_time;
260 int i;
261
262 start_time = time_get_ns();
263 for (i = 0; i < max_cnt; i++)
264 syscall(__NR_gettid);
265 printf("%d:lpm_perf kmalloc %lld events per sec\n",
266 cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time));
267}
268
269static void test_hash_lookup(int cpu)
270{
271 __u64 start_time;
272 int i;
273
274 start_time = time_get_ns();
275 for (i = 0; i < max_cnt; i++)
276 syscall(__NR_getpgid, 0);
277 printf("%d:hash_lookup %lld lookups per sec\n",
278 cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time));
279}
280
281static void test_array_lookup(int cpu)
282{
283 __u64 start_time;
284 int i;
285
286 start_time = time_get_ns();
287 for (i = 0; i < max_cnt; i++)
288 syscall(__NR_getppid, 0);
289 printf("%d:array_lookup %lld lookups per sec\n",
290 cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time));
291}
292
293typedef int (*pre_test_func)(int tasks);
294const pre_test_func pre_test_funcs[] = {
295 [LRU_HASH_LOOKUP] = pre_test_lru_hash_lookup,
296};
297
298typedef void (*test_func)(int cpu);
299const test_func test_funcs[] = {
300 [HASH_PREALLOC] = test_hash_prealloc,
301 [PERCPU_HASH_PREALLOC] = test_percpu_hash_prealloc,
302 [HASH_KMALLOC] = test_hash_kmalloc,
303 [PERCPU_HASH_KMALLOC] = test_percpu_hash_kmalloc,
304 [LRU_HASH_PREALLOC] = test_lru_hash_prealloc,
305 [NOCOMMON_LRU_HASH_PREALLOC] = test_nocommon_lru_hash_prealloc,
306 [LPM_KMALLOC] = test_lpm_kmalloc,
307 [HASH_LOOKUP] = test_hash_lookup,
308 [ARRAY_LOOKUP] = test_array_lookup,
309 [INNER_LRU_HASH_PREALLOC] = test_inner_lru_hash_prealloc,
310 [LRU_HASH_LOOKUP] = test_lru_hash_lookup,
311};
312
313static int pre_test(int tasks)
314{
315 int i;
316
317 for (i = 0; i < NR_TESTS; i++) {
318 if (pre_test_funcs[i] && check_test_flags(i)) {
319 int ret = pre_test_funcs[i](tasks);
320
321 if (ret)
322 return ret;
323 }
324 }
325
326 return 0;
327}
328
329static void loop(int cpu)
330{
331 cpu_set_t cpuset;
332 int i;
333
334 CPU_ZERO(&cpuset);
335 CPU_SET(cpu, &cpuset);
336 sched_setaffinity(0, sizeof(cpuset), &cpuset);
337
338 for (i = 0; i < NR_TESTS; i++) {
339 if (check_test_flags(i))
340 test_funcs[i](cpu);
341 }
342}
343
344static void run_perf_test(int tasks)
345{
346 pid_t pid[tasks];
347 int i;
348
349 assert(!pre_test(tasks));
350
351 for (i = 0; i < tasks; i++) {
352 pid[i] = fork();
353 if (pid[i] == 0) {
354 loop(i);
355 exit(0);
356 } else if (pid[i] == -1) {
357 printf("couldn't spawn #%d process\n", i);
358 exit(1);
359 }
360 }
361 for (i = 0; i < tasks; i++) {
362 int status;
363
364 assert(waitpid(pid[i], &status, 0) == pid[i]);
365 assert(status == 0);
366 }
367}
368
369static void fill_lpm_trie(void)
370{
371 struct bpf_lpm_trie_key *key;
372 unsigned long value = 0;
373 unsigned int i;
374 int r;
375
376 key = alloca(sizeof(*key) + 4);
377 key->prefixlen = 32;
378
379 for (i = 0; i < 512; ++i) {
380 key->prefixlen = rand() % 33;
381 key->data[0] = rand() & 0xff;
382 key->data[1] = rand() & 0xff;
383 key->data[2] = rand() & 0xff;
384 key->data[3] = rand() & 0xff;
385 r = bpf_map_update_elem(map_fd[hash_map_alloc_idx],
386 key, &value, 0);
387 assert(!r);
388 }
389
390 key->prefixlen = 32;
391 key->data[0] = 192;
392 key->data[1] = 168;
393 key->data[2] = 0;
394 key->data[3] = 1;
395 value = 128;
396
397 r = bpf_map_update_elem(map_fd[hash_map_alloc_idx], key, &value, 0);
398 assert(!r);
399}
400
401static void fixup_map(struct bpf_object *obj)
402{
403 struct bpf_map *map;
404 int i;
405
406 bpf_object__for_each_map(map, obj) {
407 const char *name = bpf_map__name(map);
408
409 /* Only change the max_entries for the enabled test(s) */
410 for (i = 0; i < NR_TESTS; i++) {
411 if (!strcmp(test_map_names[i], name) &&
412 (check_test_flags(i))) {
413 bpf_map__resize(map, num_map_entries);
414 continue;
415 }
416 }
417 }
418
419 inner_lru_hash_size = num_map_entries;
420}
421
422int main(int argc, char **argv)
423{
424 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
425 int nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
426 struct bpf_link *links[8];
427 struct bpf_program *prog;
428 struct bpf_object *obj;
429 struct bpf_map *map;
430 char filename[256];
431 int i = 0;
432
433 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
434 perror("setrlimit(RLIMIT_MEMLOCK)");
435 return 1;
436 }
437
438 if (argc > 1)
439 test_flags = atoi(argv[1]) ? : test_flags;
440
441 if (argc > 2)
442 nr_cpus = atoi(argv[2]) ? : nr_cpus;
443
444 if (argc > 3)
445 num_map_entries = atoi(argv[3]);
446
447 if (argc > 4)
448 max_cnt = atoi(argv[4]);
449
450 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
451 obj = bpf_object__open_file(filename, NULL);
452 if (libbpf_get_error(obj)) {
453 fprintf(stderr, "ERROR: opening BPF object file failed\n");
454 return 0;
455 }
456
457 map = bpf_object__find_map_by_name(obj, "inner_lru_hash_map");
458 if (libbpf_get_error(map)) {
459 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
460 goto cleanup;
461 }
462
463 inner_lru_hash_size = bpf_map__max_entries(map);
464 if (!inner_lru_hash_size) {
465 fprintf(stderr, "ERROR: failed to get map attribute\n");
466 goto cleanup;
467 }
468
469 /* resize BPF map prior to loading */
470 if (num_map_entries > 0)
471 fixup_map(obj);
472
473 /* load BPF program */
474 if (bpf_object__load(obj)) {
475 fprintf(stderr, "ERROR: loading BPF object file failed\n");
476 goto cleanup;
477 }
478
479 map_fd[0] = bpf_object__find_map_fd_by_name(obj, "array_of_lru_hashs");
480 map_fd[1] = bpf_object__find_map_fd_by_name(obj, "hash_map_alloc");
481 map_fd[2] = bpf_object__find_map_fd_by_name(obj, "lru_hash_lookup_map");
482 if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) {
483 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
484 goto cleanup;
485 }
486
487 bpf_object__for_each_program(prog, obj) {
488 links[i] = bpf_program__attach(prog);
489 if (libbpf_get_error(links[i])) {
490 fprintf(stderr, "ERROR: bpf_program__attach failed\n");
491 links[i] = NULL;
492 goto cleanup;
493 }
494 i++;
495 }
496
497 fill_lpm_trie();
498
499 run_perf_test(nr_cpus);
500
501cleanup:
502 for (i--; i >= 0; i--)
503 bpf_link__destroy(links[i]);
504
505 bpf_object__close(obj);
506 return 0;
507}
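
A usage note for main() above: argv[1] is a bitmask over enum test_type, not a single test index. Selecting only HASH_PREALLOC (bit 0) and ARRAY_LOOKUP (bit 8), for example, means passing 257:

	unsigned int mask = TEST_BIT(HASH_PREALLOC) | TEST_BIT(ARRAY_LOOKUP);
	/* mask == 0x101 == 257, i.e. ./map_perf_test 257 */
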
diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c
new file mode 100644
index 000000000..14b792915
--- /dev/null
+++ b/samples/bpf/offwaketime_kern.c
@@ -0,0 +1,157 @@
1/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <uapi/linux/bpf.h>
8#include <uapi/linux/ptrace.h>
9#include <uapi/linux/perf_event.h>
10#include <linux/version.h>
11#include <linux/sched.h>
12#include <bpf/bpf_helpers.h>
13#include <bpf/bpf_tracing.h>
14
15#define _(P) \
16 ({ \
17 typeof(P) val; \
18 bpf_probe_read_kernel(&val, sizeof(val), &(P)); \
19 val; \
20 })
21
22#define MINBLOCK_US 1
23
24struct key_t {
25 char waker[TASK_COMM_LEN];
26 char target[TASK_COMM_LEN];
27 u32 wret;
28 u32 tret;
29};
30
31struct {
32 __uint(type, BPF_MAP_TYPE_HASH);
33 __type(key, struct key_t);
34 __type(value, u64);
35 __uint(max_entries, 10000);
36} counts SEC(".maps");
37
38struct {
39 __uint(type, BPF_MAP_TYPE_HASH);
40 __type(key, u32);
41 __type(value, u64);
42 __uint(max_entries, 10000);
43} start SEC(".maps");
44
45struct wokeby_t {
46 char name[TASK_COMM_LEN];
47 u32 ret;
48};
49
50struct {
51 __uint(type, BPF_MAP_TYPE_HASH);
52 __type(key, u32);
53 __type(value, struct wokeby_t);
54 __uint(max_entries, 10000);
55} wokeby SEC(".maps");
56
57struct {
58 __uint(type, BPF_MAP_TYPE_STACK_TRACE);
59 __uint(key_size, sizeof(u32));
60 __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
61 __uint(max_entries, 10000);
62} stackmap SEC(".maps");
63
64#define STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
65
66SEC("kprobe/try_to_wake_up")
67int waker(struct pt_regs *ctx)
68{
69 struct task_struct *p = (void *) PT_REGS_PARM1(ctx);
70 struct wokeby_t woke;
71 u32 pid;
72
73 pid = _(p->pid);
74
75 bpf_get_current_comm(&woke.name, sizeof(woke.name));
76 woke.ret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS);
77
78 bpf_map_update_elem(&wokeby, &pid, &woke, BPF_ANY);
79 return 0;
80}
81
82static inline int update_counts(void *ctx, u32 pid, u64 delta)
83{
84 struct wokeby_t *woke;
85 u64 zero = 0, *val;
86 struct key_t key;
87
88 __builtin_memset(&key.waker, 0, sizeof(key.waker));
89 bpf_get_current_comm(&key.target, sizeof(key.target));
90 key.tret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS);
91 key.wret = 0;
92
93 woke = bpf_map_lookup_elem(&wokeby, &pid);
94 if (woke) {
95 key.wret = woke->ret;
96 __builtin_memcpy(&key.waker, woke->name, sizeof(key.waker));
97 bpf_map_delete_elem(&wokeby, &pid);
98 }
99
100 val = bpf_map_lookup_elem(&counts, &key);
101 if (!val) {
102 bpf_map_update_elem(&counts, &key, &zero, BPF_NOEXIST);
103 val = bpf_map_lookup_elem(&counts, &key);
104 if (!val)
105 return 0;
106 }
107 (*val) += delta;
108 return 0;
109}
110
111#if 1
112/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
113struct sched_switch_args {
114 unsigned long long pad;
115 char prev_comm[16];
116 int prev_pid;
117 int prev_prio;
118 long long prev_state;
119 char next_comm[16];
120 int next_pid;
121 int next_prio;
122};
123SEC("tracepoint/sched/sched_switch")
124int oncpu(struct sched_switch_args *ctx)
125{
126 /* record previous thread sleep time */
127 u32 pid = ctx->prev_pid;
128#else
129SEC("kprobe/finish_task_switch")
130int oncpu(struct pt_regs *ctx)
131{
132 struct task_struct *p = (void *) PT_REGS_PARM1(ctx);
133 /* record previous thread sleep time */
134 u32 pid = _(p->pid);
135#endif
136 u64 delta, ts, *tsp;
137
138 ts = bpf_ktime_get_ns();
139 bpf_map_update_elem(&start, &pid, &ts, BPF_ANY);
140
141 /* calculate current thread's delta time */
142 pid = bpf_get_current_pid_tgid();
143 tsp = bpf_map_lookup_elem(&start, &pid);
144 if (!tsp)
145 /* missed start or filtered */
146 return 0;
147
148 delta = bpf_ktime_get_ns() - *tsp;
149 bpf_map_delete_elem(&start, &pid);
150 delta = delta / 1000;
151 if (delta < MINBLOCK_US)
152 return 0;
153
154 return update_counts(ctx, pid, delta);
155}
156char _license[] SEC("license") = "GPL";
157u32 _version SEC("version") = LINUX_VERSION_CODE;
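
The _(P) macro at the top of this file hides a bpf_probe_read_kernel() in a statement expression, so pid = _(p->pid) above expands to roughly the following (a sketch, not a literal preprocessor expansion):

	u32 pid;
	{
		typeof(p->pid) val;

		bpf_probe_read_kernel(&val, sizeof(val), &p->pid);
		pid = val;
	}
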
diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c
new file mode 100644
index 000000000..5734cfdaa
--- /dev/null
+++ b/samples/bpf/offwaketime_user.c
@@ -0,0 +1,160 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2016 Facebook
3 */
4#include <stdio.h>
5#include <unistd.h>
6#include <stdlib.h>
7#include <signal.h>
8#include <linux/perf_event.h>
9#include <errno.h>
10#include <stdbool.h>
11#include <sys/resource.h>
12#include <bpf/libbpf.h>
13#include <bpf/bpf.h>
14#include "trace_helpers.h"
15
16#define PRINT_RAW_ADDR 0
17
18/* counts, stackmap */
19static int map_fd[2];
20
21static void print_ksym(__u64 addr)
22{
23 struct ksym *sym;
24
25 if (!addr)
26 return;
27 sym = ksym_search(addr);
28 if (!sym) {
29 printf("ksym not found. Is kallsyms loaded?\n");
30 return;
31 }
32
33 if (PRINT_RAW_ADDR)
34 printf("%s/%llx;", sym->name, addr);
35 else
36 printf("%s;", sym->name);
37}
38
39#define TASK_COMM_LEN 16
40
41struct key_t {
42 char waker[TASK_COMM_LEN];
43 char target[TASK_COMM_LEN];
44 __u32 wret;
45 __u32 tret;
46};
47
48static void print_stack(struct key_t *key, __u64 count)
49{
50 __u64 ip[PERF_MAX_STACK_DEPTH] = {};
51 static bool warned;
52 int i;
53
54 printf("%s;", key->target);
55 if (bpf_map_lookup_elem(map_fd[1], &key->tret, ip) != 0) {
56 printf("---;");
57 } else {
58 for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--)
59 print_ksym(ip[i]);
60 }
61 printf("-;");
62 if (bpf_map_lookup_elem(map_fd[1], &key->wret, ip) != 0) {
63 printf("---;");
64 } else {
65 for (i = 0; i < PERF_MAX_STACK_DEPTH; i++)
66 print_ksym(ip[i]);
67 }
68 printf(";%s %lld\n", key->waker, count);
69
70 if ((key->tret == -EEXIST || key->wret == -EEXIST) && !warned) {
71 printf("stackmap collisions seen. Consider increasing size\n");
72 warned = true;
73 } else if (((int)(key->tret) < 0 || (int)(key->wret) < 0)) {
74 printf("err stackid %d %d\n", key->tret, key->wret);
75 }
76}
77
78static void print_stacks(int fd)
79{
80 struct key_t key = {}, next_key;
81 __u64 value;
82
83 while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
84 bpf_map_lookup_elem(fd, &next_key, &value);
85 print_stack(&next_key, value);
86 key = next_key;
87 }
88}
89
90static void int_exit(int sig)
91{
92 print_stacks(map_fd[0]);
93 exit(0);
94}
95
96int main(int argc, char **argv)
97{
98 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
99 struct bpf_object *obj = NULL;
100 struct bpf_link *links[2];
101 struct bpf_program *prog;
102 int delay = 1, i = 0;
103 char filename[256];
104
105 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
106 perror("setrlimit(RLIMIT_MEMLOCK)");
107 return 1;
108 }
109
110 if (load_kallsyms()) {
111 printf("failed to process /proc/kallsyms\n");
112 return 2;
113 }
114
115 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
116 obj = bpf_object__open_file(filename, NULL);
117 if (libbpf_get_error(obj)) {
118 fprintf(stderr, "ERROR: opening BPF object file failed\n");
119 obj = NULL;
120 goto cleanup;
121 }
122
123 /* load BPF program */
124 if (bpf_object__load(obj)) {
125 fprintf(stderr, "ERROR: loading BPF object file failed\n");
126 goto cleanup;
127 }
128
129 map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counts");
130 map_fd[1] = bpf_object__find_map_fd_by_name(obj, "stackmap");
131 if (map_fd[0] < 0 || map_fd[1] < 0) {
132 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
133 goto cleanup;
134 }
135
136 signal(SIGINT, int_exit);
137 signal(SIGTERM, int_exit);
138
139 bpf_object__for_each_program(prog, obj) {
140 links[i] = bpf_program__attach(prog);
141 if (libbpf_get_error(links[i])) {
142 fprintf(stderr, "ERROR: bpf_program__attach failed\n");
143 links[i] = NULL;
144 goto cleanup;
145 }
146 i++;
147 }
148
149 if (argc > 1)
150 delay = atoi(argv[1]);
151 sleep(delay);
152 print_stacks(map_fd[0]);
153
154cleanup:
155 for (i--; i >= 0; i--)
156 bpf_link__destroy(links[i]);
157
158 bpf_object__close(obj);
159 return 0;
160}
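
print_stack() above emits one semicolon-folded record per counts entry: the target comm, the target's off-CPU stack, a '-' separator, the waker's stack, then the waker comm and the summed blocked time in microseconds, a format usable with folded-stack flame graph tooling. A hypothetical record (all symbol names invented for illustration):

	sshd;do_syscall_64;__x64_sys_futex;futex_wait;schedule;-;try_to_wake_up;futex_wake;do_futex;;sshd 1250
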
diff --git a/samples/bpf/parse_ldabs.c b/samples/bpf/parse_ldabs.c
new file mode 100644
index 000000000..c6f65f90a
--- /dev/null
+++ b/samples/bpf/parse_ldabs.c
@@ -0,0 +1,43 @@
1/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#define KBUILD_MODNAME "foo"
8#include <linux/ip.h>
9#include <linux/ipv6.h>
10#include <linux/in.h>
11#include <linux/tcp.h>
12#include <linux/udp.h>
13#include <uapi/linux/bpf.h>
14#include <bpf/bpf_helpers.h>
15#include "bpf_legacy.h"
16
17#define DEFAULT_PKTGEN_UDP_PORT 9
18#define IP_MF 0x2000
19#define IP_OFFSET 0x1FFF
20
21static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff)
22{
23 return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off))
24 & (IP_MF | IP_OFFSET);
25}
26
27SEC("ldabs")
28int handle_ingress(struct __sk_buff *skb)
29{
30 __u64 troff = ETH_HLEN + sizeof(struct iphdr);
31
32 if (load_half(skb, offsetof(struct ethhdr, h_proto)) != ETH_P_IP)
33 return 0;
34 if (load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)) != IPPROTO_UDP ||
35 load_byte(skb, ETH_HLEN) != 0x45)
36 return 0;
37 if (ip_is_fragment(skb, ETH_HLEN))
38 return 0;
39 if (load_half(skb, troff + offsetof(struct udphdr, dest)) == DEFAULT_PKTGEN_UDP_PORT)
40 return TC_ACT_SHOT;
41 return 0;
42}
43char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/parse_simple.c b/samples/bpf/parse_simple.c
new file mode 100644
index 000000000..4a486cb1e
--- /dev/null
+++ b/samples/bpf/parse_simple.c
@@ -0,0 +1,49 @@
1/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#define KBUILD_MODNAME "foo"
8#include <linux/ip.h>
9#include <linux/ipv6.h>
10#include <linux/in.h>
11#include <linux/tcp.h>
12#include <linux/udp.h>
13#include <uapi/linux/bpf.h>
14#include <net/ip.h>
15#include <bpf/bpf_helpers.h>
16
17#define DEFAULT_PKTGEN_UDP_PORT 9
18
19/* copy of 'struct ethhdr' without __packed */
20struct eth_hdr {
21 unsigned char h_dest[ETH_ALEN];
22 unsigned char h_source[ETH_ALEN];
23 unsigned short h_proto;
24};
25
26SEC("simple")
27int handle_ingress(struct __sk_buff *skb)
28{
29 void *data = (void *)(long)skb->data;
30 struct eth_hdr *eth = data;
31 struct iphdr *iph = data + sizeof(*eth);
32 struct udphdr *udp = data + sizeof(*eth) + sizeof(*iph);
33 void *data_end = (void *)(long)skb->data_end;
34
35 /* single length check */
36 if (data + sizeof(*eth) + sizeof(*iph) + sizeof(*udp) > data_end)
37 return 0;
38
39 if (eth->h_proto != htons(ETH_P_IP))
40 return 0;
41 if (iph->protocol != IPPROTO_UDP || iph->ihl != 5)
42 return 0;
43 if (ip_is_fragment(iph))
44 return 0;
45 if (udp->dest == htons(DEFAULT_PKTGEN_UDP_PORT))
46 return TC_ACT_SHOT;
47 return 0;
48}
49char _license[] SEC("license") = "GPL";
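
The "single length check" comment above is the heart of direct packet access: the verifier only accepts a dereference through a pointer derived from skb->data if that pointer was compared against skb->data_end on every path leading to it. The pattern in isolation, reusing the eth_hdr above (a sketch):

	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;
	struct eth_hdr *eth = data;

	if (data + sizeof(*eth) > data_end)
		return 0;	/* without this, the verifier rejects the load below */
	if (eth->h_proto != htons(ETH_P_IP))
		return 0;
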
diff --git a/samples/bpf/parse_varlen.c b/samples/bpf/parse_varlen.c
new file mode 100644
index 000000000..d8623846e
--- /dev/null
+++ b/samples/bpf/parse_varlen.c
@@ -0,0 +1,150 @@
1/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#define KBUILD_MODNAME "foo"
8#include <linux/if_ether.h>
9#include <linux/if_vlan.h>
10#include <linux/ip.h>
11#include <linux/ipv6.h>
12#include <linux/in.h>
13#include <linux/tcp.h>
14#include <linux/udp.h>
15#include <uapi/linux/bpf.h>
16#include <net/ip.h>
17#include <bpf/bpf_helpers.h>
18
19#define DEFAULT_PKTGEN_UDP_PORT 9
20#define DEBUG 0
21
22static int tcp(void *data, uint64_t tp_off, void *data_end)
23{
24 struct tcphdr *tcp = data + tp_off;
25
26 if (tcp + 1 > data_end)
27 return 0;
28 if (tcp->dest == htons(80) || tcp->source == htons(80))
29 return TC_ACT_SHOT;
30 return 0;
31}
32
33static int udp(void *data, uint64_t tp_off, void *data_end)
34{
35 struct udphdr *udp = data + tp_off;
36
37 if (udp + 1 > data_end)
38 return 0;
39 if (udp->dest == htons(DEFAULT_PKTGEN_UDP_PORT) ||
40 udp->source == htons(DEFAULT_PKTGEN_UDP_PORT)) {
41 if (DEBUG) {
42 char fmt[] = "udp port 9 indeed\n";
43
44 bpf_trace_printk(fmt, sizeof(fmt));
45 }
46 return TC_ACT_SHOT;
47 }
48 return 0;
49}
50
51static int parse_ipv4(void *data, uint64_t nh_off, void *data_end)
52{
53 struct iphdr *iph;
54 uint64_t ihl_len;
55
56 iph = data + nh_off;
57 if (iph + 1 > data_end)
58 return 0;
59
60 if (ip_is_fragment(iph))
61 return 0;
62 ihl_len = iph->ihl * 4;
63
64 if (iph->protocol == IPPROTO_IPIP) {
65 iph = data + nh_off + ihl_len;
66 if (iph + 1 > data_end)
67 return 0;
68 ihl_len += iph->ihl * 4;
69 }
70
71 if (iph->protocol == IPPROTO_TCP)
72 return tcp(data, nh_off + ihl_len, data_end);
73 else if (iph->protocol == IPPROTO_UDP)
74 return udp(data, nh_off + ihl_len, data_end);
75 return 0;
76}
77
78static int parse_ipv6(void *data, uint64_t nh_off, void *data_end)
79{
80 struct ipv6hdr *ip6h;
81 struct iphdr *iph;
82 uint64_t ihl_len = sizeof(struct ipv6hdr);
83 uint64_t nexthdr;
84
85 ip6h = data + nh_off;
86 if (ip6h + 1 > data_end)
87 return 0;
88
89 nexthdr = ip6h->nexthdr;
90
91 if (nexthdr == IPPROTO_IPIP) {
92 iph = data + nh_off + ihl_len;
93 if (iph + 1 > data_end)
94 return 0;
95 ihl_len += iph->ihl * 4;
96 nexthdr = iph->protocol;
97 } else if (nexthdr == IPPROTO_IPV6) {
98 ip6h = data + nh_off + ihl_len;
99 if (ip6h + 1 > data_end)
100 return 0;
101 ihl_len += sizeof(struct ipv6hdr);
102 nexthdr = ip6h->nexthdr;
103 }
104
105 if (nexthdr == IPPROTO_TCP)
106 return tcp(data, nh_off + ihl_len, data_end);
107 else if (nexthdr == IPPROTO_UDP)
108 return udp(data, nh_off + ihl_len, data_end);
109 return 0;
110}
111
112SEC("varlen")
113int handle_ingress(struct __sk_buff *skb)
114{
115 void *data = (void *)(long)skb->data;
116 struct ethhdr *eth = data;
117 void *data_end = (void *)(long)skb->data_end;
118 uint64_t h_proto, nh_off;
119
120 nh_off = sizeof(*eth);
121 if (data + nh_off > data_end)
122 return 0;
123
124 h_proto = eth->h_proto;
125
126	if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
127 struct vlan_hdr *vhdr;
128
129 vhdr = data + nh_off;
130 nh_off += sizeof(struct vlan_hdr);
131 if (data + nh_off > data_end)
132 return 0;
133 h_proto = vhdr->h_vlan_encapsulated_proto;
134 }
135	if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
136 struct vlan_hdr *vhdr;
137
138 vhdr = data + nh_off;
139 nh_off += sizeof(struct vlan_hdr);
140 if (data + nh_off > data_end)
141 return 0;
142 h_proto = vhdr->h_vlan_encapsulated_proto;
143 }
144 if (h_proto == htons(ETH_P_IP))
145 return parse_ipv4(data, nh_off, data_end);
146 else if (h_proto == htons(ETH_P_IPV6))
147 return parse_ipv6(data, nh_off, data_end);
148 return 0;
149}
150char _license[] SEC("license") = "GPL";
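
A byte-order convention worth spelling out for this file: fields read through skb->data arrive as raw network-order bytes, so constants must be wrapped in htons() for comparison, whereas the load_half() helper used by parse_ldabs.c already converts to host order:

	/* direct packet access: the field is big-endian on the wire */
	if (eth->h_proto == htons(ETH_P_IP))
		/* ... */;

	/* ld_abs helper: load_half() returns a host-order value */
	if (load_half(skb, offsetof(struct ethhdr, h_proto)) == ETH_P_IP)
		/* ... */;
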
diff --git a/samples/bpf/run_cookie_uid_helper_example.sh b/samples/bpf/run_cookie_uid_helper_example.sh
new file mode 100755
index 000000000..fc6bc0451
--- /dev/null
+++ b/samples/bpf/run_cookie_uid_helper_example.sh
@@ -0,0 +1,15 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3local_dir="$(pwd)"
4root_dir=$local_dir/../..
5mnt_dir=$(mktemp -d --tmp)
6
7on_exit() {
8 iptables -D OUTPUT -m bpf --object-pinned ${mnt_dir}/bpf_prog -j ACCEPT
9 umount ${mnt_dir}
10 rm -r ${mnt_dir}
11}
12
13trap on_exit EXIT
14mount -t bpf bpf ${mnt_dir}
15./per_socket_stats_example ${mnt_dir}/bpf_prog $1
diff --git a/samples/bpf/sampleip_kern.c b/samples/bpf/sampleip_kern.c
new file mode 100644
index 000000000..f24806ac2
--- /dev/null
+++ b/samples/bpf/sampleip_kern.c
@@ -0,0 +1,39 @@
1/* Copyright 2016 Netflix, Inc.
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/version.h>
8#include <linux/ptrace.h>
9#include <uapi/linux/bpf.h>
10#include <uapi/linux/bpf_perf_event.h>
11#include <bpf/bpf_helpers.h>
12#include <bpf/bpf_tracing.h>
13
14#define MAX_IPS 8192
15
16struct {
17 __uint(type, BPF_MAP_TYPE_HASH);
18 __type(key, u64);
19 __type(value, u32);
20 __uint(max_entries, MAX_IPS);
21} ip_map SEC(".maps");
22
23SEC("perf_event")
24int do_sample(struct bpf_perf_event_data *ctx)
25{
26 u64 ip;
27 u32 *value, init_val = 1;
28
29 ip = PT_REGS_IP(&ctx->regs);
30 value = bpf_map_lookup_elem(&ip_map, &ip);
31 if (value)
32 *value += 1;
33 else
34		/* E2BIG is not handled, to keep this example simple */
35 bpf_map_update_elem(&ip_map, &ip, &init_val, BPF_NOEXIST);
36
37 return 0;
38}
39char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c
new file mode 100644
index 000000000..921c505bb
--- /dev/null
+++ b/samples/bpf/sampleip_user.c
@@ -0,0 +1,227 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * sampleip: sample instruction pointer and frequency count in a BPF map.
4 *
5 * Copyright 2016 Netflix, Inc.
6 */
7#include <stdio.h>
8#include <stdlib.h>
9#include <unistd.h>
10#include <errno.h>
11#include <signal.h>
12#include <string.h>
13#include <linux/perf_event.h>
14#include <linux/ptrace.h>
15#include <linux/bpf.h>
16#include <bpf/bpf.h>
17#include <bpf/libbpf.h>
18#include "perf-sys.h"
19#include "trace_helpers.h"
20
21#define DEFAULT_FREQ 99
22#define DEFAULT_SECS 5
23#define MAX_IPS 8192
24#define PAGE_OFFSET 0xffff880000000000
25
26static int map_fd;
27static int nr_cpus;
28
29static void usage(void)
30{
31 printf("USAGE: sampleip [-F freq] [duration]\n");
32 printf(" -F freq # sample frequency (Hertz), default 99\n");
33 printf(" duration # sampling duration (seconds), default 5\n");
34}
35
36static int sampling_start(int freq, struct bpf_program *prog,
37 struct bpf_link *links[])
38{
39 int i, pmu_fd;
40
41 struct perf_event_attr pe_sample_attr = {
42 .type = PERF_TYPE_SOFTWARE,
43 .freq = 1,
44 .sample_period = freq,
45 .config = PERF_COUNT_SW_CPU_CLOCK,
46 .inherit = 1,
47 };
48
49 for (i = 0; i < nr_cpus; i++) {
50 pmu_fd = sys_perf_event_open(&pe_sample_attr, -1 /* pid */, i,
51 -1 /* group_fd */, 0 /* flags */);
52 if (pmu_fd < 0) {
53 fprintf(stderr, "ERROR: Initializing perf sampling\n");
54 return 1;
55 }
56 links[i] = bpf_program__attach_perf_event(prog, pmu_fd);
57 if (libbpf_get_error(links[i])) {
58 fprintf(stderr, "ERROR: Attach perf event\n");
59 links[i] = NULL;
60 close(pmu_fd);
61 return 1;
62 }
63 }
64
65 return 0;
66}
67
68static void sampling_end(struct bpf_link *links[])
69{
70 int i;
71
72 for (i = 0; i < nr_cpus; i++)
73 bpf_link__destroy(links[i]);
74}
75
76struct ipcount {
77 __u64 ip;
78 __u32 count;
79};
80
81/* used for sorting */
82struct ipcount counts[MAX_IPS];
83
84static int count_cmp(const void *p1, const void *p2)
85{
86 return ((struct ipcount *)p1)->count - ((struct ipcount *)p2)->count;
87}
88
89static void print_ip_map(int fd)
90{
91 struct ksym *sym;
92 __u64 key, next_key;
93 __u32 value;
94 int i, max;
95
96 printf("%-19s %-32s %s\n", "ADDR", "KSYM", "COUNT");
97
98 /* fetch IPs and counts */
99 key = 0, i = 0;
100 while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
101 bpf_map_lookup_elem(fd, &next_key, &value);
102 counts[i].ip = next_key;
103 counts[i++].count = value;
104 key = next_key;
105 }
106 max = i;
107
108 /* sort and print */
109 qsort(counts, max, sizeof(struct ipcount), count_cmp);
110 for (i = 0; i < max; i++) {
111 if (counts[i].ip > PAGE_OFFSET) {
112 sym = ksym_search(counts[i].ip);
113 if (!sym) {
114 printf("ksym not found. Is kallsyms loaded?\n");
115 continue;
116 }
117
118 printf("0x%-17llx %-32s %u\n", counts[i].ip, sym->name,
119 counts[i].count);
120 } else {
121 printf("0x%-17llx %-32s %u\n", counts[i].ip, "(user)",
122 counts[i].count);
123 }
124 }
125
126 if (max == MAX_IPS) {
127 printf("WARNING: IP hash was full (max %d entries); ", max);
128 printf("may have dropped samples\n");
129 }
130}
131
132static void int_exit(int sig)
133{
134 printf("\n");
135 print_ip_map(map_fd);
136 exit(0);
137}
138
139int main(int argc, char **argv)
140{
141 int opt, freq = DEFAULT_FREQ, secs = DEFAULT_SECS, error = 1;
142 struct bpf_object *obj = NULL;
143 struct bpf_program *prog;
144 struct bpf_link **links;
145 char filename[256];
146
147 /* process arguments */
148 while ((opt = getopt(argc, argv, "F:h")) != -1) {
149 switch (opt) {
150 case 'F':
151 freq = atoi(optarg);
152 break;
153 case 'h':
154 default:
155 usage();
156 return 0;
157 }
158 }
159 if (argc - optind == 1)
160 secs = atoi(argv[optind]);
161 if (freq == 0 || secs == 0) {
162 usage();
163 return 1;
164 }
165
166 /* initialize kernel symbol translation */
167 if (load_kallsyms()) {
168 fprintf(stderr, "ERROR: loading /proc/kallsyms\n");
169 return 2;
170 }
171
172 /* create perf FDs for each CPU */
173 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
174 links = calloc(nr_cpus, sizeof(struct bpf_link *));
175 if (!links) {
176		fprintf(stderr, "ERROR: failed to allocate links\n");
177 goto cleanup;
178 }
179
180 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
181 obj = bpf_object__open_file(filename, NULL);
182 if (libbpf_get_error(obj)) {
183 fprintf(stderr, "ERROR: opening BPF object file failed\n");
184 obj = NULL;
185 goto cleanup;
186 }
187
188 prog = bpf_object__find_program_by_name(obj, "do_sample");
189 if (!prog) {
190 fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
191 goto cleanup;
192 }
193
194 /* load BPF program */
195 if (bpf_object__load(obj)) {
196 fprintf(stderr, "ERROR: loading BPF object file failed\n");
197 goto cleanup;
198 }
199
200 map_fd = bpf_object__find_map_fd_by_name(obj, "ip_map");
201 if (map_fd < 0) {
202 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
203 goto cleanup;
204 }
205
206 signal(SIGINT, int_exit);
207 signal(SIGTERM, int_exit);
208
209 /* do sampling */
210 printf("Sampling at %d Hertz for %d seconds. Ctrl-C also ends.\n",
211 freq, secs);
212 if (sampling_start(freq, prog, links) != 0)
213 goto cleanup;
214
215 sleep(secs);
216 error = 0;
217
218cleanup:
219 sampling_end(links);
220 /* output sample counts */
221 if (!error)
222 print_ip_map(map_fd);
223
224 free(links);
225 bpf_object__close(obj);
226 return error;
227}
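
One subtlety in sampling_start() above: perf_event_attr keeps sample_period and sample_freq in the same union, so with the .freq bit set, the .sample_period = freq assignment is read by the kernel as a frequency in Hz. The same attr written with the union member named explicitly:

	struct perf_event_attr attr = {
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_CPU_CLOCK,
		.freq = 1,			/* the union below holds a frequency */
		.sample_freq = DEFAULT_FREQ,	/* same storage as .sample_period */
		.inherit = 1,
	};
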
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
new file mode 100644
index 000000000..00aae1d33
--- /dev/null
+++ b/samples/bpf/sock_example.c
@@ -0,0 +1,106 @@
1/* eBPF example program:
2 * - creates arraymap in kernel with key 4 bytes and value 8 bytes
3 *
4 * - loads eBPF program:
5 * r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)];
6 * *(u32*)(fp - 4) = r0;
7 * // assuming packet is IPv4, lookup ip->proto in a map
8 * value = bpf_map_lookup_elem(map_fd, fp - 4);
9 * if (value)
10 * (*(u64*)value) += 1;
11 *
12 * - attaches this program to a raw socket on the loopback interface "lo"
13 *
14 * - every second user space reads map[tcp], map[udp], map[icmp] to see
15 * how many packets of each protocol were seen on "lo"
16 */
17#include <stdio.h>
18#include <unistd.h>
19#include <assert.h>
20#include <linux/bpf.h>
21#include <string.h>
22#include <stdlib.h>
23#include <errno.h>
24#include <sys/socket.h>
25#include <arpa/inet.h>
26#include <linux/if_ether.h>
27#include <linux/ip.h>
28#include <stddef.h>
29#include <bpf/bpf.h>
30#include "bpf_insn.h"
31#include "sock_example.h"
32
33char bpf_log_buf[BPF_LOG_BUF_SIZE];
34
35static int test_sock(void)
36{
37 int sock = -1, map_fd, prog_fd, i, key;
38 long long value = 0, tcp_cnt, udp_cnt, icmp_cnt;
39
40 map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value),
41 256, 0);
42 if (map_fd < 0) {
43 printf("failed to create map '%s'\n", strerror(errno));
44 goto cleanup;
45 }
46
47 struct bpf_insn prog[] = {
48 BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
49 BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol) /* R0 = ip->proto */),
50 BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
51 BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
52 BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
53 BPF_LD_MAP_FD(BPF_REG_1, map_fd),
54 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
55 BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
56 BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
57 BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
58 BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
59 BPF_EXIT_INSN(),
60 };
61 size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
62
63 prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, insns_cnt,
64 "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE);
65 if (prog_fd < 0) {
66 printf("failed to load prog '%s'\n", strerror(errno));
67 goto cleanup;
68 }
69
70 sock = open_raw_sock("lo");
71
72 if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
73 sizeof(prog_fd)) < 0) {
74 printf("setsockopt %s\n", strerror(errno));
75 goto cleanup;
76 }
77
78 for (i = 0; i < 10; i++) {
79 key = IPPROTO_TCP;
80 assert(bpf_map_lookup_elem(map_fd, &key, &tcp_cnt) == 0);
81
82 key = IPPROTO_UDP;
83 assert(bpf_map_lookup_elem(map_fd, &key, &udp_cnt) == 0);
84
85 key = IPPROTO_ICMP;
86 assert(bpf_map_lookup_elem(map_fd, &key, &icmp_cnt) == 0);
87
88 printf("TCP %lld UDP %lld ICMP %lld packets\n",
89 tcp_cnt, udp_cnt, icmp_cnt);
90 sleep(1);
91 }
92
93cleanup:
94 /* maps, programs, raw sockets will auto cleanup on process exit */
95 return 0;
96}
97
98int main(void)
99{
100 FILE *f;
101
102 f = popen("ping -4 -c5 localhost", "r");
103 (void)f;
104
105 return test_sock();
106}
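
The raw instruction array in test_sock() is the hand-assembled form of a small socket filter. Written in restricted C, in the style of sockex1_kern.c below (count_protocols and my_map are illustrative names; my_map stands for the 256-entry array created with bpf_create_map() above):

	SEC("socket")
	int count_protocols(struct __sk_buff *skb)
	{
		int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
		long *value;

		value = bpf_map_lookup_elem(&my_map, &index);
		if (value)
			__sync_fetch_and_add(value, 1);	/* the BPF_XADD above */

		return 0;
	}
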
diff --git a/samples/bpf/sock_example.h b/samples/bpf/sock_example.h
new file mode 100644
index 000000000..a27d7579b
--- /dev/null
+++ b/samples/bpf/sock_example.h
@@ -0,0 +1,35 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#include <stdlib.h>
3#include <stdio.h>
4#include <linux/unistd.h>
5#include <unistd.h>
6#include <string.h>
7#include <errno.h>
8#include <linux/if_ether.h>
9#include <net/if.h>
10#include <linux/if_packet.h>
11#include <arpa/inet.h>
12
13static inline int open_raw_sock(const char *name)
14{
15 struct sockaddr_ll sll;
16 int sock;
17
18 sock = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC, htons(ETH_P_ALL));
19 if (sock < 0) {
20 printf("cannot create raw socket\n");
21 return -1;
22 }
23
24 memset(&sll, 0, sizeof(sll));
25 sll.sll_family = AF_PACKET;
26 sll.sll_ifindex = if_nametoindex(name);
27 sll.sll_protocol = htons(ETH_P_ALL);
28 if (bind(sock, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
29 printf("bind to %s: %s\n", name, strerror(errno));
30 close(sock);
31 return -1;
32 }
33
34 return sock;
35}
diff --git a/samples/bpf/sock_flags_kern.c b/samples/bpf/sock_flags_kern.c
new file mode 100644
index 000000000..6d0ac7569
--- /dev/null
+++ b/samples/bpf/sock_flags_kern.c
@@ -0,0 +1,49 @@
1#include <uapi/linux/bpf.h>
2#include <linux/socket.h>
3#include <linux/net.h>
4#include <uapi/linux/in.h>
5#include <uapi/linux/in6.h>
6#include <bpf/bpf_helpers.h>
7
8SEC("cgroup/sock1")
9int bpf_prog1(struct bpf_sock *sk)
10{
11 char fmt[] = "socket: family %d type %d protocol %d\n";
12 char fmt2[] = "socket: uid %u gid %u\n";
13 __u64 gid_uid = bpf_get_current_uid_gid();
14 __u32 uid = gid_uid & 0xffffffff;
15 __u32 gid = gid_uid >> 32;
16
17 bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol);
18 bpf_trace_printk(fmt2, sizeof(fmt2), uid, gid);
19
20 /* block PF_INET6, SOCK_RAW, IPPROTO_ICMPV6 sockets
21	 * i.e., make ping6 fail
22 */
23 if (sk->family == PF_INET6 &&
24 sk->type == SOCK_RAW &&
25 sk->protocol == IPPROTO_ICMPV6)
26 return 0;
27
28 return 1;
29}
30
31SEC("cgroup/sock2")
32int bpf_prog2(struct bpf_sock *sk)
33{
34 char fmt[] = "socket: family %d type %d protocol %d\n";
35
36 bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol);
37
38 /* block PF_INET, SOCK_RAW, IPPROTO_ICMP sockets
39	 * i.e., make ping fail
40 */
41 if (sk->family == PF_INET &&
42 sk->type == SOCK_RAW &&
43 sk->protocol == IPPROTO_ICMP)
44 return 0;
45
46 return 1;
47}
48
49char _license[] SEC("license") = "GPL";
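
Nothing in this file attaches the two programs; as a hedged sketch, user space could attach bpf_prog1 with libbpf's bpf_prog_attach(), given a cgroup v2 directory fd (the path here is hypothetical):

	#include <fcntl.h>
	#include <bpf/bpf.h>

	int cg_fd = open("/sys/fs/cgroup/unified/foo", O_RDONLY);

	/* prog_fd: fd of the loaded "cgroup/sock1" program */
	if (bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE, 0))
		perror("bpf_prog_attach");
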
diff --git a/samples/bpf/sockex1_kern.c b/samples/bpf/sockex1_kern.c
new file mode 100644
index 000000000..431c95646
--- /dev/null
+++ b/samples/bpf/sockex1_kern.c
@@ -0,0 +1,30 @@
1#include <uapi/linux/bpf.h>
2#include <uapi/linux/if_ether.h>
3#include <uapi/linux/if_packet.h>
4#include <uapi/linux/ip.h>
5#include <bpf/bpf_helpers.h>
6#include "bpf_legacy.h"
7
8struct {
9 __uint(type, BPF_MAP_TYPE_ARRAY);
10 __type(key, u32);
11 __type(value, long);
12 __uint(max_entries, 256);
13} my_map SEC(".maps");
14
15SEC("socket1")
16int bpf_prog1(struct __sk_buff *skb)
17{
18 int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
19 long *value;
20
21 if (skb->pkt_type != PACKET_OUTGOING)
22 return 0;
23
24 value = bpf_map_lookup_elem(&my_map, &index);
25 if (value)
26 __sync_fetch_and_add(value, skb->len);
27
28 return 0;
29}
30char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c
new file mode 100644
index 000000000..3c8372287
--- /dev/null
+++ b/samples/bpf/sockex1_user.c
@@ -0,0 +1,54 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <stdio.h>
3#include <assert.h>
4#include <linux/bpf.h>
5#include <bpf/bpf.h>
6#include <bpf/libbpf.h>
7#include "sock_example.h"
8#include <unistd.h>
9#include <arpa/inet.h>
10
11int main(int ac, char **argv)
12{
13 struct bpf_object *obj;
14 int map_fd, prog_fd;
15 char filename[256];
16 int i, sock;
17 FILE *f;
18
19 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
20
21 if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER,
22 &obj, &prog_fd))
23 return 1;
24
25 map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
26
27 sock = open_raw_sock("lo");
28
29 assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
30 sizeof(prog_fd)) == 0);
31
32 f = popen("ping -4 -c5 localhost", "r");
33 (void) f;
34
35 for (i = 0; i < 5; i++) {
36 long long tcp_cnt, udp_cnt, icmp_cnt;
37 int key;
38
39 key = IPPROTO_TCP;
40 assert(bpf_map_lookup_elem(map_fd, &key, &tcp_cnt) == 0);
41
42 key = IPPROTO_UDP;
43 assert(bpf_map_lookup_elem(map_fd, &key, &udp_cnt) == 0);
44
45 key = IPPROTO_ICMP;
46 assert(bpf_map_lookup_elem(map_fd, &key, &icmp_cnt) == 0);
47
48 printf("TCP %lld UDP %lld ICMP %lld bytes\n",
49 tcp_cnt, udp_cnt, icmp_cnt);
50 sleep(1);
51 }
52
53 return 0;
54}
diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c
new file mode 100644
index 000000000..b7997541f
--- /dev/null
+++ b/samples/bpf/sockex2_kern.c
@@ -0,0 +1,223 @@
1#include <uapi/linux/bpf.h>
2#include <uapi/linux/in.h>
3#include <uapi/linux/if.h>
4#include <uapi/linux/if_ether.h>
5#include <uapi/linux/ip.h>
6#include <uapi/linux/ipv6.h>
7#include <uapi/linux/if_tunnel.h>
8#include <bpf/bpf_helpers.h>
9#include "bpf_legacy.h"
10#define IP_MF 0x2000
11#define IP_OFFSET 0x1FFF
12
13struct vlan_hdr {
14 __be16 h_vlan_TCI;
15 __be16 h_vlan_encapsulated_proto;
16};
17
18struct flow_key_record {
19 __be32 src;
20 __be32 dst;
21 union {
22 __be32 ports;
23 __be16 port16[2];
24 };
25 __u16 thoff;
26 __u8 ip_proto;
27};
28
29static inline int proto_ports_offset(__u64 proto)
30{
31 switch (proto) {
32 case IPPROTO_TCP:
33 case IPPROTO_UDP:
34 case IPPROTO_DCCP:
35 case IPPROTO_ESP:
36 case IPPROTO_SCTP:
37 case IPPROTO_UDPLITE:
38 return 0;
39 case IPPROTO_AH:
40 return 4;
41 default:
42 return 0;
43 }
44}
45
46static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff)
47{
48 return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off))
49 & (IP_MF | IP_OFFSET);
50}
51
52static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
53{
54 __u64 w0 = load_word(ctx, off);
55 __u64 w1 = load_word(ctx, off + 4);
56 __u64 w2 = load_word(ctx, off + 8);
57 __u64 w3 = load_word(ctx, off + 12);
58
59 return (__u32)(w0 ^ w1 ^ w2 ^ w3);
60}
61
62static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
63 struct flow_key_record *flow)
64{
65 __u64 verlen;
66
67 if (unlikely(ip_is_fragment(skb, nhoff)))
68 *ip_proto = 0;
69 else
70 *ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol));
71
72 if (*ip_proto != IPPROTO_GRE) {
73 flow->src = load_word(skb, nhoff + offsetof(struct iphdr, saddr));
74 flow->dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr));
75 }
76
77 verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/);
78 if (likely(verlen == 0x45))
79 nhoff += 20;
80 else
81 nhoff += (verlen & 0xF) << 2;
82
83 return nhoff;
84}
85
86static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto,
87 struct flow_key_record *flow)
88{
89 *ip_proto = load_byte(skb,
90 nhoff + offsetof(struct ipv6hdr, nexthdr));
91 flow->src = ipv6_addr_hash(skb,
92 nhoff + offsetof(struct ipv6hdr, saddr));
93 flow->dst = ipv6_addr_hash(skb,
94 nhoff + offsetof(struct ipv6hdr, daddr));
95 nhoff += sizeof(struct ipv6hdr);
96
97 return nhoff;
98}
99
100static inline bool flow_dissector(struct __sk_buff *skb,
101 struct flow_key_record *flow)
102{
103 __u64 nhoff = ETH_HLEN;
104 __u64 ip_proto;
105 __u64 proto = load_half(skb, 12);
106 int poff;
107
108 if (proto == ETH_P_8021AD) {
109 proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
110 h_vlan_encapsulated_proto));
111 nhoff += sizeof(struct vlan_hdr);
112 }
113
114 if (proto == ETH_P_8021Q) {
115 proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
116 h_vlan_encapsulated_proto));
117 nhoff += sizeof(struct vlan_hdr);
118 }
119
120 if (likely(proto == ETH_P_IP))
121 nhoff = parse_ip(skb, nhoff, &ip_proto, flow);
122 else if (proto == ETH_P_IPV6)
123 nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow);
124 else
125 return false;
126
127 switch (ip_proto) {
128 case IPPROTO_GRE: {
129 struct gre_hdr {
130 __be16 flags;
131 __be16 proto;
132 };
133
134 __u64 gre_flags = load_half(skb,
135 nhoff + offsetof(struct gre_hdr, flags));
136 __u64 gre_proto = load_half(skb,
137 nhoff + offsetof(struct gre_hdr, proto));
138
139 if (gre_flags & (GRE_VERSION|GRE_ROUTING))
140 break;
141
142 proto = gre_proto;
143 nhoff += 4;
144 if (gre_flags & GRE_CSUM)
145 nhoff += 4;
146 if (gre_flags & GRE_KEY)
147 nhoff += 4;
148 if (gre_flags & GRE_SEQ)
149 nhoff += 4;
150
151 if (proto == ETH_P_8021Q) {
152 proto = load_half(skb,
153 nhoff + offsetof(struct vlan_hdr,
154 h_vlan_encapsulated_proto));
155 nhoff += sizeof(struct vlan_hdr);
156 }
157
158 if (proto == ETH_P_IP)
159 nhoff = parse_ip(skb, nhoff, &ip_proto, flow);
160 else if (proto == ETH_P_IPV6)
161 nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow);
162 else
163 return false;
164 break;
165 }
166 case IPPROTO_IPIP:
167 nhoff = parse_ip(skb, nhoff, &ip_proto, flow);
168 break;
169 case IPPROTO_IPV6:
170 nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow);
171 break;
172 default:
173 break;
174 }
175
176 flow->ip_proto = ip_proto;
177 poff = proto_ports_offset(ip_proto);
178 if (poff >= 0) {
179 nhoff += poff;
180 flow->ports = load_word(skb, nhoff);
181 }
182
183 flow->thoff = (__u16) nhoff;
184
185 return true;
186}
187
188struct pair {
189 long packets;
190 long bytes;
191};
192
193struct {
194 __uint(type, BPF_MAP_TYPE_HASH);
195 __type(key, __be32);
196 __type(value, struct pair);
197 __uint(max_entries, 1024);
198} hash_map SEC(".maps");
199
200SEC("socket2")
201int bpf_prog2(struct __sk_buff *skb)
202{
203 struct flow_key_record flow = {};
204 struct pair *value;
205 u32 key;
206
207 if (!flow_dissector(skb, &flow))
208 return 0;
209
210 key = flow.dst;
211 value = bpf_map_lookup_elem(&hash_map, &key);
212 if (value) {
213 __sync_fetch_and_add(&value->packets, 1);
214 __sync_fetch_and_add(&value->bytes, skb->len);
215 } else {
216 struct pair val = {1, skb->len};
217
218 bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY);
219 }
220 return 0;
221}
222
223char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c
new file mode 100644
index 000000000..af925a5af
--- /dev/null
+++ b/samples/bpf/sockex2_user.c
@@ -0,0 +1,57 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <stdio.h>
3#include <assert.h>
4#include <linux/bpf.h>
5#include <bpf/bpf.h>
6#include <bpf/libbpf.h>
7#include "sock_example.h"
8#include <unistd.h>
9#include <arpa/inet.h>
10#include <sys/resource.h>
11
12struct pair {
13 __u64 packets;
14 __u64 bytes;
15};
16
17int main(int ac, char **argv)
18{
19 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
20 struct bpf_object *obj;
21 int map_fd, prog_fd;
22 char filename[256];
23 int i, sock;
24 FILE *f;
25
26 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
27 setrlimit(RLIMIT_MEMLOCK, &r);
28
29 if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER,
30 &obj, &prog_fd))
31 return 1;
32
33 map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map");
34
35 sock = open_raw_sock("lo");
36
37 assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
38 sizeof(prog_fd)) == 0);
39
40 f = popen("ping -4 -c5 localhost", "r");
41 (void) f;
42
43 for (i = 0; i < 5; i++) {
44 int key = 0, next_key;
45 struct pair value;
46
47 while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) {
48 bpf_map_lookup_elem(map_fd, &next_key, &value);
49 printf("ip %s bytes %lld packets %lld\n",
50 inet_ntoa((struct in_addr){htonl(next_key)}),
51 value.bytes, value.packets);
52 key = next_key;
53 }
54 sleep(1);
55 }
56 return 0;
57}
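
The htonl() inside the inet_ntoa() call above mirrors the kernel side: load_word() converts the destination address to host order before it is stored as the map key, so printing it as an IP address means swapping it back to network order first:

	struct in_addr a = { .s_addr = htonl(next_key) };	/* key was stored host-order */
	printf("ip %s\n", inet_ntoa(a));
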
diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c
new file mode 100644
index 000000000..b36350335
--- /dev/null
+++ b/samples/bpf/sockex3_kern.c
@@ -0,0 +1,293 @@
1/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <uapi/linux/bpf.h>
8#include <uapi/linux/in.h>
9#include <uapi/linux/if.h>
10#include <uapi/linux/if_ether.h>
11#include <uapi/linux/ip.h>
12#include <uapi/linux/ipv6.h>
13#include <uapi/linux/if_tunnel.h>
14#include <uapi/linux/mpls.h>
15#include <bpf/bpf_helpers.h>
16#include "bpf_legacy.h"
17#define IP_MF 0x2000
18#define IP_OFFSET 0x1FFF
19
20#define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F
21
22struct {
23 __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
24 __uint(key_size, sizeof(u32));
25 __uint(value_size, sizeof(u32));
26 __uint(max_entries, 8);
27} jmp_table SEC(".maps");
28
29#define PARSE_VLAN 1
30#define PARSE_MPLS 2
31#define PARSE_IP 3
32#define PARSE_IPV6 4
33
34/* Protocol dispatch routine. It tail-calls the next BPF program depending
35 * on the eth proto. Note, we could have used ...
36 *
37 * bpf_tail_call(skb, &jmp_table, proto);
38 *
39 * ... but it would need a large prog_array and cannot be optimised given
40 * the map key is not static.
41 */
42static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto)
43{
44 switch (proto) {
45 case ETH_P_8021Q:
46 case ETH_P_8021AD:
47 bpf_tail_call(skb, &jmp_table, PARSE_VLAN);
48 break;
49 case ETH_P_MPLS_UC:
50 case ETH_P_MPLS_MC:
51 bpf_tail_call(skb, &jmp_table, PARSE_MPLS);
52 break;
53 case ETH_P_IP:
54 bpf_tail_call(skb, &jmp_table, PARSE_IP);
55 break;
56 case ETH_P_IPV6:
57 bpf_tail_call(skb, &jmp_table, PARSE_IPV6);
58 break;
59 }
60}
61
62struct vlan_hdr {
63 __be16 h_vlan_TCI;
64 __be16 h_vlan_encapsulated_proto;
65};
66
67struct flow_key_record {
68 __be32 src;
69 __be32 dst;
70 union {
71 __be32 ports;
72 __be16 port16[2];
73 };
74 __u32 ip_proto;
75};
76
77static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff)
78{
79 return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off))
80 & (IP_MF | IP_OFFSET);
81}
82
83static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
84{
85 __u64 w0 = load_word(ctx, off);
86 __u64 w1 = load_word(ctx, off + 4);
87 __u64 w2 = load_word(ctx, off + 8);
88 __u64 w3 = load_word(ctx, off + 12);
89
90 return (__u32)(w0 ^ w1 ^ w2 ^ w3);
91}
92
93struct globals {
94 struct flow_key_record flow;
95};
96
97struct {
98 __uint(type, BPF_MAP_TYPE_ARRAY);
99 __type(key, __u32);
100 __type(value, struct globals);
101 __uint(max_entries, 32);
102} percpu_map SEC(".maps");
103
104/* use poor man's per_cpu until native support is ready */
105static struct globals *this_cpu_globals(void)
106{
107 u32 key = bpf_get_smp_processor_id();
108
109 return bpf_map_lookup_elem(&percpu_map, &key);
110}
111
112/* some simple stats for user space consumption */
113struct pair {
114 __u64 packets;
115 __u64 bytes;
116};
117
118struct {
119 __uint(type, BPF_MAP_TYPE_HASH);
120 __type(key, struct flow_key_record);
121 __type(value, struct pair);
122 __uint(max_entries, 1024);
123} hash_map SEC(".maps");
124
125static void update_stats(struct __sk_buff *skb, struct globals *g)
126{
127 struct flow_key_record key = g->flow;
128 struct pair *value;
129
130 value = bpf_map_lookup_elem(&hash_map, &key);
131 if (value) {
132 __sync_fetch_and_add(&value->packets, 1);
133 __sync_fetch_and_add(&value->bytes, skb->len);
134 } else {
135 struct pair val = {1, skb->len};
136
137 bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY);
138 }
139}
140
141static __always_inline void parse_ip_proto(struct __sk_buff *skb,
142 struct globals *g, __u32 ip_proto)
143{
144 __u32 nhoff = skb->cb[0];
145 int poff;
146
147 switch (ip_proto) {
148 case IPPROTO_GRE: {
149 struct gre_hdr {
150 __be16 flags;
151 __be16 proto;
152 };
153
154 __u32 gre_flags = load_half(skb,
155 nhoff + offsetof(struct gre_hdr, flags));
156 __u32 gre_proto = load_half(skb,
157 nhoff + offsetof(struct gre_hdr, proto));
158
159 if (gre_flags & (GRE_VERSION|GRE_ROUTING))
160 break;
161
162 nhoff += 4;
163 if (gre_flags & GRE_CSUM)
164 nhoff += 4;
165 if (gre_flags & GRE_KEY)
166 nhoff += 4;
167 if (gre_flags & GRE_SEQ)
168 nhoff += 4;
169
170 skb->cb[0] = nhoff;
171 parse_eth_proto(skb, gre_proto);
172 break;
173 }
174 case IPPROTO_IPIP:
175 parse_eth_proto(skb, ETH_P_IP);
176 break;
177 case IPPROTO_IPV6:
178 parse_eth_proto(skb, ETH_P_IPV6);
179 break;
180 case IPPROTO_TCP:
181 case IPPROTO_UDP:
182		g->flow.ports = load_word(skb, nhoff);	/* fall through */
183 case IPPROTO_ICMP:
184 g->flow.ip_proto = ip_proto;
185 update_stats(skb, g);
186 break;
187 default:
188 break;
189 }
190}
191
192PROG(PARSE_IP)(struct __sk_buff *skb)
193{
194 struct globals *g = this_cpu_globals();
195 __u32 nhoff, verlen, ip_proto;
196
197 if (!g)
198 return 0;
199
200 nhoff = skb->cb[0];
201
202 if (unlikely(ip_is_fragment(skb, nhoff)))
203 return 0;
204
205 ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol));
206
207 if (ip_proto != IPPROTO_GRE) {
208 g->flow.src = load_word(skb, nhoff + offsetof(struct iphdr, saddr));
209 g->flow.dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr));
210 }
211
212 verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/);
213 nhoff += (verlen & 0xF) << 2;
214
215 skb->cb[0] = nhoff;
216 parse_ip_proto(skb, g, ip_proto);
217 return 0;
218}
219
220PROG(PARSE_IPV6)(struct __sk_buff *skb)
221{
222 struct globals *g = this_cpu_globals();
223 __u32 nhoff, ip_proto;
224
225 if (!g)
226 return 0;
227
228 nhoff = skb->cb[0];
229
230 ip_proto = load_byte(skb,
231 nhoff + offsetof(struct ipv6hdr, nexthdr));
232 g->flow.src = ipv6_addr_hash(skb,
233 nhoff + offsetof(struct ipv6hdr, saddr));
234 g->flow.dst = ipv6_addr_hash(skb,
235 nhoff + offsetof(struct ipv6hdr, daddr));
236 nhoff += sizeof(struct ipv6hdr);
237
238 skb->cb[0] = nhoff;
239 parse_ip_proto(skb, g, ip_proto);
240 return 0;
241}
242
243PROG(PARSE_VLAN)(struct __sk_buff *skb)
244{
245 __u32 nhoff, proto;
246
247 nhoff = skb->cb[0];
248
249 proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
250 h_vlan_encapsulated_proto));
251 nhoff += sizeof(struct vlan_hdr);
252 skb->cb[0] = nhoff;
253
254 parse_eth_proto(skb, proto);
255
256 return 0;
257}
258
259PROG(PARSE_MPLS)(struct __sk_buff *skb)
260{
261 __u32 nhoff, label;
262
263 nhoff = skb->cb[0];
264
265 label = load_word(skb, nhoff);
266 nhoff += sizeof(struct mpls_label);
267 skb->cb[0] = nhoff;
268
269 if (label & MPLS_LS_S_MASK) {
270 __u8 verlen = load_byte(skb, nhoff);
271 if ((verlen & 0xF0) == 4)
272 parse_eth_proto(skb, ETH_P_IP);
273 else
274 parse_eth_proto(skb, ETH_P_IPV6);
275 } else {
276 parse_eth_proto(skb, ETH_P_MPLS_UC);
277 }
278
279 return 0;
280}
281
282SEC("socket/0")
283int main_prog(struct __sk_buff *skb)
284{
285 __u32 nhoff = ETH_HLEN;
286 __u32 proto = load_half(skb, 12);
287
288 skb->cb[0] = nhoff;
289 parse_eth_proto(skb, proto);
290 return 0;
291}
292
293char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c
new file mode 100644
index 000000000..7793f6a6a
--- /dev/null
+++ b/samples/bpf/sockex3_user.c
@@ -0,0 +1,106 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <stdio.h>
3#include <assert.h>
4#include <bpf/bpf.h>
5#include <bpf/libbpf.h>
6#include "sock_example.h"
7#include <unistd.h>
8#include <arpa/inet.h>
9#include <sys/resource.h>
10
11struct flow_key_record {
12 __be32 src;
13 __be32 dst;
14 union {
15 __be32 ports;
16 __be16 port16[2];
17 };
18 __u32 ip_proto;
19};
20
21struct pair {
22 __u64 packets;
23 __u64 bytes;
24};
25
26int main(int argc, char **argv)
27{
28 int i, sock, key, fd, main_prog_fd, jmp_table_fd, hash_map_fd;
29 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
30 struct bpf_program *prog;
31 struct bpf_object *obj;
32 const char *section;
33 char filename[256];
34 FILE *f;
35
36 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
37 setrlimit(RLIMIT_MEMLOCK, &r);
38
39 obj = bpf_object__open_file(filename, NULL);
40 if (libbpf_get_error(obj)) {
41 fprintf(stderr, "ERROR: opening BPF object file failed\n");
42 return 0;
43 }
44
45 /* load BPF program */
46 if (bpf_object__load(obj)) {
47 fprintf(stderr, "ERROR: loading BPF object file failed\n");
48 goto cleanup;
49 }
50
51 jmp_table_fd = bpf_object__find_map_fd_by_name(obj, "jmp_table");
52 hash_map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map");
53 if (jmp_table_fd < 0 || hash_map_fd < 0) {
54 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
55 goto cleanup;
56 }
57
58 bpf_object__for_each_program(prog, obj) {
59 fd = bpf_program__fd(prog);
60
61 section = bpf_program__section_name(prog);
62 if (sscanf(section, "socket/%d", &key) != 1) {
63 fprintf(stderr, "ERROR: finding prog failed\n");
64 goto cleanup;
65 }
66
67 if (key == 0)
68 main_prog_fd = fd;
69 else
70 bpf_map_update_elem(jmp_table_fd, &key, &fd, BPF_ANY);
71 }
72
73 sock = open_raw_sock("lo");
74
75 /* attach BPF program to socket */
76 assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &main_prog_fd,
77 sizeof(__u32)) == 0);
78
79 if (argc > 1)
80 f = popen("ping -4 -c5 localhost", "r");
81 else
82 f = popen("netperf -l 4 localhost", "r");
83 (void) f;
84
85 for (i = 0; i < 5; i++) {
86 struct flow_key_record key = {}, next_key;
87 struct pair value;
88
89 sleep(1);
90 printf("IP src.port -> dst.port bytes packets\n");
91 while (bpf_map_get_next_key(hash_map_fd, &key, &next_key) == 0) {
92 bpf_map_lookup_elem(hash_map_fd, &next_key, &value);
93 printf("%s.%05d -> %s.%05d %12lld %12lld\n",
94 inet_ntoa((struct in_addr){htonl(next_key.src)}),
95 next_key.port16[0],
96 inet_ntoa((struct in_addr){htonl(next_key.dst)}),
97 next_key.port16[1],
98 value.bytes, value.packets);
99 key = next_key;
100 }
101 }
102
103cleanup:
104 bpf_object__close(obj);
105 return 0;
106}
diff --git a/samples/bpf/spintest_kern.c b/samples/bpf/spintest_kern.c
new file mode 100644
index 000000000..455da7731
--- /dev/null
+++ b/samples/bpf/spintest_kern.c
@@ -0,0 +1,69 @@
1/* Copyright (c) 2016, Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/skbuff.h>
8#include <linux/netdevice.h>
9#include <linux/version.h>
10#include <uapi/linux/bpf.h>
11#include <uapi/linux/perf_event.h>
12#include <bpf/bpf_helpers.h>
13#include <bpf/bpf_tracing.h>
14
15struct {
16 __uint(type, BPF_MAP_TYPE_HASH);
17 __type(key, long);
18 __type(value, long);
19 __uint(max_entries, 1024);
20} my_map SEC(".maps");
21struct {
22 __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
23 __uint(key_size, sizeof(long));
24 __uint(value_size, sizeof(long));
25 __uint(max_entries, 1024);
26} my_map2 SEC(".maps");
27
28struct {
29 __uint(type, BPF_MAP_TYPE_STACK_TRACE);
30 __uint(key_size, sizeof(u32));
31 __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
32 __uint(max_entries, 10000);
33} stackmap SEC(".maps");
34
35#define PROG(foo) \
36int foo(struct pt_regs *ctx) \
37{ \
38 long v = PT_REGS_IP(ctx), *val; \
39\
40 val = bpf_map_lookup_elem(&my_map, &v); \
41 bpf_map_update_elem(&my_map, &v, &v, BPF_ANY); \
42 bpf_map_update_elem(&my_map2, &v, &v, BPF_ANY); \
43 bpf_map_delete_elem(&my_map2, &v); \
44 bpf_get_stackid(ctx, &stackmap, BPF_F_REUSE_STACKID); \
45 return 0; \
46}
47
48/* add kprobes to all possible *spin* functions */
49SEC("kprobe/spin_unlock")PROG(p1)
50SEC("kprobe/spin_lock")PROG(p2)
51SEC("kprobe/mutex_spin_on_owner")PROG(p3)
52SEC("kprobe/rwsem_spin_on_owner")PROG(p4)
53SEC("kprobe/spin_unlock_irqrestore")PROG(p5)
54SEC("kprobe/_raw_spin_unlock_irqrestore")PROG(p6)
55SEC("kprobe/_raw_spin_unlock_bh")PROG(p7)
56SEC("kprobe/_raw_spin_unlock")PROG(p8)
57SEC("kprobe/_raw_spin_lock_irqsave")PROG(p9)
58SEC("kprobe/_raw_spin_trylock_bh")PROG(p10)
59SEC("kprobe/_raw_spin_lock_irq")PROG(p11)
60SEC("kprobe/_raw_spin_trylock")PROG(p12)
61SEC("kprobe/_raw_spin_lock")PROG(p13)
62SEC("kprobe/_raw_spin_lock_bh")PROG(p14)
63/* and to inner bpf helpers */
64SEC("kprobe/htab_map_update_elem")PROG(p15)
65SEC("kprobe/__htab_percpu_map_update_elem")PROG(p16)
66SEC("kprobe/htab_map_alloc")PROG(p17)
67
68char _license[] SEC("license") = "GPL";
69u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c
new file mode 100644
index 000000000..f090d0dc6
--- /dev/null
+++ b/samples/bpf/spintest_user.c
@@ -0,0 +1,99 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <stdio.h>
3#include <unistd.h>
4#include <string.h>
5#include <assert.h>
6#include <sys/resource.h>
7#include <bpf/libbpf.h>
8#include <bpf/bpf.h>
9#include "trace_helpers.h"
10
11int main(int ac, char **argv)
12{
13 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
14 char filename[256], symbol[256];
15 struct bpf_object *obj = NULL;
16 struct bpf_link *links[20];
17 long key, next_key, value;
18 struct bpf_program *prog;
19 int map_fd, i, j = 0;
20 const char *section;
21 struct ksym *sym;
22
23 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
24 perror("setrlimit(RLIMIT_MEMLOCK)");
25 return 1;
26 }
27
28 if (load_kallsyms()) {
29 printf("failed to process /proc/kallsyms\n");
30 return 2;
31 }
32
33 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
34 obj = bpf_object__open_file(filename, NULL);
35 if (libbpf_get_error(obj)) {
36 fprintf(stderr, "ERROR: opening BPF object file failed\n");
37 obj = NULL;
38 goto cleanup;
39 }
40
41 /* load BPF program */
42 if (bpf_object__load(obj)) {
43 fprintf(stderr, "ERROR: loading BPF object file failed\n");
44 goto cleanup;
45 }
46
47 map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
48 if (map_fd < 0) {
49 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
50 goto cleanup;
51 }
52
53 bpf_object__for_each_program(prog, obj) {
54 section = bpf_program__section_name(prog);
55 if (sscanf(section, "kprobe/%s", symbol) != 1)
56 continue;
57
58 /* Attach prog only when symbol exists */
59 if (ksym_get_addr(symbol)) {
60 links[j] = bpf_program__attach(prog);
61 if (libbpf_get_error(links[j])) {
62 fprintf(stderr, "bpf_program__attach failed\n");
63 links[j] = NULL;
64 goto cleanup;
65 }
66 j++;
67 }
68 }
69
70 for (i = 0; i < 5; i++) {
71 key = 0;
72 printf("kprobing funcs:");
73 while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) {
74 bpf_map_lookup_elem(map_fd, &next_key, &value);
75 assert(next_key == value);
76 sym = ksym_search(value);
77 key = next_key;
78 if (!sym) {
79 printf("ksym not found. Is kallsyms loaded?\n");
80 continue;
81 }
82
83 printf(" %s", sym->name);
84 }
85 if (key)
86 printf("\n");
87 key = 0;
88 while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0)
89 bpf_map_delete_elem(map_fd, &next_key);
90 sleep(1);
91 }
92
93cleanup:
94 for (j--; j >= 0; j--)
95 bpf_link__destroy(links[j]);
96
97 bpf_object__close(obj);
98 return 0;
99}
diff --git a/samples/bpf/syscall_nrs.c b/samples/bpf/syscall_nrs.c
new file mode 100644
index 000000000..88f940052
--- /dev/null
+++ b/samples/bpf/syscall_nrs.c
@@ -0,0 +1,19 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <uapi/linux/unistd.h>
3#include <linux/kbuild.h>
4
5#define SYSNR(_NR) DEFINE(SYS ## _NR, _NR)
6
7void syscall_defines(void)
8{
9 COMMENT("Linux system call numbers.");
10 SYSNR(__NR_write);
11 SYSNR(__NR_read);
12#ifdef __NR_mmap2
13 SYSNR(__NR_mmap2);
14#endif
15#ifdef __NR_mmap
16 SYSNR(__NR_mmap);
17#endif
18
19}
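
syscall_nrs.c is not a BPF program: it is compiled through the kbuild
asm-offsets machinery, where COMMENT() and DEFINE() emit assembler output
that the samples Makefile post-processes into a generated header. The result
is plain SYS__NR_xxx macros; roughly (a hypothetical excerpt, values shown
are the x86-64 ones and are architecture-dependent):

/* hypothetical excerpt of the generated header */
#define SYS__NR_read 0
#define SYS__NR_write 1
#define SYS__NR_mmap 9

A consumer can then dispatch on the syscall number without hard-coding
per-architecture values, e.g. (illustrative only):

	u32 nr = SYS__NR_write;

	bpf_tail_call(ctx, &progs, nr);	/* jump table keyed by syscall nr */
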
diff --git a/samples/bpf/syscall_tp_kern.c b/samples/bpf/syscall_tp_kern.c
new file mode 100644
index 000000000..50231c2ef
--- /dev/null
+++ b/samples/bpf/syscall_tp_kern.c
@@ -0,0 +1,73 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2017 Facebook
3 */
4#include <uapi/linux/bpf.h>
5#include <bpf/bpf_helpers.h>
6
7struct syscalls_enter_open_args {
8 unsigned long long unused;
9 long syscall_nr;
10 long filename_ptr;
11 long flags;
12 long mode;
13};
14
15struct syscalls_exit_open_args {
16 unsigned long long unused;
17 long syscall_nr;
18 long ret;
19};
20
21struct {
22 __uint(type, BPF_MAP_TYPE_ARRAY);
23 __type(key, u32);
24 __type(value, u32);
25 __uint(max_entries, 1);
26} enter_open_map SEC(".maps");
27
28struct {
29 __uint(type, BPF_MAP_TYPE_ARRAY);
30 __type(key, u32);
31 __type(value, u32);
32 __uint(max_entries, 1);
33} exit_open_map SEC(".maps");
34
35static __always_inline void count(void *map)
36{
37 u32 key = 0;
38 u32 *value, init_val = 1;
39
40 value = bpf_map_lookup_elem(map, &key);
41 if (value)
42 *value += 1;
43 else
44 bpf_map_update_elem(map, &key, &init_val, BPF_NOEXIST);
45}
46
47SEC("tracepoint/syscalls/sys_enter_open")
48int trace_enter_open(struct syscalls_enter_open_args *ctx)
49{
50 count(&enter_open_map);
51 return 0;
52}
53
54SEC("tracepoint/syscalls/sys_enter_openat")
55int trace_enter_open_at(struct syscalls_enter_open_args *ctx)
56{
57 count(&enter_open_map);
58 return 0;
59}
60
61SEC("tracepoint/syscalls/sys_exit_open")
62int trace_enter_exit(struct syscalls_exit_open_args *ctx)
63{
64 count(&exit_open_map);
65 return 0;
66}
67
68SEC("tracepoint/syscalls/sys_exit_openat")
69int trace_enter_exit_at(struct syscalls_exit_open_args *ctx)
70{
71 count(&exit_open_map);
72 return 0;
73}
diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c
new file mode 100644
index 000000000..76a1d0012
--- /dev/null
+++ b/samples/bpf/syscall_tp_user.c
@@ -0,0 +1,138 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2017 Facebook
3 */
4#include <stdio.h>
5#include <unistd.h>
6#include <fcntl.h>
7#include <stdlib.h>
8#include <string.h>
9#include <linux/perf_event.h>
10#include <errno.h>
11#include <sys/resource.h>
12#include <bpf/libbpf.h>
13#include <bpf/bpf.h>
14
15/* This program verifies bpf attachment to tracepoint sys_enter_* and sys_exit_*.
16 * This requires kernel CONFIG_FTRACE_SYSCALLS to be set.
17 */
18
19static void usage(const char *cmd)
20{
21 printf("USAGE: %s [-i num_progs] [-h]\n", cmd);
22 printf(" -i num_progs # number of progs of the test\n");
23 printf(" -h # help\n");
24}
25
26static void verify_map(int map_id)
27{
28 __u32 key = 0;
29 __u32 val;
30
31 if (bpf_map_lookup_elem(map_id, &key, &val) != 0) {
32 fprintf(stderr, "map_lookup failed: %s\n", strerror(errno));
33 return;
34 }
35 if (val == 0) {
36 fprintf(stderr, "failed: map #%d returns value 0\n", map_id);
37 return;
38 }
39 val = 0;
40 if (bpf_map_update_elem(map_id, &key, &val, BPF_ANY) != 0) {
41 fprintf(stderr, "map_update failed: %s\n", strerror(errno));
42 return;
43 }
44}
45
46static int test(char *filename, int num_progs)
47{
48 int map0_fds[num_progs], map1_fds[num_progs], fd, i, j = 0;
49 struct bpf_link *links[num_progs * 4];
50 struct bpf_object *objs[num_progs];
51 struct bpf_program *prog;
52
53 for (i = 0; i < num_progs; i++) {
54 objs[i] = bpf_object__open_file(filename, NULL);
55 if (libbpf_get_error(objs[i])) {
56 fprintf(stderr, "opening BPF object file failed\n");
57 objs[i] = NULL;
58 goto cleanup;
59 }
60
61 /* load BPF program */
62 if (bpf_object__load(objs[i])) {
63 fprintf(stderr, "loading BPF object file failed\n");
64 goto cleanup;
65 }
66
67 map0_fds[i] = bpf_object__find_map_fd_by_name(objs[i],
68 "enter_open_map");
69 map1_fds[i] = bpf_object__find_map_fd_by_name(objs[i],
70 "exit_open_map");
71 if (map0_fds[i] < 0 || map1_fds[i] < 0) {
72 fprintf(stderr, "finding a map in obj file failed\n");
73 goto cleanup;
74 }
75
76 bpf_object__for_each_program(prog, objs[i]) {
77 links[j] = bpf_program__attach(prog);
78 if (libbpf_get_error(links[j])) {
79 fprintf(stderr, "bpf_program__attach failed\n");
80 links[j] = NULL;
81 goto cleanup;
82 }
83 j++;
84 }
85		printf("prog #%d: map fds %d %d\n", i, map0_fds[i], map1_fds[i]);
86 }
87
88	/* The tracepoint attach above uses the perf_event_open defaults
89	 * (pid = -1, cpu = 0), which permit attached BPF execution on
90	 * all CPUs for all PIDs; BPF program execution ignores CPU
91	 * affinity.
92	 */
93 /* trigger some "open" operations */
94 fd = open(filename, O_RDONLY);
95 if (fd < 0) {
96 fprintf(stderr, "open failed: %s\n", strerror(errno));
97 return 1;
98 }
99 close(fd);
100
101 /* verify the map */
102 for (i = 0; i < num_progs; i++) {
103 verify_map(map0_fds[i]);
104 verify_map(map1_fds[i]);
105 }
106
107cleanup:
108 for (j--; j >= 0; j--)
109 bpf_link__destroy(links[j]);
110
111 for (i--; i >= 0; i--)
112 bpf_object__close(objs[i]);
113 return 0;
114}
115
116int main(int argc, char **argv)
117{
118 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
119 int opt, num_progs = 1;
120 char filename[256];
121
122 while ((opt = getopt(argc, argv, "i:h")) != -1) {
123 switch (opt) {
124 case 'i':
125 num_progs = atoi(optarg);
126 break;
127 case 'h':
128 default:
129 usage(argv[0]);
130 return 0;
131 }
132 }
133
134 setrlimit(RLIMIT_MEMLOCK, &r);
135 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
136
137 return test(filename, num_progs);
138}
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
new file mode 100644
index 000000000..c821294e1
--- /dev/null
+++ b/samples/bpf/task_fd_query_kern.c
@@ -0,0 +1,19 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <linux/version.h>
3#include <linux/ptrace.h>
4#include <uapi/linux/bpf.h>
5#include <bpf/bpf_helpers.h>
6
7SEC("kprobe/blk_mq_start_request")
8int bpf_prog1(struct pt_regs *ctx)
9{
10 return 0;
11}
12
13SEC("kretprobe/blk_account_io_done")
14int bpf_prog2(struct pt_regs *ctx)
15{
16 return 0;
17}
18char _license[] SEC("license") = "GPL";
19u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
new file mode 100644
index 000000000..b68bd2f8f
--- /dev/null
+++ b/samples/bpf/task_fd_query_user.c
@@ -0,0 +1,383 @@
1// SPDX-License-Identifier: GPL-2.0
2
3#include <stdio.h>
4#include <stdlib.h>
5#include <signal.h>
6#include <unistd.h>
7#include <stdbool.h>
8#include <string.h>
9#include <stdint.h>
10#include <fcntl.h>
11#include <linux/bpf.h>
12#include <sys/ioctl.h>
13#include <sys/resource.h>
14#include <sys/types.h>
15#include <sys/stat.h>
16#include <linux/perf_event.h>
17
18#include <bpf/libbpf.h>
19#include "bpf_load.h"
20#include "bpf_util.h"
21#include "perf-sys.h"
22#include "trace_helpers.h"
23
24#define CHECK_PERROR_RET(condition) ({ \
25 int __ret = !!(condition); \
26 if (__ret) { \
27 printf("FAIL: %s:\n", __func__); \
28 perror(" "); \
29 return -1; \
30 } \
31})
32
33#define CHECK_AND_RET(condition) ({ \
34 int __ret = !!(condition); \
35 if (__ret) \
36 return -1; \
37})
38
39static __u64 ptr_to_u64(void *ptr)
40{
41 return (__u64) (unsigned long) ptr;
42}
43
44#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
45static int bpf_find_probe_type(const char *event_type)
46{
47 char buf[256];
48 int fd, ret;
49
50 ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
51 CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
52
53 fd = open(buf, O_RDONLY);
54 CHECK_PERROR_RET(fd < 0);
55
56 ret = read(fd, buf, sizeof(buf));
57 close(fd);
58 CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
59
60 errno = 0;
61 ret = (int)strtol(buf, NULL, 10);
62 CHECK_PERROR_RET(errno);
63 return ret;
64}
65
66#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
67static int bpf_get_retprobe_bit(const char *event_type)
68{
69 char buf[256];
70 int fd, ret;
71
72 ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
73 CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
74
75 fd = open(buf, O_RDONLY);
76 CHECK_PERROR_RET(fd < 0);
77
78 ret = read(fd, buf, sizeof(buf));
79 close(fd);
80 CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
81 CHECK_PERROR_RET(strlen(buf) < strlen("config:"));
82
83 errno = 0;
84 ret = (int)strtol(buf + strlen("config:"), NULL, 10);
85 CHECK_PERROR_RET(errno);
86 return ret;
87}
88
89static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name,
90 __u32 expected_fd_type)
91{
92 __u64 probe_offset, probe_addr;
93 __u32 len, prog_id, fd_type;
94 char buf[256];
95 int err;
96
97 len = sizeof(buf);
98 err = bpf_task_fd_query(getpid(), event_fd[prog_fd_idx], 0, buf, &len,
99 &prog_id, &fd_type, &probe_offset,
100 &probe_addr);
101 if (err < 0) {
102 printf("FAIL: %s, for event_fd idx %d, fn_name %s\n",
103 __func__, prog_fd_idx, fn_name);
104 perror(" :");
105 return -1;
106 }
107 if (strcmp(buf, fn_name) != 0 ||
108 fd_type != expected_fd_type ||
109 probe_offset != 0x0 || probe_addr != 0x0) {
110 printf("FAIL: bpf_trace_event_query(event_fd[%d]):\n",
111 prog_fd_idx);
112 printf("buf: %s, fd_type: %u, probe_offset: 0x%llx,"
113 " probe_addr: 0x%llx\n",
114 buf, fd_type, probe_offset, probe_addr);
115 return -1;
116 }
117 return 0;
118}
119
120static int test_nondebug_fs_kuprobe_common(const char *event_type,
121 const char *name, __u64 offset, __u64 addr, bool is_return,
122 char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type,
123 __u64 *probe_offset, __u64 *probe_addr)
124{
125 int is_return_bit = bpf_get_retprobe_bit(event_type);
126 int type = bpf_find_probe_type(event_type);
127 struct perf_event_attr attr = {};
128 int fd;
129
130 if (type < 0 || is_return_bit < 0) {
131 printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n",
132 __func__, type, is_return_bit);
133 return -1;
134 }
135
136 attr.sample_period = 1;
137 attr.wakeup_events = 1;
138 if (is_return)
139 attr.config |= 1 << is_return_bit;
140
141 if (name) {
142 attr.config1 = ptr_to_u64((void *)name);
143 attr.config2 = offset;
144 } else {
145 attr.config1 = 0;
146 attr.config2 = addr;
147 }
148 attr.size = sizeof(attr);
149 attr.type = type;
150
151 fd = sys_perf_event_open(&attr, -1, 0, -1, 0);
152 CHECK_PERROR_RET(fd < 0);
153
154 CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0);
155 CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0);
156 CHECK_PERROR_RET(bpf_task_fd_query(getpid(), fd, 0, buf, buf_len,
157 prog_id, fd_type, probe_offset, probe_addr) < 0);
158
159 return 0;
160}
161
162static int test_nondebug_fs_probe(const char *event_type, const char *name,
163 __u64 offset, __u64 addr, bool is_return,
164 __u32 expected_fd_type,
165 __u32 expected_ret_fd_type,
166 char *buf, __u32 buf_len)
167{
168 __u64 probe_offset, probe_addr;
169 __u32 prog_id, fd_type;
170 int err;
171
172 err = test_nondebug_fs_kuprobe_common(event_type, name,
173 offset, addr, is_return,
174 buf, &buf_len, &prog_id,
175 &fd_type, &probe_offset,
176 &probe_addr);
177 if (err < 0) {
178 printf("FAIL: %s, "
179 "for name %s, offset 0x%llx, addr 0x%llx, is_return %d\n",
180 __func__, name ? name : "", offset, addr, is_return);
181 perror(" :");
182 return -1;
183 }
184 if ((is_return && fd_type != expected_ret_fd_type) ||
185 (!is_return && fd_type != expected_fd_type)) {
186 printf("FAIL: %s, incorrect fd_type %u\n",
187 __func__, fd_type);
188 return -1;
189 }
190 if (name) {
191 if (strcmp(name, buf) != 0) {
192 printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
193 return -1;
194 }
195 if (probe_offset != offset) {
196 printf("FAIL: %s, incorrect probe_offset 0x%llx\n",
197 __func__, probe_offset);
198 return -1;
199 }
200 } else {
201 if (buf_len != 0) {
202 printf("FAIL: %s, incorrect buf %p\n",
203 __func__, buf);
204 return -1;
205 }
206
207 if (probe_addr != addr) {
208 printf("FAIL: %s, incorrect probe_addr 0x%llx\n",
209 __func__, probe_addr);
210 return -1;
211 }
212 }
213 return 0;
214}
215
216static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return)
217{
218 const char *event_type = "uprobe";
219 struct perf_event_attr attr = {};
220 char buf[256], event_alias[sizeof("test_1234567890")];
221 __u64 probe_offset, probe_addr;
222 __u32 len, prog_id, fd_type;
223 int err, res, kfd, efd;
224 ssize_t bytes;
225
226 snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events",
227 event_type);
228 kfd = open(buf, O_WRONLY | O_APPEND, 0);
229 CHECK_PERROR_RET(kfd < 0);
230
231 res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid());
232 CHECK_PERROR_RET(res < 0 || res >= sizeof(event_alias));
233
234 res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx",
235 is_return ? 'r' : 'p', event_type, event_alias,
236 binary_path, offset);
237 CHECK_PERROR_RET(res < 0 || res >= sizeof(buf));
238 CHECK_PERROR_RET(write(kfd, buf, strlen(buf)) < 0);
239
240 close(kfd);
241 kfd = -1;
242
243 snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s/id",
244 event_type, event_alias);
245 efd = open(buf, O_RDONLY, 0);
246 CHECK_PERROR_RET(efd < 0);
247
248 bytes = read(efd, buf, sizeof(buf));
249 CHECK_PERROR_RET(bytes <= 0 || bytes >= sizeof(buf));
250 close(efd);
251 buf[bytes] = '\0';
252
253 attr.config = strtol(buf, NULL, 0);
254 attr.type = PERF_TYPE_TRACEPOINT;
255 attr.sample_period = 1;
256 attr.wakeup_events = 1;
257 kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
258 CHECK_PERROR_RET(kfd < 0);
259 CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0);
260 CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_ENABLE, 0) < 0);
261
262 len = sizeof(buf);
263 err = bpf_task_fd_query(getpid(), kfd, 0, buf, &len,
264 &prog_id, &fd_type, &probe_offset,
265 &probe_addr);
266 if (err < 0) {
267 printf("FAIL: %s, binary_path %s\n", __func__, binary_path);
268 perror(" :");
269 return -1;
270 }
271 if ((is_return && fd_type != BPF_FD_TYPE_URETPROBE) ||
272 (!is_return && fd_type != BPF_FD_TYPE_UPROBE)) {
273 printf("FAIL: %s, incorrect fd_type %u\n", __func__,
274 fd_type);
275 return -1;
276 }
277 if (strcmp(binary_path, buf) != 0) {
278 printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
279 return -1;
280 }
281 if (probe_offset != offset) {
282 printf("FAIL: %s, incorrect probe_offset 0x%llx\n", __func__,
283 probe_offset);
284 return -1;
285 }
286
287 close(kfd);
288 return 0;
289}
290
291int main(int argc, char **argv)
292{
293 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
294 extern char __executable_start;
295 char filename[256], buf[256];
296 __u64 uprobe_file_offset;
297
298 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
299 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
300 perror("setrlimit(RLIMIT_MEMLOCK)");
301 return 1;
302 }
303
304 if (load_kallsyms()) {
305 printf("failed to process /proc/kallsyms\n");
306 return 1;
307 }
308
309 if (load_bpf_file(filename)) {
310 printf("%s", bpf_log_buf);
311 return 1;
312 }
313
314 /* test two functions in the corresponding *_kern.c file */
315 CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_mq_start_request",
316 BPF_FD_TYPE_KPROBE));
317 CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_done",
318 BPF_FD_TYPE_KRETPROBE));
319
320 /* test nondebug fs kprobe */
321 CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
322 false, BPF_FD_TYPE_KPROBE,
323 BPF_FD_TYPE_KRETPROBE,
324 buf, sizeof(buf)));
325#ifdef __x86_64__
326 /* set a kprobe on "bpf_check + 0x5", which is x64 specific */
327 CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x5, 0x0,
328 false, BPF_FD_TYPE_KPROBE,
329 BPF_FD_TYPE_KRETPROBE,
330 buf, sizeof(buf)));
331#endif
332 CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
333 true, BPF_FD_TYPE_KPROBE,
334 BPF_FD_TYPE_KRETPROBE,
335 buf, sizeof(buf)));
336 CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
337 ksym_get_addr("bpf_check"), false,
338 BPF_FD_TYPE_KPROBE,
339 BPF_FD_TYPE_KRETPROBE,
340 buf, sizeof(buf)));
341 CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
342 ksym_get_addr("bpf_check"), false,
343 BPF_FD_TYPE_KPROBE,
344 BPF_FD_TYPE_KRETPROBE,
345 NULL, 0));
346 CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
347 ksym_get_addr("bpf_check"), true,
348 BPF_FD_TYPE_KPROBE,
349 BPF_FD_TYPE_KRETPROBE,
350 buf, sizeof(buf)));
351 CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
352 ksym_get_addr("bpf_check"), true,
353 BPF_FD_TYPE_KPROBE,
354 BPF_FD_TYPE_KRETPROBE,
355					     NULL, 0));
356
357 /* test nondebug fs uprobe */
358 /* the calculation of uprobe file offset is based on gcc 7.3.1 on x64
359 * and the default linker script, which defines __executable_start as
360 * the start of the .text section. The calculation could be different
361 * on different systems with different compilers. The right way is
362 * to parse the ELF file. We took a shortcut here.
363	 * to parse the ELF file; we took a shortcut here (see the sketch after this file).
364 uprobe_file_offset = (__u64)main - (__u64)&__executable_start;
365 CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0],
366 uprobe_file_offset, 0x0, false,
367 BPF_FD_TYPE_UPROBE,
368 BPF_FD_TYPE_URETPROBE,
369 buf, sizeof(buf)));
370 CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0],
371 uprobe_file_offset, 0x0, true,
372 BPF_FD_TYPE_UPROBE,
373 BPF_FD_TYPE_URETPROBE,
374 buf, sizeof(buf)));
375
376 /* test debug fs uprobe */
377 CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
378 false));
379 CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
380 true));
381
382 return 0;
383}
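
The comment in main() above notes that deriving the uprobe file offset from
__executable_start is compiler- and linker-dependent, and that parsing the
ELF file is the robust approach. A minimal sketch of that calculation,
assuming a non-PIE binary (helper name hypothetical): walk the program
headers and translate the symbol's virtual address through its PT_LOAD
segment.

#include <elf.h>
#include <stdio.h>

static long vaddr_to_file_offset(const char *path, unsigned long vaddr)
{
	Elf64_Ehdr eh;
	Elf64_Phdr ph;
	long off = -1;
	FILE *f = fopen(path, "rb");
	int i;

	if (!f || fread(&eh, sizeof(eh), 1, f) != 1)
		goto out;
	for (i = 0; i < eh.e_phnum; i++) {
		if (fseek(f, eh.e_phoff + (long)i * eh.e_phentsize, SEEK_SET) ||
		    fread(&ph, sizeof(ph), 1, f) != 1)
			break;
		/* the containing PT_LOAD segment maps vaddr to file offset */
		if (ph.p_type == PT_LOAD && vaddr >= ph.p_vaddr &&
		    vaddr < ph.p_vaddr + ph.p_filesz) {
			off = vaddr - ph.p_vaddr + ph.p_offset;
			break;
		}
	}
out:
	if (f)
		fclose(f);
	return off;
}

With this, uprobe_file_offset could be computed as
vaddr_to_file_offset(argv[0], (unsigned long)main) instead of relying on
__executable_start.
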
diff --git a/samples/bpf/tc_l2_redirect.sh b/samples/bpf/tc_l2_redirect.sh
new file mode 100755
index 000000000..37d95ef3c
--- /dev/null
+++ b/samples/bpf/tc_l2_redirect.sh
@@ -0,0 +1,174 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3
4[[ -z $TC ]] && TC='tc'
5[[ -z $IP ]] && IP='ip'
6
7REDIRECT_USER='./tc_l2_redirect'
8REDIRECT_BPF='./tc_l2_redirect_kern.o'
9
10RP_FILTER=$(< /proc/sys/net/ipv4/conf/all/rp_filter)
11IPV6_FORWARDING=$(< /proc/sys/net/ipv6/conf/all/forwarding)
12
13function config_common {
14 local tun_type=$1
15
16 $IP netns add ns1
17 $IP netns add ns2
18 $IP link add ve1 type veth peer name vens1
19 $IP link add ve2 type veth peer name vens2
20 $IP link set dev ve1 up
21 $IP link set dev ve2 up
22 $IP link set dev ve1 mtu 1500
23 $IP link set dev ve2 mtu 1500
24 $IP link set dev vens1 netns ns1
25 $IP link set dev vens2 netns ns2
26
27 $IP -n ns1 link set dev lo up
28 $IP -n ns1 link set dev vens1 up
29 $IP -n ns1 addr add 10.1.1.101/24 dev vens1
30 $IP -n ns1 addr add 2401:db01::65/64 dev vens1 nodad
31 $IP -n ns1 route add default via 10.1.1.1 dev vens1
32 $IP -n ns1 route add default via 2401:db01::1 dev vens1
33
34 $IP -n ns2 link set dev lo up
35 $IP -n ns2 link set dev vens2 up
36 $IP -n ns2 addr add 10.2.1.102/24 dev vens2
37 $IP -n ns2 addr add 2401:db02::66/64 dev vens2 nodad
38 $IP -n ns2 addr add 10.10.1.102 dev lo
39 $IP -n ns2 addr add 2401:face::66/64 dev lo nodad
40 $IP -n ns2 link add ipt2 type ipip local 10.2.1.102 remote 10.2.1.1
41 $IP -n ns2 link add ip6t2 type ip6tnl mode any local 2401:db02::66 remote 2401:db02::1
42 $IP -n ns2 link set dev ipt2 up
43 $IP -n ns2 link set dev ip6t2 up
44 $IP netns exec ns2 $TC qdisc add dev vens2 clsact
45 $IP netns exec ns2 $TC filter add dev vens2 ingress bpf da obj $REDIRECT_BPF sec drop_non_tun_vip
46 if [[ $tun_type == "ipip" ]]; then
47 $IP -n ns2 route add 10.1.1.0/24 dev ipt2
48 $IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0
49 $IP netns exec ns2 sysctl -q -w net.ipv4.conf.ipt2.rp_filter=0
50 else
51 $IP -n ns2 route add 10.1.1.0/24 dev ip6t2
52 $IP -n ns2 route add 2401:db01::/64 dev ip6t2
53 $IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0
54 $IP netns exec ns2 sysctl -q -w net.ipv4.conf.ip6t2.rp_filter=0
55 fi
56
57 $IP addr add 10.1.1.1/24 dev ve1
58 $IP addr add 2401:db01::1/64 dev ve1 nodad
59 $IP addr add 10.2.1.1/24 dev ve2
60 $IP addr add 2401:db02::1/64 dev ve2 nodad
61
62 $TC qdisc add dev ve2 clsact
63 $TC filter add dev ve2 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_forward
64
65 sysctl -q -w net.ipv4.conf.all.rp_filter=0
66 sysctl -q -w net.ipv6.conf.all.forwarding=1
67}
68
69function cleanup {
70 set +e
71 [[ -z $DEBUG ]] || set +x
72 $IP netns delete ns1 >& /dev/null
73 $IP netns delete ns2 >& /dev/null
74 $IP link del ve1 >& /dev/null
75 $IP link del ve2 >& /dev/null
76 $IP link del ipt >& /dev/null
77 $IP link del ip6t >& /dev/null
78 sysctl -q -w net.ipv4.conf.all.rp_filter=$RP_FILTER
79 sysctl -q -w net.ipv6.conf.all.forwarding=$IPV6_FORWARDING
80 rm -f /sys/fs/bpf/tc/globals/tun_iface
81 [[ -z $DEBUG ]] || set -x
82 set -e
83}
84
85function l2_to_ipip {
86 echo -n "l2_to_ipip $1: "
87
88 local dir=$1
89
90 config_common ipip
91
92 $IP link add ipt type ipip external
93 $IP link set dev ipt up
94 sysctl -q -w net.ipv4.conf.ipt.rp_filter=0
95 sysctl -q -w net.ipv4.conf.ipt.forwarding=1
96
97 if [[ $dir == "egress" ]]; then
98 $IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2
99 $TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect
100 sysctl -q -w net.ipv4.conf.ve1.forwarding=1
101 else
102 $TC qdisc add dev ve1 clsact
103 $TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect
104 fi
105
106 $REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ipt/ifindex)
107
108 $IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null
109
110 if [[ $dir == "egress" ]]; then
111 # test direct egress to ve2 (i.e. not forwarding from
112 # ve1 to ve2).
113 ping -c1 10.10.1.102 >& /dev/null
114 fi
115
116 cleanup
117
118 echo "OK"
119}
120
121function l2_to_ip6tnl {
122 echo -n "l2_to_ip6tnl $1: "
123
124 local dir=$1
125
126 config_common ip6tnl
127
128 $IP link add ip6t type ip6tnl mode any external
129 $IP link set dev ip6t up
130 sysctl -q -w net.ipv4.conf.ip6t.rp_filter=0
131 sysctl -q -w net.ipv4.conf.ip6t.forwarding=1
132
133 if [[ $dir == "egress" ]]; then
134 $IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2
135 $IP route add 2401:face::/64 via 2401:db02::66 dev ve2
136 $TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect
137 sysctl -q -w net.ipv4.conf.ve1.forwarding=1
138 else
139 $TC qdisc add dev ve1 clsact
140 $TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect
141 fi
142
143 $REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ip6t/ifindex)
144
145 $IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null
146 $IP netns exec ns1 ping -6 -c1 2401:face::66 >& /dev/null
147
148 if [[ $dir == "egress" ]]; then
149 # test direct egress to ve2 (i.e. not forwarding from
150 # ve1 to ve2).
151 ping -c1 10.10.1.102 >& /dev/null
152 ping -6 -c1 2401:face::66 >& /dev/null
153 fi
154
155 cleanup
156
157 echo "OK"
158}
159
160cleanup
161test_names="l2_to_ipip l2_to_ip6tnl"
162test_dirs="ingress egress"
163if [[ $# -ge 2 ]]; then
164 test_names=$1
165 test_dirs=$2
166elif [[ $# -ge 1 ]]; then
167 test_names=$1
168fi
169
170for t in $test_names; do
171 for d in $test_dirs; do
172 $t $d
173 done
174done
diff --git a/samples/bpf/tc_l2_redirect_kern.c b/samples/bpf/tc_l2_redirect_kern.c
new file mode 100644
index 000000000..fd2fa0004
--- /dev/null
+++ b/samples/bpf/tc_l2_redirect_kern.c
@@ -0,0 +1,237 @@
1/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#define KBUILD_MODNAME "foo"
8#include <uapi/linux/bpf.h>
9#include <uapi/linux/if_ether.h>
10#include <uapi/linux/if_packet.h>
11#include <uapi/linux/ip.h>
12#include <uapi/linux/ipv6.h>
13#include <uapi/linux/in.h>
14#include <uapi/linux/tcp.h>
15#include <uapi/linux/filter.h>
16#include <uapi/linux/pkt_cls.h>
17#include <net/ipv6.h>
18#include <bpf/bpf_helpers.h>
19
20#define _htonl __builtin_bswap32
21
22#define PIN_GLOBAL_NS 2
23struct bpf_elf_map {
24 __u32 type;
25 __u32 size_key;
26 __u32 size_value;
27 __u32 max_elem;
28 __u32 flags;
29 __u32 id;
30 __u32 pinning;
31};
32
33/* copy of 'struct ethhdr' without __packed */
34struct eth_hdr {
35 unsigned char h_dest[ETH_ALEN];
36 unsigned char h_source[ETH_ALEN];
37 unsigned short h_proto;
38};
39
40struct bpf_elf_map SEC("maps") tun_iface = {
41 .type = BPF_MAP_TYPE_ARRAY,
42 .size_key = sizeof(int),
43 .size_value = sizeof(int),
44 .pinning = PIN_GLOBAL_NS,
45 .max_elem = 1,
46};
47
48static __always_inline bool is_vip_addr(__be16 eth_proto, __be32 daddr)
49{
50 if (eth_proto == htons(ETH_P_IP))
51 return (_htonl(0xffffff00) & daddr) == _htonl(0x0a0a0100);
52 else if (eth_proto == htons(ETH_P_IPV6))
53 return (daddr == _htonl(0x2401face));
54
55 return false;
56}
57
58SEC("l2_to_iptun_ingress_forward")
59int _l2_to_iptun_ingress_forward(struct __sk_buff *skb)
60{
61 struct bpf_tunnel_key tkey = {};
62 void *data = (void *)(long)skb->data;
63 struct eth_hdr *eth = data;
64 void *data_end = (void *)(long)skb->data_end;
65 int key = 0, *ifindex;
66
67 int ret;
68
69 if (data + sizeof(*eth) > data_end)
70 return TC_ACT_OK;
71
72 ifindex = bpf_map_lookup_elem(&tun_iface, &key);
73 if (!ifindex)
74 return TC_ACT_OK;
75
76 if (eth->h_proto == htons(ETH_P_IP)) {
77 char fmt4[] = "ingress forward to ifindex:%d daddr4:%x\n";
78 struct iphdr *iph = data + sizeof(*eth);
79
80 if (data + sizeof(*eth) + sizeof(*iph) > data_end)
81 return TC_ACT_OK;
82
83 if (iph->protocol != IPPROTO_IPIP)
84 return TC_ACT_OK;
85
86 bpf_trace_printk(fmt4, sizeof(fmt4), *ifindex,
87 _htonl(iph->daddr));
88 return bpf_redirect(*ifindex, BPF_F_INGRESS);
89 } else if (eth->h_proto == htons(ETH_P_IPV6)) {
90 char fmt6[] = "ingress forward to ifindex:%d daddr6:%x::%x\n";
91 struct ipv6hdr *ip6h = data + sizeof(*eth);
92
93 if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
94 return TC_ACT_OK;
95
96 if (ip6h->nexthdr != IPPROTO_IPIP &&
97 ip6h->nexthdr != IPPROTO_IPV6)
98 return TC_ACT_OK;
99
100 bpf_trace_printk(fmt6, sizeof(fmt6), *ifindex,
101 _htonl(ip6h->daddr.s6_addr32[0]),
102 _htonl(ip6h->daddr.s6_addr32[3]));
103 return bpf_redirect(*ifindex, BPF_F_INGRESS);
104 }
105
106 return TC_ACT_OK;
107}
108
109SEC("l2_to_iptun_ingress_redirect")
110int _l2_to_iptun_ingress_redirect(struct __sk_buff *skb)
111{
112 struct bpf_tunnel_key tkey = {};
113 void *data = (void *)(long)skb->data;
114 struct eth_hdr *eth = data;
115 void *data_end = (void *)(long)skb->data_end;
116 int key = 0, *ifindex;
117
118 int ret;
119
120 if (data + sizeof(*eth) > data_end)
121 return TC_ACT_OK;
122
123 ifindex = bpf_map_lookup_elem(&tun_iface, &key);
124 if (!ifindex)
125 return TC_ACT_OK;
126
127 if (eth->h_proto == htons(ETH_P_IP)) {
128 char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n";
129 struct iphdr *iph = data + sizeof(*eth);
130 __be32 daddr = iph->daddr;
131
132 if (data + sizeof(*eth) + sizeof(*iph) > data_end)
133 return TC_ACT_OK;
134
135 if (!is_vip_addr(eth->h_proto, daddr))
136 return TC_ACT_OK;
137
138 bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(daddr), *ifindex);
139 } else {
140 return TC_ACT_OK;
141 }
142
143 tkey.tunnel_id = 10000;
144 tkey.tunnel_ttl = 64;
145 tkey.remote_ipv4 = 0x0a020166; /* 10.2.1.102 */
146 bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), 0);
147 return bpf_redirect(*ifindex, 0);
148}
149
150SEC("l2_to_ip6tun_ingress_redirect")
151int _l2_to_ip6tun_ingress_redirect(struct __sk_buff *skb)
152{
153 struct bpf_tunnel_key tkey = {};
154 void *data = (void *)(long)skb->data;
155 struct eth_hdr *eth = data;
156 void *data_end = (void *)(long)skb->data_end;
157 int key = 0, *ifindex;
158
159 if (data + sizeof(*eth) > data_end)
160 return TC_ACT_OK;
161
162 ifindex = bpf_map_lookup_elem(&tun_iface, &key);
163 if (!ifindex)
164 return TC_ACT_OK;
165
166 if (eth->h_proto == htons(ETH_P_IP)) {
167 char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n";
168 struct iphdr *iph = data + sizeof(*eth);
169
170 if (data + sizeof(*eth) + sizeof(*iph) > data_end)
171 return TC_ACT_OK;
172
173 if (!is_vip_addr(eth->h_proto, iph->daddr))
174 return TC_ACT_OK;
175
176 bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(iph->daddr),
177 *ifindex);
178 } else if (eth->h_proto == htons(ETH_P_IPV6)) {
179 char fmt6[] = "e/ingress redirect daddr6:%x to ifindex:%d\n";
180 struct ipv6hdr *ip6h = data + sizeof(*eth);
181
182 if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
183 return TC_ACT_OK;
184
185 if (!is_vip_addr(eth->h_proto, ip6h->daddr.s6_addr32[0]))
186 return TC_ACT_OK;
187
188 bpf_trace_printk(fmt6, sizeof(fmt6),
189 _htonl(ip6h->daddr.s6_addr32[0]), *ifindex);
190 } else {
191 return TC_ACT_OK;
192 }
193
194 tkey.tunnel_id = 10000;
195 tkey.tunnel_ttl = 64;
196 /* 2401:db02:0:0:0:0:0:66 */
197 tkey.remote_ipv6[0] = _htonl(0x2401db02);
198 tkey.remote_ipv6[1] = 0;
199 tkey.remote_ipv6[2] = 0;
200 tkey.remote_ipv6[3] = _htonl(0x00000066);
201 bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), BPF_F_TUNINFO_IPV6);
202 return bpf_redirect(*ifindex, 0);
203}
204
205SEC("drop_non_tun_vip")
206int _drop_non_tun_vip(struct __sk_buff *skb)
207{
208 struct bpf_tunnel_key tkey = {};
209 void *data = (void *)(long)skb->data;
210 struct eth_hdr *eth = data;
211 void *data_end = (void *)(long)skb->data_end;
212
213 if (data + sizeof(*eth) > data_end)
214 return TC_ACT_OK;
215
216 if (eth->h_proto == htons(ETH_P_IP)) {
217 struct iphdr *iph = data + sizeof(*eth);
218
219 if (data + sizeof(*eth) + sizeof(*iph) > data_end)
220 return TC_ACT_OK;
221
222 if (is_vip_addr(eth->h_proto, iph->daddr))
223 return TC_ACT_SHOT;
224 } else if (eth->h_proto == htons(ETH_P_IPV6)) {
225 struct ipv6hdr *ip6h = data + sizeof(*eth);
226
227 if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
228 return TC_ACT_OK;
229
230 if (is_vip_addr(eth->h_proto, ip6h->daddr.s6_addr32[0]))
231 return TC_ACT_SHOT;
232 }
233
234 return TC_ACT_OK;
235}
236
237char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tc_l2_redirect_user.c b/samples/bpf/tc_l2_redirect_user.c
new file mode 100644
index 000000000..d11a6e1e9
--- /dev/null
+++ b/samples/bpf/tc_l2_redirect_user.c
@@ -0,0 +1,70 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2016 Facebook
3 */
4#include <linux/unistd.h>
5#include <linux/bpf.h>
6
7#include <stdlib.h>
8#include <stdio.h>
9#include <unistd.h>
10#include <string.h>
11#include <errno.h>
12
13#include <bpf/bpf.h>
14
15static void usage(void)
16{
17	printf("Usage: tc_l2_redirect [...]\n");
18 printf(" -U <file> Update an already pinned BPF array\n");
19 printf(" -i <ifindex> Interface index\n");
20 printf(" -h Display this help\n");
21}
22
23int main(int argc, char **argv)
24{
25 const char *pinned_file = NULL;
26 int ifindex = -1;
27 int array_key = 0;
28 int array_fd = -1;
29 int ret = -1;
30 int opt;
31
32 while ((opt = getopt(argc, argv, "F:U:i:")) != -1) {
33 switch (opt) {
34 /* General args */
35 case 'U':
36 pinned_file = optarg;
37 break;
38 case 'i':
39 ifindex = atoi(optarg);
40 break;
41 default:
42 usage();
43 goto out;
44 }
45 }
46
47 if (ifindex < 0 || !pinned_file) {
48 usage();
49 goto out;
50 }
51
52 array_fd = bpf_obj_get(pinned_file);
53 if (array_fd < 0) {
54 fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n",
55 pinned_file, strerror(errno), errno);
56 goto out;
57 }
58
59	/* store the tunnel device ifindex into the pinned array map */
60 ret = bpf_map_update_elem(array_fd, &array_key, &ifindex, 0);
61 if (ret) {
62 perror("bpf_map_update_elem");
63 goto out;
64 }
65
66out:
67 if (array_fd != -1)
68 close(array_fd);
69 return ret;
70}
diff --git a/samples/bpf/tcbpf1_kern.c b/samples/bpf/tcbpf1_kern.c
new file mode 100644
index 000000000..e9356130f
--- /dev/null
+++ b/samples/bpf/tcbpf1_kern.c
@@ -0,0 +1,91 @@
1#define KBUILD_MODNAME "foo"
2#include <uapi/linux/bpf.h>
3#include <uapi/linux/if_ether.h>
4#include <uapi/linux/if_packet.h>
5#include <uapi/linux/ip.h>
6#include <uapi/linux/in.h>
7#include <uapi/linux/tcp.h>
8#include <uapi/linux/filter.h>
9#include <uapi/linux/pkt_cls.h>
10#include <bpf/bpf_helpers.h>
11#include "bpf_legacy.h"
12
13/* compiler workaround */
14#define _htonl __builtin_bswap32
15
16static inline void set_dst_mac(struct __sk_buff *skb, char *mac)
17{
18 bpf_skb_store_bytes(skb, 0, mac, ETH_ALEN, 1);
19}
20
21#define IP_CSUM_OFF (ETH_HLEN + offsetof(struct iphdr, check))
22#define TOS_OFF (ETH_HLEN + offsetof(struct iphdr, tos))
23
24static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos)
25{
26 __u8 old_tos = load_byte(skb, TOS_OFF);
27
28 bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2);
29 bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0);
30}
31
32#define TCP_CSUM_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, check))
33#define IP_SRC_OFF (ETH_HLEN + offsetof(struct iphdr, saddr))
34
35#define IS_PSEUDO 0x10
36
37static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip)
38{
39 __u32 old_ip = _htonl(load_word(skb, IP_SRC_OFF));
40
41 bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, IS_PSEUDO | sizeof(new_ip));
42 bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip));
43 bpf_skb_store_bytes(skb, IP_SRC_OFF, &new_ip, sizeof(new_ip), 0);
44}
45
46#define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest))
47static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port)
48{
49 __u16 old_port = htons(load_half(skb, TCP_DPORT_OFF));
50
51 bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port));
52 bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0);
53}
54
55SEC("classifier")
56int bpf_prog1(struct __sk_buff *skb)
57{
58 __u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
59 long *value;
60
61 if (proto == IPPROTO_TCP) {
62 set_ip_tos(skb, 8);
63 set_tcp_ip_src(skb, 0xA010101);
64 set_tcp_dest_port(skb, 5001);
65 }
66
67 return 0;
68}
69SEC("redirect_xmit")
70int _redirect_xmit(struct __sk_buff *skb)
71{
72 return bpf_redirect(skb->ifindex + 1, 0);
73}
74SEC("redirect_recv")
75int _redirect_recv(struct __sk_buff *skb)
76{
77 return bpf_redirect(skb->ifindex + 1, 1);
78}
79SEC("clone_redirect_xmit")
80int _clone_redirect_xmit(struct __sk_buff *skb)
81{
82 bpf_clone_redirect(skb, skb->ifindex + 1, 0);
83 return TC_ACT_SHOT;
84}
85SEC("clone_redirect_recv")
86int _clone_redirect_recv(struct __sk_buff *skb)
87{
88 bpf_clone_redirect(skb, skb->ifindex + 1, 1);
89 return TC_ACT_SHOT;
90}
91char _license[] SEC("license") = "GPL";
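
The rewrite helpers above all follow one pattern: fix the L4 checksum first
(passing IS_PSEUDO when the field is also part of the TCP pseudo-header, as
the addresses are), fix the IP header checksum if the field lives in the IP
header, then store the new bytes. A sketch of the matching destination-
address helper, mirroring set_tcp_ip_src (hypothetical, not part of this
sample):

#define IP_DST_OFF (ETH_HLEN + offsetof(struct iphdr, daddr))

static inline void set_tcp_ip_dst(struct __sk_buff *skb, __u32 new_ip)
{
	__u32 old_ip = _htonl(load_word(skb, IP_DST_OFF));

	/* daddr sits in both the pseudo-header and the IP header */
	bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip,
			    IS_PSEUDO | sizeof(new_ip));
	bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip));
	bpf_skb_store_bytes(skb, IP_DST_OFF, &new_ip, sizeof(new_ip), 0);
}
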
diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c
new file mode 100644
index 000000000..8dfe09a92
--- /dev/null
+++ b/samples/bpf/tcp_basertt_kern.c
@@ -0,0 +1,71 @@
1/* Copyright (c) 2017 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * BPF program to set base_rtt to 80us when host is running TCP-NV and
8 * both hosts are in the same datacenter (as determined by IPv6 prefix).
9 *
10 * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
11 */
12
13#include <uapi/linux/bpf.h>
14#include <uapi/linux/tcp.h>
15#include <uapi/linux/if_ether.h>
16#include <uapi/linux/if_packet.h>
17#include <uapi/linux/ip.h>
18#include <linux/socket.h>
19#include <bpf/bpf_helpers.h>
20#include <bpf/bpf_endian.h>
21
22#define DEBUG 1
23
24SEC("sockops")
25int bpf_basertt(struct bpf_sock_ops *skops)
26{
27 char cong[20];
28 char nv[] = "nv";
29 int rv = 0, n;
30 int op;
31
32 op = (int) skops->op;
33
34#ifdef DEBUG
35 bpf_printk("BPF command: %d\n", op);
36#endif
37
38 /* Check if both hosts are in the same datacenter. For this
39 * example they are if the 1st 5.5 bytes in the IPv6 address
40 * are the same.
41 */
42 if (skops->family == AF_INET6 &&
43 skops->local_ip6[0] == skops->remote_ip6[0] &&
44 (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
45 (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) {
46 switch (op) {
47 case BPF_SOCK_OPS_BASE_RTT:
48 n = bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION,
49 cong, sizeof(cong));
50			if (!n && !__builtin_memcmp(cong, nv, sizeof(nv))) {
51 /* Set base_rtt to 80us */
52 rv = 80;
53 } else if (n) {
54 rv = n;
55 } else {
56 rv = -1;
57 }
58 break;
59 default:
60 rv = -1;
61 }
62 } else {
63 rv = -1;
64 }
65#ifdef DEBUG
66 bpf_printk("Returning %d\n", rv);
67#endif
68 skops->reply = rv;
69 return 1;
70}
71char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme
new file mode 100644
index 000000000..78e247f62
--- /dev/null
+++ b/samples/bpf/tcp_bpf.readme
@@ -0,0 +1,28 @@
1This file describes how to run the tcp_*_kern.o tcp_bpf (i.e. sock_ops)
2programs. These programs attach to a cgroupv2. The following commands create
3a cgroupv2 and attach a bash shell to the group.
4
5 mkdir -p /tmp/cgroupv2
6 mount -t cgroup2 none /tmp/cgroupv2
7 mkdir -p /tmp/cgroupv2/foo
8 bash
9 echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
10
11Anything that runs under this shell belongs to the foo cgroupv2. To load
12(attach) one of the tcp_*_kern.o programs:
13
14 bpftool prog load tcp_basertt_kern.o /sys/fs/bpf/tcp_prog
15 bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog
16 bpftool prog tracelog
17
18"bpftool prog tracelog" will continue to run printing the BPF log buffer.
19The tcp_*_kern.o programs use special print functions to print logging
20information (if enabled by the ifdef).
21
22If using netperf/netserver to create traffic, you need to run them under the
23cgroupv2 to which the BPF programs are attached (i.e. under bash shell
24attached to the cgroupv2).
25
26To remove (detach) a sock_ops BPF program from a cgroupv2:
27
28 bpftool cgroup detach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog
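
The tcp_*_kern.c samples that follow this readme share one skeleton: a
SEC("sockops") program switches on skops->op, optionally calls
bpf_getsockopt()/bpf_setsockopt(), stores its result in skops->reply and
returns 1. A minimal sketch of that common shape (values illustrative):

#include <uapi/linux/bpf.h>
#include <linux/socket.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int bpf_sockops_skeleton(struct bpf_sock_ops *skops)
{
	int bufsize = 150000;	/* illustrative value */
	int rv = -1;

	switch ((int)skops->op) {
	case BPF_SOCK_OPS_TCP_CONNECT_CB:
		/* tune the socket as an active connection starts */
		rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF,
				    &bufsize, sizeof(bufsize));
		break;
	default:
		break;
	}
	skops->reply = rv;	/* returned to the kernel for ops that ask */
	return 1;
}
char _license[] SEC("license") = "GPL";
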
diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c
new file mode 100644
index 000000000..6a80d0895
--- /dev/null
+++ b/samples/bpf/tcp_bufs_kern.c
@@ -0,0 +1,81 @@
1/* Copyright (c) 2017 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * BPF program to set initial receive window to 40 packets and send
8 * and receive buffers to 1.5MB. This would usually be done after
9 * doing appropriate checks that indicate the hosts are far enough
10 * away (i.e. large RTT).
11 *
12 * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
13 */
14
15#include <uapi/linux/bpf.h>
16#include <uapi/linux/if_ether.h>
17#include <uapi/linux/if_packet.h>
18#include <uapi/linux/ip.h>
19#include <linux/socket.h>
20#include <bpf/bpf_helpers.h>
21#include <bpf/bpf_endian.h>
22
23#define DEBUG 1
24
25SEC("sockops")
26int bpf_bufs(struct bpf_sock_ops *skops)
27{
28 int bufsize = 1500000;
29 int rwnd_init = 40;
30 int rv = 0;
31 int op;
32
33	/* For testing purposes, only execute the rest of the BPF program
34	 * if neither port number is 55601
35 */
36 if (bpf_ntohl(skops->remote_port) != 55601 &&
37 skops->local_port != 55601) {
38 skops->reply = -1;
39 return 1;
40 }
41
42 op = (int) skops->op;
43
44#ifdef DEBUG
45	bpf_printk("BPF command: %d\n", op);
46#endif
47
48	/* Usually there would be a check to ensure the hosts are far
49	 * enough apart that it makes sense to increase the buffer sizes
50 */
51 switch (op) {
52 case BPF_SOCK_OPS_RWND_INIT:
53 rv = rwnd_init;
54 break;
55 case BPF_SOCK_OPS_TCP_CONNECT_CB:
56 /* Set sndbuf and rcvbuf of active connections */
57 rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
58 sizeof(bufsize));
59 rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
60 &bufsize, sizeof(bufsize));
61 break;
62 case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
63 /* Nothing to do */
64 break;
65 case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
66 /* Set sndbuf and rcvbuf of passive connections */
67 rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
68 sizeof(bufsize));
69 rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
70 &bufsize, sizeof(bufsize));
71 break;
72 default:
73 rv = -1;
74 }
75#ifdef DEBUG
76 bpf_printk("Returning %d\n", rv);
77#endif
78 skops->reply = rv;
79 return 1;
80}
81char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c
new file mode 100644
index 000000000..e88bd9ab0
--- /dev/null
+++ b/samples/bpf/tcp_clamp_kern.c
@@ -0,0 +1,97 @@
1/* Copyright (c) 2017 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * Sample BPF program to set send and receive buffers to 150KB, sndcwnd clamp
8 * to 100 packets and SYN and SYN_ACK RTOs to 10ms when both hosts are within
9 * datacenter. For this example, we assume they are within the same
10 * datacenter when the first 5.5 bytes of their IPv6 addresses are the same.
11 *
12 * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
13 */
14
15#include <uapi/linux/bpf.h>
16#include <uapi/linux/if_ether.h>
17#include <uapi/linux/if_packet.h>
18#include <uapi/linux/ip.h>
19#include <linux/socket.h>
20#include <bpf/bpf_helpers.h>
21#include <bpf/bpf_endian.h>
22
23#define DEBUG 1
24
25SEC("sockops")
26int bpf_clamp(struct bpf_sock_ops *skops)
27{
28 int bufsize = 150000;
29 int to_init = 10;
30 int clamp = 100;
31 int rv = 0;
32 int op;
33
34 /* For testing purposes, only execute rest of BPF program
35	 * if neither port number is 55601
36 */
37 if (bpf_ntohl(skops->remote_port) != 55601 && skops->local_port != 55601) {
38 skops->reply = -1;
39 return 0;
40 }
41
42 op = (int) skops->op;
43
44#ifdef DEBUG
45 bpf_printk("BPF command: %d\n", op);
46#endif
47
48 /* Check that both hosts are within same datacenter. For this example
49 * it is the case when the first 5.5 bytes of their IPv6 addresses are
50 * the same.
51 */
52 if (skops->family == AF_INET6 &&
53 skops->local_ip6[0] == skops->remote_ip6[0] &&
54 (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
55 (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) {
56 switch (op) {
57 case BPF_SOCK_OPS_TIMEOUT_INIT:
58 rv = to_init;
59 break;
60 case BPF_SOCK_OPS_TCP_CONNECT_CB:
61 /* Set sndbuf and rcvbuf of active connections */
62 rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF,
63 &bufsize, sizeof(bufsize));
64 rv += bpf_setsockopt(skops, SOL_SOCKET,
65 SO_RCVBUF, &bufsize,
66 sizeof(bufsize));
67 break;
68 case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
69 rv = bpf_setsockopt(skops, SOL_TCP,
70 TCP_BPF_SNDCWND_CLAMP,
71 &clamp, sizeof(clamp));
72 break;
73 case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
74 /* Set sndbuf and rcvbuf of passive connections */
75 rv = bpf_setsockopt(skops, SOL_TCP,
76 TCP_BPF_SNDCWND_CLAMP,
77 &clamp, sizeof(clamp));
78 rv += bpf_setsockopt(skops, SOL_SOCKET,
79 SO_SNDBUF, &bufsize,
80 sizeof(bufsize));
81 rv += bpf_setsockopt(skops, SOL_SOCKET,
82 SO_RCVBUF, &bufsize,
83 sizeof(bufsize));
84 break;
85 default:
86 rv = -1;
87 }
88 } else {
89 rv = -1;
90 }
91#ifdef DEBUG
92 bpf_printk("Returning %d\n", rv);
93#endif
94 skops->reply = rv;
95 return 1;
96}
97char _license[] SEC("license") = "GPL";
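
The "first 5.5 bytes" test above compares the whole first 32-bit word of each IPv6 address plus the top 12 bits of the second word: 4 + 1.5 bytes, i.e. a 44-bit prefix. A hedged user-space illustration of the same arithmetic, with made-up addresses:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>

    static int same_dc(const struct in6_addr *a, const struct in6_addr *b)
    {
    	uint32_t a0, b0, a1, b1;

    	memcpy(&a0, &a->s6_addr[0], 4);
    	memcpy(&b0, &b->s6_addr[0], 4);
    	memcpy(&a1, &a->s6_addr[4], 4);
    	memcpy(&b1, &b->s6_addr[4], 4);
    	/* bytes 0-3 must match exactly; of bytes 4-7 only the top
    	 * 12 bits are compared: 4 + 1.5 = 5.5 bytes, a 44-bit prefix
    	 */
    	return a0 == b0 &&
    	       (ntohl(a1) & 0xfff00000) == (ntohl(b1) & 0xfff00000);
    }

    int main(void)
    {
    	struct in6_addr x, y;

    	inet_pton(AF_INET6, "2401:db00:1230::1", &x);
    	inet_pton(AF_INET6, "2401:db00:123f::2", &y);	/* same 44 bits */
    	printf("same datacenter: %d\n", same_dc(&x, &y));	/* prints 1 */
    	return 0;
    }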
diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c
new file mode 100644
index 000000000..2311fc9dd
--- /dev/null
+++ b/samples/bpf/tcp_cong_kern.c
@@ -0,0 +1,78 @@
1/* Copyright (c) 2017 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * BPF program to set congestion control to dctcp when both hosts are
8 * in the same datacenter (as determined by IPv6 prefix).
9 *
10 * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
11 */
12
13#include <uapi/linux/bpf.h>
14#include <uapi/linux/tcp.h>
15#include <uapi/linux/if_ether.h>
16#include <uapi/linux/if_packet.h>
17#include <uapi/linux/ip.h>
18#include <linux/socket.h>
19#include <bpf/bpf_helpers.h>
20#include <bpf/bpf_endian.h>
21
22#define DEBUG 1
23
24SEC("sockops")
25int bpf_cong(struct bpf_sock_ops *skops)
26{
27 char cong[] = "dctcp";
28 int rv = 0;
29 int op;
30
31 /* For testing purposes, only execute rest of BPF program
32	 * if neither port number is 55601
33 */
34 if (bpf_ntohl(skops->remote_port) != 55601 &&
35 skops->local_port != 55601) {
36 skops->reply = -1;
37 return 1;
38 }
39
40 op = (int) skops->op;
41
42#ifdef DEBUG
43 bpf_printk("BPF command: %d\n", op);
44#endif
45
46 /* Check if both hosts are in the same datacenter. For this
47 * example they are if the 1st 5.5 bytes in the IPv6 address
48 * are the same.
49 */
50 if (skops->family == AF_INET6 &&
51 skops->local_ip6[0] == skops->remote_ip6[0] &&
52 (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
53 (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) {
54 switch (op) {
55 case BPF_SOCK_OPS_NEEDS_ECN:
56 rv = 1;
57 break;
58 case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
59 rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
60 cong, sizeof(cong));
61 break;
62 case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
63 rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
64 cong, sizeof(cong));
65 break;
66 default:
67 rv = -1;
68 }
69 } else {
70 rv = -1;
71 }
72#ifdef DEBUG
73 bpf_printk("Returning %d\n", rv);
74#endif
75 skops->reply = rv;
76 return 1;
77}
78char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_dumpstats_kern.c b/samples/bpf/tcp_dumpstats_kern.c
new file mode 100644
index 000000000..e80d3afd2
--- /dev/null
+++ b/samples/bpf/tcp_dumpstats_kern.c
@@ -0,0 +1,68 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Refer to samples/bpf/tcp_bpf.readme for the instructions on
3 * how to run this sample program.
4 */
5#include <linux/bpf.h>
6
7#include <bpf/bpf_helpers.h>
8#include <bpf/bpf_endian.h>
9
10#define INTERVAL 1000000000ULL
11
12int _version SEC("version") = 1;
13char _license[] SEC("license") = "GPL";
14
15struct {
16 __u32 type;
17 __u32 map_flags;
18 int *key;
19 __u64 *value;
20} bpf_next_dump SEC(".maps") = {
21 .type = BPF_MAP_TYPE_SK_STORAGE,
22 .map_flags = BPF_F_NO_PREALLOC,
23};
24
25SEC("sockops")
26int _sockops(struct bpf_sock_ops *ctx)
27{
28 struct bpf_tcp_sock *tcp_sk;
29 struct bpf_sock *sk;
30 __u64 *next_dump;
31 __u64 now;
32
33 switch (ctx->op) {
34 case BPF_SOCK_OPS_TCP_CONNECT_CB:
35 bpf_sock_ops_cb_flags_set(ctx, BPF_SOCK_OPS_RTT_CB_FLAG);
36 return 1;
37 case BPF_SOCK_OPS_RTT_CB:
38 break;
39 default:
40 return 1;
41 }
42
43 sk = ctx->sk;
44 if (!sk)
45 return 1;
46
47 next_dump = bpf_sk_storage_get(&bpf_next_dump, sk, 0,
48 BPF_SK_STORAGE_GET_F_CREATE);
49 if (!next_dump)
50 return 1;
51
52 now = bpf_ktime_get_ns();
53 if (now < *next_dump)
54 return 1;
55
56 tcp_sk = bpf_tcp_sock(sk);
57 if (!tcp_sk)
58 return 1;
59
60 *next_dump = now + INTERVAL;
61
62 bpf_printk("dsack_dups=%u delivered=%u\n",
63 tcp_sk->dsack_dups, tcp_sk->delivered);
64 bpf_printk("delivered_ce=%u icsk_retransmits=%u\n",
65 tcp_sk->delivered_ce, tcp_sk->icsk_retransmits);
66
67 return 1;
68}
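
The bpf_printk() calls in this and the DEBUG-enabled samples above write to the kernel tracing ring buffer. A trivial reader, assuming the usual debugfs mount point:

    #include <stdio.h>

    int main(void)
    {
    	/* path assumes a standard debugfs mount */
    	FILE *fp = fopen("/sys/kernel/debug/tracing/trace_pipe", "r");
    	int c;

    	if (!fp) {
    		perror("trace_pipe");
    		return 1;
    	}
    	while ((c = fgetc(fp)) != EOF)	/* blocks until trace data arrives */
    		putchar(c);
    	fclose(fp);
    	return 0;
    }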
diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c
new file mode 100644
index 000000000..d14445573
--- /dev/null
+++ b/samples/bpf/tcp_iw_kern.c
@@ -0,0 +1,83 @@
1/* Copyright (c) 2017 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * BPF program to set initial congestion window and initial receive
8 * window to 40 packets and send and receive buffers to 1.5MB. This
9 * would usually be done after doing appropriate checks that indicate
10 * the hosts are far enough away (i.e. large RTT).
11 *
12 * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
13 */
14
15#include <uapi/linux/bpf.h>
16#include <uapi/linux/if_ether.h>
17#include <uapi/linux/if_packet.h>
18#include <uapi/linux/ip.h>
19#include <linux/socket.h>
20#include <bpf/bpf_helpers.h>
21#include <bpf/bpf_endian.h>
22
23#define DEBUG 1
24
25SEC("sockops")
26int bpf_iw(struct bpf_sock_ops *skops)
27{
28 int bufsize = 1500000;
29 int rwnd_init = 40;
30 int iw = 40;
31 int rv = 0;
32 int op;
33
34 /* For testing purposes, only execute rest of BPF program
35	 * if neither port number is 55601
36 */
37 if (bpf_ntohl(skops->remote_port) != 55601 &&
38 skops->local_port != 55601) {
39 skops->reply = -1;
40 return 1;
41 }
42
43 op = (int) skops->op;
44
45#ifdef DEBUG
46 bpf_printk("BPF command: %d\n", op);
47#endif
48
49	/* Usually there would be a check to ensure the hosts are far
50 * from each other so it makes sense to increase buffer sizes
51 */
52 switch (op) {
53 case BPF_SOCK_OPS_RWND_INIT:
54 rv = rwnd_init;
55 break;
56 case BPF_SOCK_OPS_TCP_CONNECT_CB:
57 /* Set sndbuf and rcvbuf of active connections */
58 rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
59 sizeof(bufsize));
60 rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
61 &bufsize, sizeof(bufsize));
62 break;
63 case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
64 rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW, &iw,
65 sizeof(iw));
66 break;
67 case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
68 /* Set sndbuf and rcvbuf of passive connections */
69 rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
70 sizeof(bufsize));
71 rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
72 &bufsize, sizeof(bufsize));
73 break;
74 default:
75 rv = -1;
76 }
77#ifdef DEBUG
78 bpf_printk("Returning %d\n", rv);
79#endif
80 skops->reply = rv;
81 return 1;
82}
83char _license[] SEC("license") = "GPL";
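
The 1.5MB buffer size is a bandwidth-delay-product choice rather than a magic constant. A rough, hedged sanity check (the RTT is an assumed figure, not taken from the sample):

    #include <stdio.h>

    int main(void)
    {
    	double buf_bytes = 1500000.0;	/* the SO_SNDBUF/SO_RCVBUF value above */
    	double rtt_s = 0.1;		/* assume a 100 ms "far away" path */

    	/* a buffer of B bytes sustains at most B * 8 / RTT bits per second */
    	printf("1.5 MB sustains about %.0f Mbit/s at 100 ms RTT\n",
    	       buf_bytes * 8.0 / rtt_s / 1e6);	/* prints 120 */
    	return 0;
    }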
diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c
new file mode 100644
index 000000000..223d9c23b
--- /dev/null
+++ b/samples/bpf/tcp_rwnd_kern.c
@@ -0,0 +1,64 @@
1/* Copyright (c) 2017 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * BPF program to set initial receive window to 40 packets when using IPv6
8 * and the first 5.5 bytes of the IPv6 addresses are not the same (in this
9 * example that means both hosts are not the same datacenter).
10 *
11 * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
12 */
13
14#include <uapi/linux/bpf.h>
15#include <uapi/linux/if_ether.h>
16#include <uapi/linux/if_packet.h>
17#include <uapi/linux/ip.h>
18#include <linux/socket.h>
19#include <bpf/bpf_helpers.h>
20#include <bpf/bpf_endian.h>
21
22#define DEBUG 1
23
24SEC("sockops")
25int bpf_rwnd(struct bpf_sock_ops *skops)
26{
27 int rv = -1;
28 int op;
29
30 /* For testing purposes, only execute rest of BPF program
31	 * if neither port number is 55601
32 */
33	if (bpf_ntohl(skops->remote_port) != 55601 &&
34	    skops->local_port != 55601) {
35 skops->reply = -1;
36 return 1;
37 }
38
39 op = (int) skops->op;
40
41#ifdef DEBUG
42 bpf_printk("BPF command: %d\n", op);
43#endif
44
45 /* Check for RWND_INIT operation and IPv6 addresses */
46 if (op == BPF_SOCK_OPS_RWND_INIT &&
47 skops->family == AF_INET6) {
48
49 /* If the first 5.5 bytes of the IPv6 address are not the same
50 * then both hosts are not in the same datacenter
51		 * so use a larger initial advertised window (40 packets)
52 */
53 if (skops->local_ip6[0] != skops->remote_ip6[0] ||
54		    (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) !=
55		    (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000))
56 rv = 40;
57 }
58#ifdef DEBUG
59 bpf_printk("Returning %d\n", rv);
60#endif
61 skops->reply = rv;
62 return 1;
63}
64char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c
new file mode 100644
index 000000000..d58004eef
--- /dev/null
+++ b/samples/bpf/tcp_synrto_kern.c
@@ -0,0 +1,64 @@
1/* Copyright (c) 2017 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * BPF program to set SYN and SYN-ACK RTOs to 10ms when using IPv6 addresses
8 * and the first 5.5 bytes of the IPv6 addresses are the same (in this example
9 * that means both hosts are in the same datacenter).
10 *
11 * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
12 */
13
14#include <uapi/linux/bpf.h>
15#include <uapi/linux/if_ether.h>
16#include <uapi/linux/if_packet.h>
17#include <uapi/linux/ip.h>
18#include <linux/socket.h>
19#include <bpf/bpf_helpers.h>
20#include <bpf/bpf_endian.h>
21
22#define DEBUG 1
23
24SEC("sockops")
25int bpf_synrto(struct bpf_sock_ops *skops)
26{
27 int rv = -1;
28 int op;
29
30 /* For testing purposes, only execute rest of BPF program
31	 * if neither port number is 55601
32 */
33 if (bpf_ntohl(skops->remote_port) != 55601 &&
34 skops->local_port != 55601) {
35 skops->reply = -1;
36 return 1;
37 }
38
39 op = (int) skops->op;
40
41#ifdef DEBUG
42 bpf_printk("BPF command: %d\n", op);
43#endif
44
45 /* Check for TIMEOUT_INIT operation and IPv6 addresses */
46 if (op == BPF_SOCK_OPS_TIMEOUT_INIT &&
47 skops->family == AF_INET6) {
48
49 /* If the first 5.5 bytes of the IPv6 address are the same
50 * then both hosts are in the same datacenter
51 * so use an RTO of 10ms
52 */
53 if (skops->local_ip6[0] == skops->remote_ip6[0] &&
54 (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
55 (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000))
56 rv = 10;
57 }
58#ifdef DEBUG
59 bpf_printk("Returning %d\n", rv);
60#endif
61 skops->reply = rv;
62 return 1;
63}
64char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_tos_reflect_kern.c b/samples/bpf/tcp_tos_reflect_kern.c
new file mode 100644
index 000000000..953fedc79
--- /dev/null
+++ b/samples/bpf/tcp_tos_reflect_kern.c
@@ -0,0 +1,80 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2018 Facebook
4 *
5 * BPF program to automatically reflect the TOS value from a received SYN packet
6 *
7 * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
8 */
9
10#include <uapi/linux/bpf.h>
11#include <uapi/linux/tcp.h>
12#include <uapi/linux/if_ether.h>
13#include <uapi/linux/if_packet.h>
14#include <uapi/linux/ip.h>
15#include <uapi/linux/ipv6.h>
16#include <uapi/linux/in.h>
17#include <linux/socket.h>
18#include <bpf/bpf_helpers.h>
19#include <bpf/bpf_endian.h>
20
21#define DEBUG 1
22
23SEC("sockops")
24int bpf_basertt(struct bpf_sock_ops *skops)
25{
26 char header[sizeof(struct ipv6hdr)];
27 struct ipv6hdr *hdr6;
28 struct iphdr *hdr;
29 int hdr_size = 0;
30 int save_syn = 1;
31 int tos = 0;
32 int rv = 0;
33 int op;
34
35 op = (int) skops->op;
36
37#ifdef DEBUG
38 bpf_printk("BPF command: %d\n", op);
39#endif
40 switch (op) {
41 case BPF_SOCK_OPS_TCP_LISTEN_CB:
42 rv = bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN,
43 &save_syn, sizeof(save_syn));
44 break;
45 case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
46 if (skops->family == AF_INET)
47 hdr_size = sizeof(struct iphdr);
48 else
49 hdr_size = sizeof(struct ipv6hdr);
50 rv = bpf_getsockopt(skops, SOL_TCP, TCP_SAVED_SYN,
51 header, hdr_size);
52 if (!rv) {
53 if (skops->family == AF_INET) {
54 hdr = (struct iphdr *) header;
55 tos = hdr->tos;
56 if (tos != 0)
57 bpf_setsockopt(skops, SOL_IP, IP_TOS,
58 &tos, sizeof(tos));
59 } else {
60 hdr6 = (struct ipv6hdr *) header;
61 tos = ((hdr6->priority) << 4 |
62 (hdr6->flow_lbl[0]) >> 4);
63 if (tos)
64 bpf_setsockopt(skops, SOL_IPV6,
65 IPV6_TCLASS,
66 &tos, sizeof(tos));
67 }
68 rv = 0;
69 }
70 break;
71 default:
72 rv = -1;
73 }
74#ifdef DEBUG
75 bpf_printk("Returning %d\n", rv);
76#endif
77 skops->reply = rv;
78 return 1;
79}
80char _license[] SEC("license") = "GPL";
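
In the IPv6 branch the traffic class straddles the first two header bytes (version:4, traffic class:8, flow label:20), which is why the sample stitches it back together from priority and flow_lbl[0]. A hedged demonstration of that recovery, assuming the little-endian bitfield layout of struct ipv6hdr and a made-up traffic class:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
    	/* first two wire bytes for version 6, traffic class 0xb8,
    	 * flow label 0: 0x6b 0x80
    	 */
    	uint8_t byte0 = 0x6b, byte1 = 0x80;

    	/* on little-endian, ipv6hdr::priority is the low nibble of
    	 * byte0 and ipv6hdr::flow_lbl[0] is byte1
    	 */
    	uint8_t priority = byte0 & 0x0f;
    	uint8_t flow_lbl0 = byte1;
    	int tos = (priority << 4) | (flow_lbl0 >> 4);

    	printf("recovered traffic class: 0x%02x\n", tos);	/* 0xb8 */
    	return 0;
    }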
diff --git a/samples/bpf/test_cgrp2_array_pin.c b/samples/bpf/test_cgrp2_array_pin.c
new file mode 100644
index 000000000..6d564aa75
--- /dev/null
+++ b/samples/bpf/test_cgrp2_array_pin.c
@@ -0,0 +1,106 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2016 Facebook
3 */
4#include <linux/unistd.h>
5#include <linux/bpf.h>
6
7#include <stdio.h>
8#include <stdint.h>
9#include <unistd.h>
10#include <string.h>
11#include <errno.h>
12#include <fcntl.h>
13
14#include <bpf/bpf.h>
15
16static void usage(void)
17{
18 printf("Usage: test_cgrp2_array_pin [...]\n");
19 printf(" -F <file> File to pin an BPF cgroup array\n");
20 printf(" -U <file> Update an already pinned BPF cgroup array\n");
21 printf(" -v <value> Full path of the cgroup2\n");
22 printf(" -h Display this help\n");
23}
24
25int main(int argc, char **argv)
26{
27 const char *pinned_file = NULL, *cg2 = NULL;
28 int create_array = 1;
29 int array_key = 0;
30 int array_fd = -1;
31 int cg2_fd = -1;
32 int ret = -1;
33 int opt;
34
35 while ((opt = getopt(argc, argv, "F:U:v:")) != -1) {
36 switch (opt) {
37 /* General args */
38 case 'F':
39 pinned_file = optarg;
40 break;
41 case 'U':
42 pinned_file = optarg;
43 create_array = 0;
44 break;
45 case 'v':
46 cg2 = optarg;
47 break;
48 default:
49 usage();
50 goto out;
51 }
52 }
53
54 if (!cg2 || !pinned_file) {
55 usage();
56 goto out;
57 }
58
59 cg2_fd = open(cg2, O_RDONLY);
60 if (cg2_fd < 0) {
61 fprintf(stderr, "open(%s,...): %s(%d)\n",
62 cg2, strerror(errno), errno);
63 goto out;
64 }
65
66 if (create_array) {
67 array_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY,
68 sizeof(uint32_t), sizeof(uint32_t),
69 1, 0);
70 if (array_fd < 0) {
71 fprintf(stderr,
72 "bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY,...): %s(%d)\n",
73 strerror(errno), errno);
74 goto out;
75 }
76 } else {
77 array_fd = bpf_obj_get(pinned_file);
78 if (array_fd < 0) {
79 fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n",
80 pinned_file, strerror(errno), errno);
81 goto out;
82 }
83 }
84
85 ret = bpf_map_update_elem(array_fd, &array_key, &cg2_fd, 0);
86 if (ret) {
87 perror("bpf_map_update_elem");
88 goto out;
89 }
90
91 if (create_array) {
92 ret = bpf_obj_pin(array_fd, pinned_file);
93 if (ret) {
94 fprintf(stderr, "bpf_obj_pin(..., %s): %s(%d)\n",
95 pinned_file, strerror(errno), errno);
96 goto out;
97 }
98 }
99
100out:
101 if (array_fd != -1)
102 close(array_fd);
103 if (cg2_fd != -1)
104 close(cg2_fd);
105 return ret;
106}
diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c
new file mode 100644
index 000000000..20fbd1241
--- /dev/null
+++ b/samples/bpf/test_cgrp2_attach.c
@@ -0,0 +1,172 @@
1/* eBPF example program:
2 *
3 * - Creates arraymap in kernel with 4-byte keys and 8-byte values
4 *
5 * - Loads eBPF program
6 *
7 * The eBPF program accesses the map passed in to store two pieces of
8 * information. The number of invocations of the program, which maps
9 * to the number of packets received, is stored to key 0. Key 1 is
10 * incremented on each iteration by the number of bytes stored in
11 * the skb.
12 *
13 * - Attaches the new program to a cgroup using BPF_PROG_ATTACH
14 *
15 * - Every second, reads map[0] and map[1] to see how many bytes and
16 * packets were seen on any socket of tasks in the given cgroup.
17 */
18
19#define _GNU_SOURCE
20
21#include <stdio.h>
22#include <stdlib.h>
23#include <stddef.h>
24#include <string.h>
25#include <unistd.h>
26#include <assert.h>
27#include <errno.h>
28#include <fcntl.h>
29
30#include <linux/bpf.h>
31#include <bpf/bpf.h>
32
33#include "bpf_insn.h"
34
35enum {
36 MAP_KEY_PACKETS,
37 MAP_KEY_BYTES,
38};
39
40char bpf_log_buf[BPF_LOG_BUF_SIZE];
41
42static int prog_load(int map_fd, int verdict)
43{
44 struct bpf_insn prog[] = {
44		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* save ctx (r1) in r6, which survives BPF_CALL */
46
47 /* Count packets */
48 BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
49 BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
50 BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
51 BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
52 BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* load map fd to r1 */
53 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
54 BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
55 BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
56 BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
57
58 /* Count bytes */
59 BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
60 BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
61 BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
62 BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
63 BPF_LD_MAP_FD(BPF_REG_1, map_fd),
64 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
65 BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
66 BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
67 BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
68
69 BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
70 BPF_EXIT_INSN(),
71 };
72 size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
73
74 return bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB,
75 prog, insns_cnt, "GPL", 0,
76 bpf_log_buf, BPF_LOG_BUF_SIZE);
77}
78
79static int usage(const char *argv0)
80{
81 printf("Usage: %s [-d] [-D] <cg-path> <egress|ingress>\n", argv0);
82 printf(" -d Drop Traffic\n");
83 printf(" -D Detach filter, and exit\n");
84 return EXIT_FAILURE;
85}
86
87static int attach_filter(int cg_fd, int type, int verdict)
88{
89 int prog_fd, map_fd, ret, key;
90 long long pkt_cnt, byte_cnt;
91
92 map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY,
93 sizeof(key), sizeof(byte_cnt),
94 256, 0);
95 if (map_fd < 0) {
96 printf("Failed to create map: '%s'\n", strerror(errno));
97 return EXIT_FAILURE;
98 }
99
100 prog_fd = prog_load(map_fd, verdict);
101 printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf);
102
103 if (prog_fd < 0) {
104 printf("Failed to load prog: '%s'\n", strerror(errno));
105 return EXIT_FAILURE;
106 }
107
108 ret = bpf_prog_attach(prog_fd, cg_fd, type, 0);
109 if (ret < 0) {
110 printf("Failed to attach prog to cgroup: '%s'\n",
111 strerror(errno));
112 return EXIT_FAILURE;
113 }
114 while (1) {
115 key = MAP_KEY_PACKETS;
116 assert(bpf_map_lookup_elem(map_fd, &key, &pkt_cnt) == 0);
117
118 key = MAP_KEY_BYTES;
119 assert(bpf_map_lookup_elem(map_fd, &key, &byte_cnt) == 0);
120
121 printf("cgroup received %lld packets, %lld bytes\n",
122 pkt_cnt, byte_cnt);
123 sleep(1);
124 }
125
126 return EXIT_SUCCESS;
127}
128
129int main(int argc, char **argv)
130{
131 int detach_only = 0, verdict = 1;
132 enum bpf_attach_type type;
133 int opt, cg_fd, ret;
134
135 while ((opt = getopt(argc, argv, "Dd")) != -1) {
136 switch (opt) {
137 case 'd':
138 verdict = 0;
139 break;
140 case 'D':
141 detach_only = 1;
142 break;
143 default:
144 return usage(argv[0]);
145 }
146 }
147
148 if (argc - optind < 2)
149 return usage(argv[0]);
150
151 if (strcmp(argv[optind + 1], "ingress") == 0)
152 type = BPF_CGROUP_INET_INGRESS;
153 else if (strcmp(argv[optind + 1], "egress") == 0)
154 type = BPF_CGROUP_INET_EGRESS;
155 else
156 return usage(argv[0]);
157
158 cg_fd = open(argv[optind], O_DIRECTORY | O_RDONLY);
159 if (cg_fd < 0) {
160 printf("Failed to open cgroup path: '%s'\n", strerror(errno));
161 return EXIT_FAILURE;
162 }
163
164 if (detach_only) {
165 ret = bpf_prog_detach(cg_fd, type);
166 printf("bpf_prog_detach() returned '%s' (%d)\n",
167 strerror(errno), errno);
168 } else
169 ret = attach_filter(cg_fd, type, verdict);
170
171 return ret;
172}
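
For readability, the hand-assembled instruction array in prog_load() corresponds roughly to the restricted-C sketch below (illustration only; the map and function names are invented, and the sample deliberately uses raw instructions so that it needs no separate *_kern.o):

    #include <uapi/linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
    	__uint(type, BPF_MAP_TYPE_ARRAY);
    	__type(key, __u32);
    	__type(value, __u64);
    	__uint(max_entries, 256);
    } queue_stats SEC(".maps");		/* hypothetical map name */

    SEC("cgroup_skb/egress")
    int count_skb(struct __sk_buff *skb)
    {
    	__u32 key = 0;			/* MAP_KEY_PACKETS */
    	__u64 *val;

    	val = bpf_map_lookup_elem(&queue_stats, &key);
    	if (val)
    		__sync_fetch_and_add(val, 1);	/* the BPF_XADD above */

    	key = 1;			/* MAP_KEY_BYTES */
    	val = bpf_map_lookup_elem(&queue_stats, &key);
    	if (val)
    		__sync_fetch_and_add(val, skb->len);

    	return 1;			/* verdict: allow */
    }

    char _license[] SEC("license") = "GPL";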
diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
new file mode 100644
index 000000000..b0811da5a
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -0,0 +1,290 @@
1/* eBPF example program:
2 *
3 * - Loads eBPF program
4 *
5 * The eBPF program sets the sk_bound_dev_if index in new AF_INET{6}
6 * sockets opened by processes in the cgroup.
7 *
8 * - Attaches the new program to a cgroup using BPF_PROG_ATTACH
9 */
10
11#define _GNU_SOURCE
12
13#include <stdio.h>
14#include <stdlib.h>
15#include <stddef.h>
16#include <string.h>
17#include <unistd.h>
18#include <assert.h>
19#include <errno.h>
20#include <fcntl.h>
21#include <net/if.h>
22#include <inttypes.h>
23#include <linux/bpf.h>
24#include <bpf/bpf.h>
25
26#include "bpf_insn.h"
27
28char bpf_log_buf[BPF_LOG_BUF_SIZE];
29
30static int prog_load(__u32 idx, __u32 mark, __u32 prio)
31{
32 /* save pointer to context */
33 struct bpf_insn prog_start[] = {
34 BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
35 };
36 struct bpf_insn prog_end[] = {
37 BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */
38 BPF_EXIT_INSN(),
39 };
40
41 /* set sk_bound_dev_if on socket */
42 struct bpf_insn prog_dev[] = {
43 BPF_MOV64_IMM(BPF_REG_3, idx),
44 BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)),
45 BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)),
46 };
47
48 /* set mark on socket */
49 struct bpf_insn prog_mark[] = {
50 /* get uid of process */
51 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
52 BPF_FUNC_get_current_uid_gid),
53 BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffffffff),
54
55 /* if uid is 0, use given mark, else use the uid as the mark */
56 BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
57 BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
58 BPF_MOV64_IMM(BPF_REG_3, mark),
59
60 /* set the mark on the new socket */
61 BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
62 BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, mark)),
63 BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, mark)),
64 };
65
66 /* set priority on socket */
67 struct bpf_insn prog_prio[] = {
68 BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
69 BPF_MOV64_IMM(BPF_REG_3, prio),
70 BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, priority)),
71 BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, priority)),
72 };
73
74 struct bpf_insn *prog;
75 size_t insns_cnt;
76 void *p;
77 int ret;
78
79 insns_cnt = sizeof(prog_start) + sizeof(prog_end);
80 if (idx)
81 insns_cnt += sizeof(prog_dev);
82
83 if (mark)
84 insns_cnt += sizeof(prog_mark);
85
86 if (prio)
87 insns_cnt += sizeof(prog_prio);
88
89 p = prog = malloc(insns_cnt);
90 if (!prog) {
91 fprintf(stderr, "Failed to allocate memory for instructions\n");
92 return EXIT_FAILURE;
93 }
94
95 memcpy(p, prog_start, sizeof(prog_start));
96 p += sizeof(prog_start);
97
98 if (idx) {
99 memcpy(p, prog_dev, sizeof(prog_dev));
100 p += sizeof(prog_dev);
101 }
102
103 if (mark) {
104 memcpy(p, prog_mark, sizeof(prog_mark));
105 p += sizeof(prog_mark);
106 }
107
108 if (prio) {
109 memcpy(p, prog_prio, sizeof(prog_prio));
110 p += sizeof(prog_prio);
111 }
112
113 memcpy(p, prog_end, sizeof(prog_end));
114 p += sizeof(prog_end);
115
116 insns_cnt /= sizeof(struct bpf_insn);
117
118 ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt,
119 "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE);
120
121 free(prog);
122
123 return ret;
124}
125
126static int get_bind_to_device(int sd, char *name, size_t len)
127{
128 socklen_t optlen = len;
129 int rc;
130
131 name[0] = '\0';
132 rc = getsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, name, &optlen);
133 if (rc < 0)
134		perror("getsockopt(SO_BINDTODEVICE)");
135
136 return rc;
137}
138
139static unsigned int get_somark(int sd)
140{
141 unsigned int mark = 0;
142 socklen_t optlen = sizeof(mark);
143 int rc;
144
145 rc = getsockopt(sd, SOL_SOCKET, SO_MARK, &mark, &optlen);
146 if (rc < 0)
147 perror("getsockopt(SO_MARK)");
148
149 return mark;
150}
151
152static unsigned int get_priority(int sd)
153{
154 unsigned int prio = 0;
155 socklen_t optlen = sizeof(prio);
156 int rc;
157
158 rc = getsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, &optlen);
159 if (rc < 0)
160 perror("getsockopt(SO_PRIORITY)");
161
162 return prio;
163}
164
165static int show_sockopts(int family)
166{
167 unsigned int mark, prio;
168 char name[16];
169 int sd;
170
171 sd = socket(family, SOCK_DGRAM, 17);
172 if (sd < 0) {
173 perror("socket");
174 return 1;
175 }
176
177 if (get_bind_to_device(sd, name, sizeof(name)) < 0)
178 return 1;
179
180 mark = get_somark(sd);
181 prio = get_priority(sd);
182
183 close(sd);
184
185 printf("sd %d: dev %s, mark %u, priority %u\n", sd, name, mark, prio);
186
187 return 0;
188}
189
190static int usage(const char *argv0)
191{
192 printf("Usage:\n");
193 printf(" Attach a program\n");
194 printf(" %s -b bind-to-dev -m mark -p prio cg-path\n", argv0);
195 printf("\n");
196 printf(" Detach a program\n");
197 printf(" %s -d cg-path\n", argv0);
198 printf("\n");
199 printf(" Show inherited socket settings (mark, priority, and device)\n");
200 printf(" %s [-6]\n", argv0);
201 return EXIT_FAILURE;
202}
203
204int main(int argc, char **argv)
205{
206 __u32 idx = 0, mark = 0, prio = 0;
207 const char *cgrp_path = NULL;
208 int cg_fd, prog_fd, ret;
209 int family = PF_INET;
210 int do_attach = 1;
211 int rc;
212
213 while ((rc = getopt(argc, argv, "db:m:p:6")) != -1) {
214 switch (rc) {
215 case 'd':
216 do_attach = 0;
217 break;
218 case 'b':
219 idx = if_nametoindex(optarg);
220 if (!idx) {
221 idx = strtoumax(optarg, NULL, 0);
222 if (!idx) {
223 printf("Invalid device name\n");
224 return EXIT_FAILURE;
225 }
226 }
227 break;
228 case 'm':
229 mark = strtoumax(optarg, NULL, 0);
230 break;
231 case 'p':
232 prio = strtoumax(optarg, NULL, 0);
233 break;
234 case '6':
235 family = PF_INET6;
236 break;
237 default:
238 return usage(argv[0]);
239 }
240 }
241
242 if (optind == argc)
243 return show_sockopts(family);
244
245 cgrp_path = argv[optind];
246 if (!cgrp_path) {
247 fprintf(stderr, "cgroup path not given\n");
248 return EXIT_FAILURE;
249 }
250
251 if (do_attach && !idx && !mark && !prio) {
252 fprintf(stderr,
253 "One of device, mark or priority must be given\n");
254 return EXIT_FAILURE;
255 }
256
257 cg_fd = open(cgrp_path, O_DIRECTORY | O_RDONLY);
258 if (cg_fd < 0) {
259 printf("Failed to open cgroup path: '%s'\n", strerror(errno));
260 return EXIT_FAILURE;
261 }
262
263 if (do_attach) {
264 prog_fd = prog_load(idx, mark, prio);
265 if (prog_fd < 0) {
266 printf("Failed to load prog: '%s'\n", strerror(errno));
267 printf("Output from kernel verifier:\n%s\n-------\n",
268 bpf_log_buf);
269 return EXIT_FAILURE;
270 }
271
272 ret = bpf_prog_attach(prog_fd, cg_fd,
273 BPF_CGROUP_INET_SOCK_CREATE, 0);
274 if (ret < 0) {
275 printf("Failed to attach prog to cgroup: '%s'\n",
276 strerror(errno));
277 return EXIT_FAILURE;
278 }
279 } else {
280 ret = bpf_prog_detach(cg_fd, BPF_CGROUP_INET_SOCK_CREATE);
281 if (ret < 0) {
282 printf("Failed to detach prog from cgroup: '%s'\n",
283 strerror(errno));
284 return EXIT_FAILURE;
285 }
286 }
287
288 close(cg_fd);
289 return EXIT_SUCCESS;
290}
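
The generated instructions store directly into struct bpf_sock fields, which BPF_PROG_TYPE_CGROUP_SOCK programs are allowed to write. A hedged restricted-C equivalent (the constants stand in for the -b/-m/-p arguments):

    #include <uapi/linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    #define IFINDEX	2	/* hypothetical device index (-b) */
    #define MARK	666	/* hypothetical mark (-m) */
    #define PRIO	123	/* hypothetical priority (-p) */

    SEC("cgroup/sock")
    int set_sock_opts(struct bpf_sock *sk)
    {
    	__u32 uid = (__u32)bpf_get_current_uid_gid();

    	sk->bound_dev_if = IFINDEX;
    	sk->mark = uid ? uid : MARK;	/* same uid-or-mark rule as above */
    	sk->priority = PRIO;
    	return 1;			/* allow socket creation */
    }

    char _license[] SEC("license") = "GPL";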
diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh
new file mode 100755
index 000000000..9f6174236
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock.sh
@@ -0,0 +1,135 @@
1#!/bin/sh
2# SPDX-License-Identifier: GPL-2.0
3
4# Test various socket options that can be set by attaching programs to cgroups.
5
6CGRP_MNT="/tmp/cgroupv2-test_cgrp2_sock"
7
8################################################################################
9#
10print_result()
11{
12 local rc=$1
13 local status=" OK "
14
15 [ $rc -ne 0 ] && status="FAIL"
16
17 printf "%-50s [%4s]\n" "$2" "$status"
18}
19
20check_sock()
21{
22 out=$(test_cgrp2_sock)
23 echo $out | grep -q "$1"
24 if [ $? -ne 0 ]; then
25 print_result 1 "IPv4: $2"
26 echo " expected: $1"
27 echo " have: $out"
28 rc=1
29 else
30 print_result 0 "IPv4: $2"
31 fi
32}
33
34check_sock6()
35{
36 out=$(test_cgrp2_sock -6)
37 echo $out | grep -q "$1"
38 if [ $? -ne 0 ]; then
39 print_result 1 "IPv6: $2"
40 echo " expected: $1"
41 echo " have: $out"
42 rc=1
43 else
44 print_result 0 "IPv6: $2"
45 fi
46}
47
48################################################################################
49#
50
51cleanup()
52{
53 echo $$ >> ${CGRP_MNT}/cgroup.procs
54 rmdir ${CGRP_MNT}/sockopts
55}
56
57cleanup_and_exit()
58{
59 local rc=$1
60 local msg="$2"
61
62 [ -n "$msg" ] && echo "ERROR: $msg"
63
64 test_cgrp2_sock -d ${CGRP_MNT}/sockopts
65 ip li del cgrp2_sock
66 umount ${CGRP_MNT}
67
68 exit $rc
69}
70
71
72################################################################################
73# main
74
75rc=0
76
77ip li add cgrp2_sock type dummy 2>/dev/null
78
79set -e
80mkdir -p ${CGRP_MNT}
81mount -t cgroup2 none ${CGRP_MNT}
82set +e
83
84
85# make sure we have a known start point
86cleanup 2>/dev/null
87
88mkdir -p ${CGRP_MNT}/sockopts
89[ $? -ne 0 ] && cleanup_and_exit 1 "Failed to create cgroup hierarchy"
90
91
92# set pid into cgroup
93echo $$ > ${CGRP_MNT}/sockopts/cgroup.procs
94
95# no bpf program attached, so socket should show no settings
96check_sock "dev , mark 0, priority 0" "No programs attached"
97check_sock6 "dev , mark 0, priority 0" "No programs attached"
98
99# verify device is set
100#
101test_cgrp2_sock -b cgrp2_sock ${CGRP_MNT}/sockopts
102if [ $? -ne 0 ]; then
103 cleanup_and_exit 1 "Failed to install program to set device"
104fi
105check_sock "dev cgrp2_sock, mark 0, priority 0" "Device set"
106check_sock6 "dev cgrp2_sock, mark 0, priority 0" "Device set"
107
108# verify mark is set
109#
110test_cgrp2_sock -m 666 ${CGRP_MNT}/sockopts
111if [ $? -ne 0 ]; then
112 cleanup_and_exit 1 "Failed to install program to set mark"
113fi
114check_sock "dev , mark 666, priority 0" "Mark set"
115check_sock6 "dev , mark 666, priority 0" "Mark set"
116
117# verify priority is set
118#
119test_cgrp2_sock -p 123 ${CGRP_MNT}/sockopts
120if [ $? -ne 0 ]; then
121 cleanup_and_exit 1 "Failed to install program to set priority"
122fi
123check_sock "dev , mark 0, priority 123" "Priority set"
124check_sock6 "dev , mark 0, priority 123" "Priority set"
125
126# all 3 at once
127#
128test_cgrp2_sock -b cgrp2_sock -m 666 -p 123 ${CGRP_MNT}/sockopts
129if [ $? -ne 0 ]; then
130 cleanup_and_exit 1 "Failed to install program to set device, mark and priority"
131fi
132check_sock "dev cgrp2_sock, mark 666, priority 123" "Device, mark and priority set"
133check_sock6 "dev cgrp2_sock, mark 666, priority 123" "Device, mark and priority set"
134
135cleanup_and_exit $rc
diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c
new file mode 100644
index 000000000..a9277b118
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock2.c
@@ -0,0 +1,68 @@
1// SPDX-License-Identifier: GPL-2.0
2/* eBPF example program:
3 *
4 * - Loads eBPF program
5 *
6 * The eBPF program loads a filter from file and attaches the
7 * program to a cgroup using BPF_PROG_ATTACH
8 */
9
10#define _GNU_SOURCE
11
12#include <stdio.h>
13#include <stdlib.h>
14#include <stddef.h>
15#include <string.h>
16#include <unistd.h>
17#include <assert.h>
18#include <errno.h>
19#include <fcntl.h>
20#include <net/if.h>
21#include <linux/bpf.h>
22#include <bpf/bpf.h>
23
24#include "bpf_insn.h"
25#include "bpf_load.h"
26
27static int usage(const char *argv0)
28{
29 printf("Usage: %s cg-path filter-path [filter-id]\n", argv0);
30 return EXIT_FAILURE;
31}
32
33int main(int argc, char **argv)
34{
35 int cg_fd, ret, filter_id = 0;
36
37 if (argc < 3)
38 return usage(argv[0]);
39
40 cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY);
41 if (cg_fd < 0) {
42 printf("Failed to open cgroup path: '%s'\n", strerror(errno));
43 return EXIT_FAILURE;
44 }
45
46 if (load_bpf_file(argv[2]))
47 return EXIT_FAILURE;
48
49 printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf);
50
51 if (argc > 3)
52 filter_id = atoi(argv[3]);
53
54 if (filter_id >= prog_cnt) {
55 printf("Invalid program id; program not found in file\n");
56 return EXIT_FAILURE;
57 }
58
59 ret = bpf_prog_attach(prog_fd[filter_id], cg_fd,
60 BPF_CGROUP_INET_SOCK_CREATE, 0);
61 if (ret < 0) {
62 printf("Failed to attach prog to cgroup: '%s'\n",
63 strerror(errno));
64 return EXIT_FAILURE;
65 }
66
67 return EXIT_SUCCESS;
68}
diff --git a/samples/bpf/test_cgrp2_sock2.sh b/samples/bpf/test_cgrp2_sock2.sh
new file mode 100755
index 000000000..0f396a86e
--- /dev/null
+++ b/samples/bpf/test_cgrp2_sock2.sh
@@ -0,0 +1,85 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3
4function config_device {
5 ip netns add at_ns0
6 ip link add veth0 type veth peer name veth0b
7 ip link set veth0b up
8 ip link set veth0 netns at_ns0
9 ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
10 ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad
11 ip netns exec at_ns0 ip link set dev veth0 up
12 ip addr add 172.16.1.101/24 dev veth0b
13 ip addr add 2401:db00::2/64 dev veth0b nodad
14}
15
16function config_cgroup {
17 rm -rf /tmp/cgroupv2
18 mkdir -p /tmp/cgroupv2
19 mount -t cgroup2 none /tmp/cgroupv2
20 mkdir -p /tmp/cgroupv2/foo
21 echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
22}
23
24
25function attach_bpf {
26 test_cgrp2_sock2 /tmp/cgroupv2/foo sock_flags_kern.o $1
27 [ $? -ne 0 ] && exit 1
28}
29
30function cleanup {
31 if [ -d /tmp/cgroupv2/foo ]; then
32 test_cgrp2_sock -d /tmp/cgroupv2/foo
33 fi
34 ip link del veth0b
35 ip netns delete at_ns0
36 umount /tmp/cgroupv2
37 rm -rf /tmp/cgroupv2
38}
39
40cleanup 2>/dev/null
41
42set -e
43config_device
44config_cgroup
45set +e
46
47#
48# Test 1 - fail ping6
49#
50attach_bpf 0
51ping -c1 -w1 172.16.1.100
52if [ $? -ne 0 ]; then
53 echo "ping failed when it should succeed"
54 cleanup
55 exit 1
56fi
57
58ping6 -c1 -w1 2401:db00::1
59if [ $? -eq 0 ]; then
60 echo "ping6 succeeded when it should not"
61 cleanup
62 exit 1
63fi
64
65#
66# Test 2 - fail ping
67#
68attach_bpf 1
69ping6 -c1 -w1 2401:db00::1
70if [ $? -ne 0 ]; then
71 echo "ping6 failed when it should succeed"
72 cleanup
73 exit 1
74fi
75
76ping -c1 -w1 172.16.1.100
77if [ $? -eq 0 ]; then
78 echo "ping succeeded when it should not"
79 cleanup
80 exit 1
81fi
82
83cleanup
84echo
85echo "*** PASS ***"
diff --git a/samples/bpf/test_cgrp2_tc.sh b/samples/bpf/test_cgrp2_tc.sh
new file mode 100755
index 000000000..12faf5847
--- /dev/null
+++ b/samples/bpf/test_cgrp2_tc.sh
@@ -0,0 +1,185 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3
4MY_DIR=$(dirname $0)
5# Details on the bpf prog
6BPF_CGRP2_ARRAY_NAME='test_cgrp2_array_pin'
7BPF_PROG="$MY_DIR/test_cgrp2_tc_kern.o"
8BPF_SECTION='filter'
9
10[ -z "$TC" ] && TC='tc'
11[ -z "$IP" ] && IP='ip'
12
13# Names of the veth interface, net namespace...etc.
14HOST_IFC='ve'
15NS_IFC='vens'
16NS='ns'
17
18find_mnt() {
19 cat /proc/mounts | \
20 awk '{ if ($3 == "'$1'" && mnt == "") { mnt = $2 }} END { print mnt }'
21}
22
23# Init cgroup2 vars
24init_cgrp2_vars() {
25 CGRP2_ROOT=$(find_mnt cgroup2)
26 if [ -z "$CGRP2_ROOT" ]
27 then
28 CGRP2_ROOT='/mnt/cgroup2'
29 MOUNT_CGRP2="yes"
30 fi
31 CGRP2_TC="$CGRP2_ROOT/tc"
32 CGRP2_TC_LEAF="$CGRP2_TC/leaf"
33}
34
35# Init bpf fs vars
36init_bpf_fs_vars() {
37 local bpf_fs_root=$(find_mnt bpf)
38 [ -n "$bpf_fs_root" ] || return -1
39 BPF_FS_TC_SHARE="$bpf_fs_root/tc/globals"
40}
41
42setup_cgrp2() {
43 case $1 in
44 start)
45 if [ "$MOUNT_CGRP2" == 'yes' ]
46 then
47 [ -d $CGRP2_ROOT ] || mkdir -p $CGRP2_ROOT
48 mount -t cgroup2 none $CGRP2_ROOT || return $?
49 fi
50 mkdir -p $CGRP2_TC_LEAF
51 ;;
52 *)
53 rmdir $CGRP2_TC_LEAF && rmdir $CGRP2_TC
54 [ "$MOUNT_CGRP2" == 'yes' ] && umount $CGRP2_ROOT
55 ;;
56 esac
57}
58
59setup_bpf_cgrp2_array() {
60 local bpf_cgrp2_array="$BPF_FS_TC_SHARE/$BPF_CGRP2_ARRAY_NAME"
61 case $1 in
62 start)
63 $MY_DIR/test_cgrp2_array_pin -U $bpf_cgrp2_array -v $CGRP2_TC
64 ;;
65 *)
66 [ -d "$BPF_FS_TC_SHARE" ] && rm -f $bpf_cgrp2_array
67 ;;
68 esac
69}
70
71setup_net() {
72 case $1 in
73 start)
74 $IP link add $HOST_IFC type veth peer name $NS_IFC || return $?
75 $IP link set dev $HOST_IFC up || return $?
76 sysctl -q net.ipv6.conf.$HOST_IFC.accept_dad=0
77
78 $IP netns add ns || return $?
79 $IP link set dev $NS_IFC netns ns || return $?
80 $IP -n $NS link set dev $NS_IFC up || return $?
81 $IP netns exec $NS sysctl -q net.ipv6.conf.$NS_IFC.accept_dad=0
82 $TC qdisc add dev $HOST_IFC clsact || return $?
83 $TC filter add dev $HOST_IFC egress bpf da obj $BPF_PROG sec $BPF_SECTION || return $?
84 ;;
85 *)
86 $IP netns del $NS
87 $IP link del $HOST_IFC
88 ;;
89 esac
90}
91
92run_in_cgrp() {
93 # Fork another bash and move it under the specified cgroup.
94 # It makes the cgroup cleanup easier at the end of the test.
95 cmd='echo $$ > '
96 cmd="$cmd $1/cgroup.procs; exec $2"
97 bash -c "$cmd"
98}
99
100do_test() {
101 run_in_cgrp $CGRP2_TC_LEAF "ping -6 -c3 ff02::1%$HOST_IFC >& /dev/null"
102 local dropped=$($TC -s qdisc show dev $HOST_IFC | tail -3 | \
103 awk '/drop/{print substr($7, 0, index($7, ",")-1)}')
104 if [[ $dropped -eq 0 ]]
105 then
106 echo "FAIL"
107 return 1
108 else
109 echo "Successfully filtered $dropped packets"
110 return 0
111 fi
112}
113
114do_exit() {
115 if [ "$DEBUG" == "yes" ] && [ "$MODE" != 'cleanuponly' ]
116 then
117 echo "------ DEBUG ------"
118 echo "mount: "; mount | egrep '(cgroup2|bpf)'; echo
119 echo "$CGRP2_TC_LEAF: "; ls -l $CGRP2_TC_LEAF; echo
120 if [ -d "$BPF_FS_TC_SHARE" ]
121 then
122 echo "$BPF_FS_TC_SHARE: "; ls -l $BPF_FS_TC_SHARE; echo
123 fi
124 echo "Host net:"
125 $IP netns
126 $IP link show dev $HOST_IFC
127 $IP -6 a show dev $HOST_IFC
128 $TC -s qdisc show dev $HOST_IFC
129 echo
130 echo "$NS net:"
131 $IP -n $NS link show dev $NS_IFC
132 $IP -n $NS -6 link show dev $NS_IFC
133 echo "------ DEBUG ------"
134 echo
135 fi
136
137 if [ "$MODE" != 'nocleanup' ]
138 then
139 setup_net stop
140 setup_bpf_cgrp2_array stop
141 setup_cgrp2 stop
142 fi
143}
144
145init_cgrp2_vars
146init_bpf_fs_vars
147
148while [[ $# -ge 1 ]]
149do
150 a="$1"
151 case $a in
152 debug)
153 DEBUG='yes'
154 shift 1
155 ;;
156 cleanup-only)
157 MODE='cleanuponly'
158 shift 1
159 ;;
160 no-cleanup)
161 MODE='nocleanup'
162 shift 1
163 ;;
164 *)
165 echo "test_cgrp2_tc [debug] [cleanup-only | no-cleanup]"
166 echo " debug: Print cgrp and network setup details at the end of the test"
167 echo " cleanup-only: Try to cleanup things from last test. No test will be run"
168 echo " no-cleanup: Run the test but don't do cleanup at the end"
169 echo "[Note: If no arg is given, it will run the test and do cleanup at the end]"
170 echo
171 exit -1
172 ;;
173 esac
174done
175
176trap do_exit 0
177
178[ "$MODE" == 'cleanuponly' ] && exit
179
180setup_cgrp2 start || exit $?
181setup_net start || exit $?
182init_bpf_fs_vars || exit $?
183setup_bpf_cgrp2_array start || exit $?
184do_test
185echo
diff --git a/samples/bpf/test_cgrp2_tc_kern.c b/samples/bpf/test_cgrp2_tc_kern.c
new file mode 100644
index 000000000..4dd532a31
--- /dev/null
+++ b/samples/bpf/test_cgrp2_tc_kern.c
@@ -0,0 +1,70 @@
1/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#define KBUILD_MODNAME "foo"
8#include <uapi/linux/if_ether.h>
9#include <uapi/linux/in6.h>
10#include <uapi/linux/ipv6.h>
11#include <uapi/linux/pkt_cls.h>
12#include <uapi/linux/bpf.h>
13#include <bpf/bpf_helpers.h>
14
15/* copy of 'struct ethhdr' without __packed */
16struct eth_hdr {
17 unsigned char h_dest[ETH_ALEN];
18 unsigned char h_source[ETH_ALEN];
19 unsigned short h_proto;
20};
21
22#define PIN_GLOBAL_NS 2
23struct bpf_elf_map {
24 __u32 type;
25 __u32 size_key;
26 __u32 size_value;
27 __u32 max_elem;
28 __u32 flags;
29 __u32 id;
30 __u32 pinning;
31};
32
33struct bpf_elf_map SEC("maps") test_cgrp2_array_pin = {
34 .type = BPF_MAP_TYPE_CGROUP_ARRAY,
35 .size_key = sizeof(uint32_t),
36 .size_value = sizeof(uint32_t),
37 .pinning = PIN_GLOBAL_NS,
38 .max_elem = 1,
39};
40
41SEC("filter")
42int handle_egress(struct __sk_buff *skb)
43{
44 void *data = (void *)(long)skb->data;
45 struct eth_hdr *eth = data;
46 struct ipv6hdr *ip6h = data + sizeof(*eth);
47 void *data_end = (void *)(long)skb->data_end;
48 char dont_care_msg[] = "dont care %04x %d\n";
49 char pass_msg[] = "pass\n";
50 char reject_msg[] = "reject\n";
51
52 /* single length check */
53 if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
54 return TC_ACT_OK;
55
56 if (eth->h_proto != htons(ETH_P_IPV6) ||
57 ip6h->nexthdr != IPPROTO_ICMPV6) {
58 bpf_trace_printk(dont_care_msg, sizeof(dont_care_msg),
59 eth->h_proto, ip6h->nexthdr);
60 return TC_ACT_OK;
61 } else if (bpf_skb_under_cgroup(skb, &test_cgrp2_array_pin, 0) != 1) {
62 bpf_trace_printk(pass_msg, sizeof(pass_msg));
63 return TC_ACT_OK;
64 } else {
65 bpf_trace_printk(reject_msg, sizeof(reject_msg));
66 return TC_ACT_SHOT;
67 }
68}
69
70char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_cls_bpf.sh b/samples/bpf/test_cls_bpf.sh
new file mode 100755
index 000000000..aaddd67b3
--- /dev/null
+++ b/samples/bpf/test_cls_bpf.sh
@@ -0,0 +1,38 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3
4function pktgen {
5 ../pktgen/pktgen_bench_xmit_mode_netif_receive.sh -i $IFC -s 64 \
6 -m 90:e2:ba:ff:ff:ff -d 192.168.0.1 -t 4
7 local dropped=`tc -s qdisc show dev $IFC | tail -3 | awk '/drop/{print $7}'`
8 if [ "$dropped" == "0," ]; then
9 echo "FAIL"
10 else
11 echo "Successfully filtered " $dropped " packets"
12 fi
13}
14
15function test {
16 echo -n "Loading bpf program '$2'... "
17 tc qdisc add dev $IFC clsact
18 tc filter add dev $IFC ingress bpf da obj $1 sec $2
19 local status=$?
20 if [ $status -ne 0 ]; then
21 echo "FAIL"
22 else
23 echo "ok"
24 pktgen
25 fi
26 tc qdisc del dev $IFC clsact
27}
28
29IFC=test_veth
30
31ip link add name $IFC type veth peer name pair_$IFC
32ip link set $IFC up
33ip link set pair_$IFC up
34
35test ./parse_simple.o simple
36test ./parse_varlen.o varlen
37test ./parse_ldabs.o ldabs
38ip link del dev $IFC
diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c
new file mode 100644
index 000000000..fbd43e2bb
--- /dev/null
+++ b/samples/bpf/test_current_task_under_cgroup_kern.c
@@ -0,0 +1,44 @@
1/* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me>
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7
8#include <linux/ptrace.h>
9#include <uapi/linux/bpf.h>
10#include <linux/version.h>
11#include <bpf/bpf_helpers.h>
12#include <uapi/linux/utsname.h>
13#include "trace_common.h"
14
15struct {
16 __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
17 __uint(key_size, sizeof(u32));
18 __uint(value_size, sizeof(u32));
19 __uint(max_entries, 1);
20} cgroup_map SEC(".maps");
21
22struct {
23 __uint(type, BPF_MAP_TYPE_ARRAY);
24 __type(key, u32);
25 __type(value, u64);
26 __uint(max_entries, 1);
27} perf_map SEC(".maps");
28
29/* Writes the last PID that called sync to a map at index 0 */
30SEC("kprobe/" SYSCALL(sys_sync))
31int bpf_prog1(struct pt_regs *ctx)
32{
33 u64 pid = bpf_get_current_pid_tgid();
34 int idx = 0;
35
36 if (!bpf_current_task_under_cgroup(&cgroup_map, 0))
37 return 0;
38
39 bpf_map_update_elem(&perf_map, &idx, &pid, BPF_ANY);
40 return 0;
41}
42
43char _license[] SEC("license") = "GPL";
44u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c
new file mode 100644
index 000000000..ac251a417
--- /dev/null
+++ b/samples/bpf/test_current_task_under_cgroup_user.c
@@ -0,0 +1,113 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me>
3 */
4
5#define _GNU_SOURCE
6#include <stdio.h>
7#include <unistd.h>
8#include <bpf/bpf.h>
9#include <bpf/libbpf.h>
10#include "cgroup_helpers.h"
11
12#define CGROUP_PATH "/my-cgroup"
13
14int main(int argc, char **argv)
15{
16 pid_t remote_pid, local_pid = getpid();
17 struct bpf_link *link = NULL;
18 struct bpf_program *prog;
19 int cg2, idx = 0, rc = 1;
20 struct bpf_object *obj;
21 char filename[256];
22 int map_fd[2];
23
24 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
25 obj = bpf_object__open_file(filename, NULL);
26 if (libbpf_get_error(obj)) {
27 fprintf(stderr, "ERROR: opening BPF object file failed\n");
28 return 0;
29 }
30
31 prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
32 if (!prog) {
33 printf("finding a prog in obj file failed\n");
34 goto cleanup;
35 }
36
37 /* load BPF program */
38 if (bpf_object__load(obj)) {
39 fprintf(stderr, "ERROR: loading BPF object file failed\n");
40 goto cleanup;
41 }
42
43 map_fd[0] = bpf_object__find_map_fd_by_name(obj, "cgroup_map");
44 map_fd[1] = bpf_object__find_map_fd_by_name(obj, "perf_map");
45 if (map_fd[0] < 0 || map_fd[1] < 0) {
46 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
47 goto cleanup;
48 }
49
50 link = bpf_program__attach(prog);
51 if (libbpf_get_error(link)) {
52 fprintf(stderr, "ERROR: bpf_program__attach failed\n");
53 link = NULL;
54 goto cleanup;
55 }
56
57 if (setup_cgroup_environment())
58 goto err;
59
60 cg2 = create_and_get_cgroup(CGROUP_PATH);
61
62 if (cg2 < 0)
63 goto err;
64
65 if (bpf_map_update_elem(map_fd[0], &idx, &cg2, BPF_ANY)) {
66 log_err("Adding target cgroup to map");
67 goto err;
68 }
69
70 if (join_cgroup(CGROUP_PATH))
71 goto err;
72
73 /*
74	 * The installed helper program caught the sync call and should
75	 * have written the calling PID to the map.
76 */
77
78 sync();
79 bpf_map_lookup_elem(map_fd[1], &idx, &remote_pid);
80
81 if (local_pid != remote_pid) {
82 fprintf(stderr,
83 "BPF Helper didn't write correct PID to map, but: %d\n",
84 remote_pid);
85 goto err;
86 }
87
88 /* Verify the negative scenario; leave the cgroup */
89 if (join_cgroup("/"))
90 goto err;
91
92 remote_pid = 0;
93 bpf_map_update_elem(map_fd[1], &idx, &remote_pid, BPF_ANY);
94
95 sync();
96 bpf_map_lookup_elem(map_fd[1], &idx, &remote_pid);
97
98 if (local_pid == remote_pid) {
99 fprintf(stderr, "BPF cgroup negative test did not work\n");
100 goto err;
101 }
102
103 rc = 0;
104
105err:
106 close(cg2);
107 cleanup_cgroup_environment();
108
109cleanup:
110 bpf_link__destroy(link);
111 bpf_object__close(obj);
112 return rc;
113}
diff --git a/samples/bpf/test_ipip.sh b/samples/bpf/test_ipip.sh
new file mode 100755
index 000000000..9e507c305
--- /dev/null
+++ b/samples/bpf/test_ipip.sh
@@ -0,0 +1,179 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3
4function config_device {
5 ip netns add at_ns0
6 ip netns add at_ns1
7 ip netns add at_ns2
8 ip link add veth0 type veth peer name veth0b
9 ip link add veth1 type veth peer name veth1b
10 ip link add veth2 type veth peer name veth2b
11 ip link set veth0b up
12 ip link set veth1b up
13 ip link set veth2b up
14 ip link set dev veth0b mtu 1500
15 ip link set dev veth1b mtu 1500
16 ip link set dev veth2b mtu 1500
17 ip link set veth0 netns at_ns0
18 ip link set veth1 netns at_ns1
19 ip link set veth2 netns at_ns2
20 ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
21 ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad
22 ip netns exec at_ns0 ip link set dev veth0 up
23 ip netns exec at_ns1 ip addr add 172.16.1.101/24 dev veth1
24 ip netns exec at_ns1 ip addr add 2401:db00::2/64 dev veth1 nodad
25 ip netns exec at_ns1 ip link set dev veth1 up
26 ip netns exec at_ns2 ip addr add 172.16.1.200/24 dev veth2
27 ip netns exec at_ns2 ip addr add 2401:db00::3/64 dev veth2 nodad
28 ip netns exec at_ns2 ip link set dev veth2 up
29 ip link add br0 type bridge
30 ip link set br0 up
31 ip link set dev br0 mtu 1500
32 ip link set veth0b master br0
33 ip link set veth1b master br0
34 ip link set veth2b master br0
35}
36
37function add_ipip_tunnel {
38 ip netns exec at_ns0 \
39 ip link add dev $DEV_NS type ipip local 172.16.1.100 remote 172.16.1.200
40 ip netns exec at_ns0 ip link set dev $DEV_NS up
41 ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
42 ip netns exec at_ns1 \
43 ip link add dev $DEV_NS type ipip local 172.16.1.101 remote 172.16.1.200
44 ip netns exec at_ns1 ip link set dev $DEV_NS up
45 # same inner IP address in at_ns0 and at_ns1
46 ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24
47
48 ip netns exec at_ns2 ip link add dev $DEV type ipip external
49 ip netns exec at_ns2 ip link set dev $DEV up
50 ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24
51}
52
53function add_ipip6_tunnel {
54 ip netns exec at_ns0 \
55 ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::1/64 remote 2401:db00::3/64
56 ip netns exec at_ns0 ip link set dev $DEV_NS up
57 ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
58 ip netns exec at_ns1 \
59 ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::2/64 remote 2401:db00::3/64
60 ip netns exec at_ns1 ip link set dev $DEV_NS up
61 # same inner IP address in at_ns0 and at_ns1
62 ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24
63
64 ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ipip6 external
65 ip netns exec at_ns2 ip link set dev $DEV up
66 ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24
67}
68
69function add_ip6ip6_tunnel {
70 ip netns exec at_ns0 \
71 ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::1/64 remote 2401:db00::3/64
72 ip netns exec at_ns0 ip link set dev $DEV_NS up
73 ip netns exec at_ns0 ip addr add dev $DEV_NS 2601:646::1/64
74 ip netns exec at_ns1 \
75 ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::2/64 remote 2401:db00::3/64
76 ip netns exec at_ns1 ip link set dev $DEV_NS up
77 # same inner IP address in at_ns0 and at_ns1
78 ip netns exec at_ns1 ip addr add dev $DEV_NS 2601:646::1/64
79
80 ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ip6ip6 external
81 ip netns exec at_ns2 ip link set dev $DEV up
82 ip netns exec at_ns2 ip addr add dev $DEV 2601:646::2/64
83}
84
85function attach_bpf {
86 DEV=$1
87 SET_TUNNEL=$2
88 GET_TUNNEL=$3
89 ip netns exec at_ns2 tc qdisc add dev $DEV clsact
90 ip netns exec at_ns2 tc filter add dev $DEV egress bpf da obj tcbpf2_kern.o sec $SET_TUNNEL
91 ip netns exec at_ns2 tc filter add dev $DEV ingress bpf da obj tcbpf2_kern.o sec $GET_TUNNEL
92}
93
94function test_ipip {
95 DEV_NS=ipip_std
96 DEV=ipip_bpf
97 config_device
98# tcpdump -nei br0 &
99 cat /sys/kernel/debug/tracing/trace_pipe &
100
101 add_ipip_tunnel
102 attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel
103
104 ip netns exec at_ns0 ping -c 1 10.1.1.200
105 ip netns exec at_ns2 ping -c 1 10.1.1.100
106 ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null
107 ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null
108 sleep 0.2
109 # tcp check _same_ IP over different tunnels
110 ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200
111 ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201
112 cleanup
113}
114
115# IPv4 over IPv6 tunnel
116function test_ipip6 {
117 DEV_NS=ipip_std
118 DEV=ipip_bpf
119 config_device
120# tcpdump -nei br0 &
121 cat /sys/kernel/debug/tracing/trace_pipe &
122
123 add_ipip6_tunnel
124 attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel
125
126 ip netns exec at_ns0 ping -c 1 10.1.1.200
127 ip netns exec at_ns2 ping -c 1 10.1.1.100
128 ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null
129 ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null
130 sleep 0.2
131 # tcp check _same_ IP over different tunnels
132 ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200
133 ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201
134 cleanup
135}
136
137# IPv6 over IPv6 tunnel
138function test_ip6ip6 {
139 DEV_NS=ipip_std
140 DEV=ipip_bpf
141 config_device
142# tcpdump -nei br0 &
143 cat /sys/kernel/debug/tracing/trace_pipe &
144
145 add_ip6ip6_tunnel
146 attach_bpf $DEV ip6ip6_set_tunnel ip6ip6_get_tunnel
147
148 ip netns exec at_ns0 ping -6 -c 1 2601:646::2
149 ip netns exec at_ns2 ping -6 -c 1 2601:646::1
150 ip netns exec at_ns0 iperf -6sD -p 5200 > /dev/null
151 ip netns exec at_ns1 iperf -6sD -p 5201 > /dev/null
152 sleep 0.2
153 # tcp check _same_ IP over different tunnels
154 ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5200
155 ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5201
156 cleanup
157}
158
159function cleanup {
160 set +ex
161 pkill iperf
162 ip netns delete at_ns0
163 ip netns delete at_ns1
164 ip netns delete at_ns2
165 ip link del veth0
166 ip link del veth1
167 ip link del veth2
168 ip link del br0
169 pkill tcpdump
170 pkill cat
171 set -ex
172}
173
174cleanup
175echo "Testing IP tunnels..."
176test_ipip
177test_ipip6
178test_ip6ip6
179echo "*** PASS ***"
diff --git a/samples/bpf/test_lru_dist.c b/samples/bpf/test_lru_dist.c
new file mode 100644
index 000000000..b313dba41
--- /dev/null
+++ b/samples/bpf/test_lru_dist.c
@@ -0,0 +1,540 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (c) 2016 Facebook
4 */
5#define _GNU_SOURCE
6#include <linux/types.h>
7#include <stdio.h>
8#include <unistd.h>
9#include <linux/bpf.h>
10#include <errno.h>
11#include <string.h>
12#include <assert.h>
13#include <sched.h>
14#include <sys/wait.h>
15#include <sys/stat.h>
16#include <sys/resource.h>
17#include <fcntl.h>
18#include <stdlib.h>
19#include <time.h>
20
21#include <bpf/bpf.h>
22#include "bpf_util.h"
23
24#define min(a, b) ((a) < (b) ? (a) : (b))
25#ifndef offsetof
26# define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER)
27#endif
28#define container_of(ptr, type, member) ({ \
29 const typeof( ((type *)0)->member ) *__mptr = (ptr); \
30 (type *)( (char *)__mptr - offsetof(type,member) );})
31
32static int nr_cpus;
33static unsigned long long *dist_keys;
34static unsigned int dist_key_counts;
35
36struct list_head {
37 struct list_head *next, *prev;
38};
39
40static inline void INIT_LIST_HEAD(struct list_head *list)
41{
42 list->next = list;
43 list->prev = list;
44}
45
46static inline int list_empty(const struct list_head *head)
47{
48 return head->next == head;
49}
50
51static inline void __list_add(struct list_head *new,
52 struct list_head *prev,
53 struct list_head *next)
54{
55 next->prev = new;
56 new->next = next;
57 new->prev = prev;
58 prev->next = new;
59}
60
61static inline void list_add(struct list_head *new, struct list_head *head)
62{
63 __list_add(new, head, head->next);
64}
65
66static inline void __list_del(struct list_head *prev, struct list_head *next)
67{
68 next->prev = prev;
69 prev->next = next;
70}
71
72static inline void __list_del_entry(struct list_head *entry)
73{
74 __list_del(entry->prev, entry->next);
75}
76
77static inline void list_move(struct list_head *list, struct list_head *head)
78{
79 __list_del_entry(list);
80 list_add(list, head);
81}
82
83#define list_entry(ptr, type, member) \
84 container_of(ptr, type, member)
85
86#define list_last_entry(ptr, type, member) \
87 list_entry((ptr)->prev, type, member)
88
89struct pfect_lru_node {
90 struct list_head list;
91 unsigned long long key;
92};
93
94struct pfect_lru {
95 struct list_head list;
96 struct pfect_lru_node *free_nodes;
97 unsigned int cur_size;
98 unsigned int lru_size;
99 unsigned int nr_unique;
100 unsigned int nr_misses;
101 unsigned int total;
102 int map_fd;
103};
104
105static void pfect_lru_init(struct pfect_lru *lru, unsigned int lru_size,
106 unsigned int nr_possible_elems)
107{
108 lru->map_fd = bpf_create_map(BPF_MAP_TYPE_HASH,
109 sizeof(unsigned long long),
110 sizeof(struct pfect_lru_node *),
111 nr_possible_elems, 0);
112 assert(lru->map_fd != -1);
113
114 lru->free_nodes = malloc(lru_size * sizeof(struct pfect_lru_node));
115 assert(lru->free_nodes);
116
117 INIT_LIST_HEAD(&lru->list);
118 lru->cur_size = 0;
119 lru->lru_size = lru_size;
120 lru->nr_unique = lru->nr_misses = lru->total = 0;
121}
122
123static void pfect_lru_destroy(struct pfect_lru *lru)
124{
125 close(lru->map_fd);
126 free(lru->free_nodes);
127}
128
129static int pfect_lru_lookup_or_insert(struct pfect_lru *lru,
130 unsigned long long key)
131{
132 struct pfect_lru_node *node = NULL;
133 int seen = 0;
134
135 lru->total++;
136 if (!bpf_map_lookup_elem(lru->map_fd, &key, &node)) {
137 if (node) {
138 list_move(&node->list, &lru->list);
139 return 1;
140 }
141 seen = 1;
142 }
143
144 if (lru->cur_size < lru->lru_size) {
145 node = &lru->free_nodes[lru->cur_size++];
146 INIT_LIST_HEAD(&node->list);
147 } else {
148 struct pfect_lru_node *null_node = NULL;
149
150 node = list_last_entry(&lru->list,
151 struct pfect_lru_node,
152 list);
153 bpf_map_update_elem(lru->map_fd, &node->key, &null_node, BPF_EXIST);
154 }
155
156 node->key = key;
157 list_move(&node->list, &lru->list);
158
159 lru->nr_misses++;
160 if (seen) {
161 assert(!bpf_map_update_elem(lru->map_fd, &key, &node, BPF_EXIST));
162 } else {
163 lru->nr_unique++;
164 assert(!bpf_map_update_elem(lru->map_fd, &key, &node, BPF_NOEXIST));
165 }
166
167 return seen;
168}
169
170static unsigned int read_keys(const char *dist_file,
171 unsigned long long **keys)
172{
173 struct stat fst;
174 unsigned long long *retkeys;
175 unsigned int counts = 0;
176 int dist_fd;
177 char *b, *l;
178 int i;
179
180 dist_fd = open(dist_file, 0);
181 assert(dist_fd != -1);
182
183 assert(fstat(dist_fd, &fst) == 0);
184 b = malloc(fst.st_size);
185 assert(b);
186
187 assert(read(dist_fd, b, fst.st_size) == fst.st_size);
188 close(dist_fd);
189 for (i = 0; i < fst.st_size; i++) {
190 if (b[i] == '\n')
191 counts++;
192 }
193 counts++; /* in case the last line has no \n */
194
195 retkeys = malloc(counts * sizeof(unsigned long long));
196 assert(retkeys);
197
198 counts = 0;
199 for (l = strtok(b, "\n"); l; l = strtok(NULL, "\n"))
200 retkeys[counts++] = strtoull(l, NULL, 10);
201 free(b);
202
203 *keys = retkeys;
204
205 return counts;
206}
207
208static int create_map(int map_type, int map_flags, unsigned int size)
209{
210 int map_fd;
211
212 map_fd = bpf_create_map(map_type, sizeof(unsigned long long),
213 sizeof(unsigned long long), size, map_flags);
214
215 if (map_fd == -1)
216 perror("bpf_create_map");
217
218 return map_fd;
219}
220
221static int sched_next_online(int pid, int next_to_try)
222{
223 cpu_set_t cpuset;
224
225 if (next_to_try == nr_cpus)
226 return -1;
227
228 while (next_to_try < nr_cpus) {
229 CPU_ZERO(&cpuset);
230 CPU_SET(next_to_try++, &cpuset);
231 if (!sched_setaffinity(pid, sizeof(cpuset), &cpuset))
232 break;
233 }
234
235 return next_to_try;
236}
237
238static void run_parallel(unsigned int tasks, void (*fn)(int i, void *data),
239 void *data)
240{
241 int next_sched_cpu = 0;
242 pid_t pid[tasks];
243 int i;
244
245 for (i = 0; i < tasks; i++) {
246 pid[i] = fork();
247 if (pid[i] == 0) {
248 next_sched_cpu = sched_next_online(0, next_sched_cpu);
249 fn(i, data);
250 exit(0);
251 } else if (pid[i] == -1) {
252 printf("couldn't spawn #%d process\n", i);
253 exit(1);
254 }
255 /* This is mostly redundant; it just allows the parent
256  * process to update next_sched_cpu for the next child
257  * process.
258  */
259 next_sched_cpu = sched_next_online(pid[i], next_sched_cpu);
260 }
261 for (i = 0; i < tasks; i++) {
262 int status;
263
264 assert(waitpid(pid[i], &status, 0) == pid[i]);
265 assert(status == 0);
266 }
267}
268
269static void do_test_lru_dist(int task, void *data)
270{
271 unsigned int nr_misses = 0;
272 struct pfect_lru pfect_lru;
273 unsigned long long key, value = 1234;
274 unsigned int i;
275
276 unsigned int lru_map_fd = ((unsigned int *)data)[0];
277 unsigned int lru_size = ((unsigned int *)data)[1];
278 unsigned long long key_offset = task * dist_key_counts;
279
280 pfect_lru_init(&pfect_lru, lru_size, dist_key_counts);
281
282 for (i = 0; i < dist_key_counts; i++) {
283 key = dist_keys[i] + key_offset;
284
285 pfect_lru_lookup_or_insert(&pfect_lru, key);
286
287 if (!bpf_map_lookup_elem(lru_map_fd, &key, &value))
288 continue;
289
290 if (bpf_map_update_elem(lru_map_fd, &key, &value, BPF_NOEXIST)) {
291 printf("bpf_map_update_elem(lru_map_fd, %llu): errno:%d\n",
292 key, errno);
293 assert(0);
294 }
295
296 nr_misses++;
297 }
298
299 printf(" task:%d BPF LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n",
300 task, pfect_lru.nr_unique, dist_key_counts, nr_misses,
301 dist_key_counts);
302 printf(" task:%d Perfect LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n",
303 task, pfect_lru.nr_unique, pfect_lru.total,
304 pfect_lru.nr_misses, pfect_lru.total);
305
306 pfect_lru_destroy(&pfect_lru);
307 close(lru_map_fd);
308}
309
310static void test_parallel_lru_dist(int map_type, int map_flags,
311 int nr_tasks, unsigned int lru_size)
312{
313 int child_data[2];
314 int lru_map_fd;
315
316 printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type,
317 map_flags);
318
319 if (map_flags & BPF_F_NO_COMMON_LRU)
320 lru_map_fd = create_map(map_type, map_flags,
321 nr_cpus * lru_size);
322 else
323 lru_map_fd = create_map(map_type, map_flags,
324 nr_tasks * lru_size);
325 assert(lru_map_fd != -1);
326
327 child_data[0] = lru_map_fd;
328 child_data[1] = lru_size;
329
330 run_parallel(nr_tasks, do_test_lru_dist, child_data);
331
332 close(lru_map_fd);
333}
334
335static void test_lru_loss0(int map_type, int map_flags)
336{
337 unsigned long long key, value[nr_cpus];
338 unsigned int old_unused_losses = 0;
339 unsigned int new_unused_losses = 0;
340 unsigned int used_losses = 0;
341 int map_fd;
342
343 printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
344 map_flags);
345
346 assert(sched_next_online(0, 0) != -1);
347
348 if (map_flags & BPF_F_NO_COMMON_LRU)
349 map_fd = create_map(map_type, map_flags, 900 * nr_cpus);
350 else
351 map_fd = create_map(map_type, map_flags, 900);
352
353 assert(map_fd != -1);
354
355 value[0] = 1234;
356
357 for (key = 1; key <= 1000; key++) {
358 int start_key, end_key;
359
360 assert(bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST) == 0);
361
362 start_key = 101;
363 end_key = min(key, 900);
364
365 while (start_key <= end_key) {
366 bpf_map_lookup_elem(map_fd, &start_key, value);
367 start_key++;
368 }
369 }
370
371 for (key = 1; key <= 1000; key++) {
372 if (bpf_map_lookup_elem(map_fd, &key, value)) {
373 if (key <= 100)
374 old_unused_losses++;
375 else if (key <= 900)
376 used_losses++;
377 else
378 new_unused_losses++;
379 }
380 }
381
382 close(map_fd);
383
384 printf("older-elem-losses:%d(/100) active-elem-losses:%d(/800) "
385 "newer-elem-losses:%d(/100)\n",
386 old_unused_losses, used_losses, new_unused_losses);
387}
388
389static void test_lru_loss1(int map_type, int map_flags)
390{
391 unsigned long long key, value[nr_cpus];
392 int map_fd;
393 unsigned int nr_losses = 0;
394
395 printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
396 map_flags);
397
398 assert(sched_next_online(0, 0) != -1);
399
400 if (map_flags & BPF_F_NO_COMMON_LRU)
401 map_fd = create_map(map_type, map_flags, 1000 * nr_cpus);
402 else
403 map_fd = create_map(map_type, map_flags, 1000);
404
405 assert(map_fd != -1);
406
407 value[0] = 1234;
408
409 for (key = 1; key <= 1000; key++)
410 assert(!bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST));
411
412 for (key = 1; key <= 1000; key++) {
413 if (bpf_map_lookup_elem(map_fd, &key, value))
414 nr_losses++;
415 }
416
417 close(map_fd);
418
419 printf("nr_losses:%d(/1000)\n", nr_losses);
420}
421
422static void do_test_parallel_lru_loss(int task, void *data)
423{
424 const unsigned int nr_stable_elems = 1000;
425 const unsigned int nr_repeats = 100000;
426
427 int map_fd = *(int *)data;
428 unsigned long long stable_base;
429 unsigned long long key, value[nr_cpus];
430 unsigned long long next_ins_key;
431 unsigned int nr_losses = 0;
432 unsigned int i;
433
434 stable_base = task * nr_repeats * 2 + 1;
435 next_ins_key = stable_base;
436 value[0] = 1234;
437 for (i = 0; i < nr_stable_elems; i++) {
438 assert(bpf_map_update_elem(map_fd, &next_ins_key, value,
439 BPF_NOEXIST) == 0);
440 next_ins_key++;
441 }
442
443 for (i = 0; i < nr_repeats; i++) {
444 int rn;
445
446 rn = rand();
447
448 if (rn % 10) {
449 key = rn % nr_stable_elems + stable_base;
450 bpf_map_lookup_elem(map_fd, &key, value);
451 } else {
452 bpf_map_update_elem(map_fd, &next_ins_key, value,
453 BPF_NOEXIST);
454 next_ins_key++;
455 }
456 }
457
458 key = stable_base;
459 for (i = 0; i < nr_stable_elems; i++) {
460 if (bpf_map_lookup_elem(map_fd, &key, value))
461 nr_losses++;
462 key++;
463 }
464
465 printf(" task:%d nr_losses:%u\n", task, nr_losses);
466}
467
468static void test_parallel_lru_loss(int map_type, int map_flags, int nr_tasks)
469{
470 int map_fd;
471
472 printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type,
473 map_flags);
474
475 /* Give 20% more than the active working set */
476 if (map_flags & BPF_F_NO_COMMON_LRU)
477 map_fd = create_map(map_type, map_flags,
478 nr_cpus * (1000 + 200));
479 else
480 map_fd = create_map(map_type, map_flags,
481 nr_tasks * (1000 + 200));
482
483 assert(map_fd != -1);
484
485 run_parallel(nr_tasks, do_test_parallel_lru_loss, &map_fd);
486
487 close(map_fd);
488}
489
490int main(int argc, char **argv)
491{
492 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
493 int map_flags[] = {0, BPF_F_NO_COMMON_LRU};
494 const char *dist_file;
495 int nr_tasks = 1;
496 int lru_size;
497 int f;
498
499 if (argc < 4) {
500 printf("Usage: %s <dist-file> <lru-size> <nr-tasks>\n",
501 argv[0]);
502 return -1;
503 }
504
505 dist_file = argv[1];
506 lru_size = atoi(argv[2]);
507 nr_tasks = atoi(argv[3]);
508
509 setbuf(stdout, NULL);
510
511 assert(!setrlimit(RLIMIT_MEMLOCK, &r));
512
513 srand(time(NULL));
514
515 nr_cpus = bpf_num_possible_cpus();
516 assert(nr_cpus != -1);
517 printf("nr_cpus:%d\n\n", nr_cpus);
518
519 nr_tasks = min(nr_tasks, nr_cpus);
520
521 dist_key_counts = read_keys(dist_file, &dist_keys);
522 if (!dist_key_counts) {
523 printf("%s has no key\n", dist_file);
524 return -1;
525 }
526
527 for (f = 0; f < sizeof(map_flags) / sizeof(*map_flags); f++) {
528 test_lru_loss0(BPF_MAP_TYPE_LRU_HASH, map_flags[f]);
529 test_lru_loss1(BPF_MAP_TYPE_LRU_HASH, map_flags[f]);
530 test_parallel_lru_loss(BPF_MAP_TYPE_LRU_HASH, map_flags[f],
531 nr_tasks);
532 test_parallel_lru_dist(BPF_MAP_TYPE_LRU_HASH, map_flags[f],
533 nr_tasks, lru_size);
534 printf("\n");
535 }
536
537 free(dist_keys);
538
539 return 0;
540}
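A note on input: read_keys() above expects the <dist-file> argument to contain one decimal key per line. A minimal generator sketch, for illustration only (gen_dist.c is a hypothetical helper, not part of the sample), that emits a hot-set-biased stream in that format:

/* gen_dist.c: hypothetical helper emitting one decimal key per line,
 * the format read_keys() in test_lru_dist.c parses. Roughly 90% of
 * accesses hit a small hot set; the rest are cold keys.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(int argc, char **argv)
{
	long nr = argc > 1 ? atol(argv[1]) : 100000;
	int hot = argc > 2 ? atoi(argv[2]) : 100;
	long i;

	srand(time(NULL));
	for (i = 0; i < nr; i++) {
		if (rand() % 10)
			printf("%d\n", rand() % hot);		/* hot key */
		else
			printf("%d\n", hot + rand() % 1000000);	/* cold key */
	}
	return 0;
}

Typical use would be "./gen_dist 100000 100 > dist.txt" followed by "./test_lru_dist dist.txt 300 4" to compare the BPF LRU against the perfect-LRU baseline with a 300-entry cache across 4 tasks.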
diff --git a/samples/bpf/test_lwt_bpf.c b/samples/bpf/test_lwt_bpf.c
new file mode 100644
index 000000000..1b568575a
--- /dev/null
+++ b/samples/bpf/test_lwt_bpf.c
@@ -0,0 +1,253 @@
1/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12
13#include <stdint.h>
14#include <stddef.h>
15#include <linux/bpf.h>
16#include <linux/ip.h>
17#include <linux/in.h>
18#include <linux/in6.h>
19#include <linux/tcp.h>
20#include <linux/udp.h>
21#include <linux/icmpv6.h>
22#include <linux/if_ether.h>
23#include <bpf/bpf_helpers.h>
24#include <string.h>
25
26# define printk(fmt, ...) \
27 ({ \
28 char ____fmt[] = fmt; \
29 bpf_trace_printk(____fmt, sizeof(____fmt), \
30 ##__VA_ARGS__); \
31 })
32
33#define CB_MAGIC 1234
34
35/* Test: Pass all packets through */
36SEC("nop")
37int do_nop(struct __sk_buff *skb)
38{
39 return BPF_OK;
40}
41
42/* Test: Verify context information can be accessed */
43SEC("test_ctx")
44int do_test_ctx(struct __sk_buff *skb)
45{
46 skb->cb[0] = CB_MAGIC;
47 printk("len %d hash %d protocol %d\n", skb->len, skb->hash,
48 skb->protocol);
49 printk("cb %d ingress_ifindex %d ifindex %d\n", skb->cb[0],
50 skb->ingress_ifindex, skb->ifindex);
51
52 return BPF_OK;
53}
54
55/* Test: Ensure skb->cb[] buffer is cleared */
56SEC("test_cb")
57int do_test_cb(struct __sk_buff *skb)
58{
59 printk("cb0: %x cb1: %x cb2: %x\n", skb->cb[0], skb->cb[1],
60 skb->cb[2]);
61 printk("cb3: %x cb4: %x\n", skb->cb[3], skb->cb[4]);
62
63 return BPF_OK;
64}
65
66/* Test: Verify skb data can be read */
67SEC("test_data")
68int do_test_data(struct __sk_buff *skb)
69{
70 void *data = (void *)(long)skb->data;
71 void *data_end = (void *)(long)skb->data_end;
72 struct iphdr *iph = data;
73
74 if (data + sizeof(*iph) > data_end) {
75 printk("packet truncated\n");
76 return BPF_DROP;
77 }
78
79 printk("src: %x dst: %x\n", iph->saddr, iph->daddr);
80
81 return BPF_OK;
82}
83
84#define IP_CSUM_OFF offsetof(struct iphdr, check)
85#define IP_DST_OFF offsetof(struct iphdr, daddr)
86#define IP_SRC_OFF offsetof(struct iphdr, saddr)
87#define IP_PROTO_OFF offsetof(struct iphdr, protocol)
88#define TCP_CSUM_OFF offsetof(struct tcphdr, check)
89#define UDP_CSUM_OFF offsetof(struct udphdr, check)
90#define IS_PSEUDO 0x10
91
92static inline int rewrite(struct __sk_buff *skb, uint32_t old_ip,
93 uint32_t new_ip, int rw_daddr)
94{
95 int ret, off = 0, flags = IS_PSEUDO;
96 uint8_t proto;
97
98 ret = bpf_skb_load_bytes(skb, IP_PROTO_OFF, &proto, 1);
99 if (ret < 0) {
100 printk("bpf_l4_csum_replace failed: %d\n", ret);
101 return BPF_DROP;
102 }
103
104 switch (proto) {
105 case IPPROTO_TCP:
106 off = TCP_CSUM_OFF;
107 break;
108
109 case IPPROTO_UDP:
110 off = UDP_CSUM_OFF;
111 flags |= BPF_F_MARK_MANGLED_0;
112 break;
113
114 case IPPROTO_ICMPV6:
115 off = offsetof(struct icmp6hdr, icmp6_cksum);
116 break;
117 }
118
119 if (off) {
120 ret = bpf_l4_csum_replace(skb, off, old_ip, new_ip,
121 flags | sizeof(new_ip));
122 if (ret < 0) {
123 printk("bpf_l4_csum_replace failed: %d\n");
124 return BPF_DROP;
125 }
126 }
127
128 ret = bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip));
129 if (ret < 0) {
130 printk("bpf_l3_csum_replace failed: %d\n", ret);
131 return BPF_DROP;
132 }
133
134 if (rw_daddr)
135 ret = bpf_skb_store_bytes(skb, IP_DST_OFF, &new_ip, sizeof(new_ip), 0);
136 else
137 ret = bpf_skb_store_bytes(skb, IP_SRC_OFF, &new_ip, sizeof(new_ip), 0);
138
139 if (ret < 0) {
140 printk("bpf_skb_store_bytes() failed: %d\n", ret);
141 return BPF_DROP;
142 }
143
144 return BPF_OK;
145}
146
147/* Test: Verify skb data can be modified */
148SEC("test_rewrite")
149int do_test_rewrite(struct __sk_buff *skb)
150{
151 uint32_t old_ip, new_ip = 0x3fea8c0;
152 int ret;
153
154 ret = bpf_skb_load_bytes(skb, IP_DST_OFF, &old_ip, 4);
155 if (ret < 0) {
156 printk("bpf_skb_load_bytes failed: %d\n", ret);
157 return BPF_DROP;
158 }
159
160 if (old_ip == 0x2fea8c0) {
161 printk("out: rewriting from %x to %x\n", old_ip, new_ip);
162 return rewrite(skb, old_ip, new_ip, 1);
163 }
164
165 return BPF_OK;
166}
167
168static inline int __do_push_ll_and_redirect(struct __sk_buff *skb)
169{
170 uint64_t smac = SRC_MAC, dmac = DST_MAC;
171 int ret, ifindex = DST_IFINDEX;
172 struct ethhdr ehdr;
173
174 ret = bpf_skb_change_head(skb, 14, 0);
175 if (ret < 0) {
176 printk("skb_change_head() failed: %d\n", ret);
177 }
178
179 ehdr.h_proto = __constant_htons(ETH_P_IP);
180 memcpy(&ehdr.h_source, &smac, 6);
181 memcpy(&ehdr.h_dest, &dmac, 6);
182
183 ret = bpf_skb_store_bytes(skb, 0, &ehdr, sizeof(ehdr), 0);
184 if (ret < 0) {
185 printk("skb_store_bytes() failed: %d\n", ret);
186 return BPF_DROP;
187 }
188
189 return bpf_redirect(ifindex, 0);
190}
191
192SEC("push_ll_and_redirect_silent")
193int do_push_ll_and_redirect_silent(struct __sk_buff *skb)
194{
195 return __do_push_ll_and_redirect(skb);
196}
197
198SEC("push_ll_and_redirect")
199int do_push_ll_and_redirect(struct __sk_buff *skb)
200{
201 int ret, ifindex = DST_IFINDEX;
202
203 ret = __do_push_ll_and_redirect(skb);
204 if (ret >= 0)
205 printk("redirected to %d\n", ifindex);
206
207 return ret;
208}
209
210static inline void __fill_garbage(struct __sk_buff *skb)
211{
212 uint64_t f = 0xFFFFFFFFFFFFFFFF;
213
214 bpf_skb_store_bytes(skb, 0, &f, sizeof(f), 0);
215 bpf_skb_store_bytes(skb, 8, &f, sizeof(f), 0);
216 bpf_skb_store_bytes(skb, 16, &f, sizeof(f), 0);
217 bpf_skb_store_bytes(skb, 24, &f, sizeof(f), 0);
218 bpf_skb_store_bytes(skb, 32, &f, sizeof(f), 0);
219 bpf_skb_store_bytes(skb, 40, &f, sizeof(f), 0);
220 bpf_skb_store_bytes(skb, 48, &f, sizeof(f), 0);
221 bpf_skb_store_bytes(skb, 56, &f, sizeof(f), 0);
222 bpf_skb_store_bytes(skb, 64, &f, sizeof(f), 0);
223 bpf_skb_store_bytes(skb, 72, &f, sizeof(f), 0);
224 bpf_skb_store_bytes(skb, 80, &f, sizeof(f), 0);
225 bpf_skb_store_bytes(skb, 88, &f, sizeof(f), 0);
226}
227
228SEC("fill_garbage")
229int do_fill_garbage(struct __sk_buff *skb)
230{
231 __fill_garbage(skb);
232 printk("Set initial 96 bytes of header to FF\n");
233 return BPF_OK;
234}
235
236SEC("fill_garbage_and_redirect")
237int do_fill_garbage_and_redirect(struct __sk_buff *skb)
238{
239 int ifindex = DST_IFINDEX;
240 __fill_garbage(skb);
241 printk("redirected to %d\n", ifindex);
242 return bpf_redirect(ifindex, 0);
243}
244
245/* Drop all packets */
246SEC("drop_all")
247int do_drop_all(struct __sk_buff *skb)
248{
249 printk("dropping with: %d\n", BPF_DROP);
250 return BPF_DROP;
251}
252
253char _license[] SEC("license") = "GPL";
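The two csum-replace helpers called from rewrite() update the existing checksum incrementally rather than recomputing it over the packet. A user-space sketch of the underlying RFC 1624 arithmetic for a 4-byte field swap (illustration only; the in-kernel helpers additionally handle the pseudo-header flag and the UDP mangled-zero case selected above):

/* Incremental Internet checksum update (RFC 1624): HC' = ~(~HC + ~m + m'),
 * mirroring what bpf_l3_csum_replace() does for a 4-byte field swap.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t csum_fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);	/* end-around carry */
	return (uint16_t)sum;
}

static uint16_t csum_replace4(uint16_t old_csum, uint32_t from, uint32_t to)
{
	uint32_t sum = (uint16_t)~old_csum;

	/* subtract the old field, add the new one, 16 bits at a time */
	sum += (uint16_t)~(from >> 16) + (uint16_t)~(from & 0xffff);
	sum += (to >> 16) + (to & 0xffff);
	return (uint16_t)~csum_fold(sum);
}

int main(void)
{
	/* e.g. swapping 192.168.254.2 for 192.168.254.3 as test_rewrite
	 * does; host-order hex is used here purely for illustration */
	printf("new csum: 0x%04x\n",
	       csum_replace4(0x1234, 0xc0a8fe02, 0xc0a8fe03));
	return 0;
}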
diff --git a/samples/bpf/test_lwt_bpf.sh b/samples/bpf/test_lwt_bpf.sh
new file mode 100755
index 000000000..65a976058
--- /dev/null
+++ b/samples/bpf/test_lwt_bpf.sh
@@ -0,0 +1,400 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3
4# Uncomment to see generated bytecode
5#VERBOSE=verbose
6
7NS1=lwt_ns1
8NS2=lwt_ns2
9VETH0=tst_lwt1a
10VETH1=tst_lwt1b
11VETH2=tst_lwt2a
12VETH3=tst_lwt2b
13IPVETH0="192.168.254.1"
14IPVETH1="192.168.254.2"
15IPVETH1b="192.168.254.3"
16
17IPVETH2="192.168.111.1"
18IPVETH3="192.168.111.2"
19
20IP_LOCAL="192.168.99.1"
21
22TRACE_ROOT=/sys/kernel/debug/tracing
23
24function lookup_mac()
25{
26 set +x
27 if [ ! -z "$2" ]; then
28 MAC=$(ip netns exec $2 ip link show $1 | grep ether | awk '{print $2}')
29 else
30 MAC=$(ip link show $1 | grep ether | awk '{print $2}')
31 fi
32 MAC="${MAC//:/}"
33 echo "0x${MAC:10:2}${MAC:8:2}${MAC:6:2}${MAC:4:2}${MAC:2:2}${MAC:0:2}"
34 set -x
35}
36
37function cleanup {
38 set +ex
39 rm test_lwt_bpf.o 2> /dev/null
40 ip link del $VETH0 2> /dev/null
41 ip link del $VETH1 2> /dev/null
42 ip link del $VETH2 2> /dev/null
43 ip link del $VETH3 2> /dev/null
44 ip netns exec $NS1 killall netserver
45 ip netns delete $NS1 2> /dev/null
46 ip netns delete $NS2 2> /dev/null
47 set -ex
48}
49
50function setup_one_veth {
51 ip netns add $1
52 ip link add $2 type veth peer name $3
53 ip link set dev $2 up
54 ip addr add $4/24 dev $2
55 ip link set $3 netns $1
56 ip netns exec $1 ip link set dev $3 up
57 ip netns exec $1 ip addr add $5/24 dev $3
58
59 if [ "$6" ]; then
60 ip netns exec $1 ip addr add $6/32 dev $3
61 fi
62}
63
64function get_trace {
65 set +x
66 cat ${TRACE_ROOT}/trace | grep -v '^#'
67 set -x
68}
69
70function cleanup_routes {
71 ip route del ${IPVETH1}/32 dev $VETH0 2> /dev/null || true
72 ip route del table local local ${IP_LOCAL}/32 dev lo 2> /dev/null || true
73}
74
75function install_test {
76 cleanup_routes
77 cp /dev/null ${TRACE_ROOT}/trace
78
79 OPTS="encap bpf headroom 14 $1 obj test_lwt_bpf.o section $2 $VERBOSE"
80
81 if [ "$1" == "in" ]; then
82 ip route add table local local ${IP_LOCAL}/32 $OPTS dev lo
83 else
84 ip route add ${IPVETH1}/32 $OPTS dev $VETH0
85 fi
86}
87
88function remove_prog {
89 if [ "$1" == "in" ]; then
90 ip route del table local local ${IP_LOCAL}/32 dev lo
91 else
92 ip route del ${IPVETH1}/32 dev $VETH0
93 fi
94}
95
96function filter_trace {
97 # Add newline to allow starting EXPECT= variables on newline
98 NL=$'\n'
99 echo "${NL}$*" | sed -e 's/^.*: : //g'
100}
101
102function expect_fail {
103 set +x
104 echo "FAIL:"
105 echo "Expected: $1"
106 echo "Got: $2"
107 set -x
108 exit 1
109}
110
111function match_trace {
112 set +x
113 RET=0
114 TRACE=$1
115 EXPECT=$2
116 GOT="$(filter_trace "$TRACE")"
117
118 [ "$GOT" != "$EXPECT" ] && {
119 expect_fail "$EXPECT" "$GOT"
120 RET=1
121 }
122 set -x
123 return $RET
124}
125
126function test_start {
127 set +x
128 echo "----------------------------------------------------------------"
129 echo "Starting test: $*"
130 echo "----------------------------------------------------------------"
131 set -x
132}
133
134function failure {
135 get_trace
136 echo "FAIL: $*"
137 exit 1
138}
139
140function test_ctx_xmit {
141 test_start "test_ctx on lwt xmit"
142 install_test xmit test_ctx
143 ping -c 3 $IPVETH1 || {
144 failure "test_ctx xmit: packets are dropped"
145 }
146 match_trace "$(get_trace)" "
147len 84 hash 0 protocol 8
148cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX
149len 84 hash 0 protocol 8
150cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX
151len 84 hash 0 protocol 8
152cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX" || exit 1
153 remove_prog xmit
154}
155
156function test_ctx_out {
157 test_start "test_ctx on lwt out"
158 install_test out test_ctx
159 ping -c 3 $IPVETH1 || {
160 failure "test_ctx out: packets are dropped"
161 }
162 match_trace "$(get_trace)" "
163len 84 hash 0 protocol 0
164cb 1234 ingress_ifindex 0 ifindex 0
165len 84 hash 0 protocol 0
166cb 1234 ingress_ifindex 0 ifindex 0
167len 84 hash 0 protocol 0
168cb 1234 ingress_ifindex 0 ifindex 0" || exit 1
169 remove_prog out
170}
171
172function test_ctx_in {
173 test_start "test_ctx on lwt in"
174 install_test in test_ctx
175 ping -c 3 $IP_LOCAL || {
176 failure "test_ctx out: packets are dropped"
177 }
178 # We will see both request & reply packets as the packets will
179 # be from $IP_LOCAL => $IP_LOCAL
180 match_trace "$(get_trace)" "
181len 84 hash 0 protocol 8
182cb 1234 ingress_ifindex 1 ifindex 1
183len 84 hash 0 protocol 8
184cb 1234 ingress_ifindex 1 ifindex 1
185len 84 hash 0 protocol 8
186cb 1234 ingress_ifindex 1 ifindex 1
187len 84 hash 0 protocol 8
188cb 1234 ingress_ifindex 1 ifindex 1
189len 84 hash 0 protocol 8
190cb 1234 ingress_ifindex 1 ifindex 1
191len 84 hash 0 protocol 8
192cb 1234 ingress_ifindex 1 ifindex 1" || exit 1
193 remove_prog in
194}
195
196function test_data {
197 test_start "test_data on lwt $1"
198 install_test $1 test_data
199 ping -c 3 $IPVETH1 || {
200 failure "test_data ${1}: packets are dropped"
201 }
202 match_trace "$(get_trace)" "
203src: 1fea8c0 dst: 2fea8c0
204src: 1fea8c0 dst: 2fea8c0
205src: 1fea8c0 dst: 2fea8c0" || exit 1
206 remove_prog $1
207}
208
209function test_data_in {
210 test_start "test_data on lwt in"
211 install_test in test_data
212 ping -c 3 $IP_LOCAL || {
213 failure "test_data in: packets are dropped"
214 }
215 # We will see both request & reply packets as the packets will
216 # be from $IP_LOCAL => $IP_LOCAL
217 match_trace "$(get_trace)" "
218src: 163a8c0 dst: 163a8c0
219src: 163a8c0 dst: 163a8c0
220src: 163a8c0 dst: 163a8c0
221src: 163a8c0 dst: 163a8c0
222src: 163a8c0 dst: 163a8c0
223src: 163a8c0 dst: 163a8c0" || exit 1
224 remove_prog in
225}
226
227function test_cb {
228 test_start "test_cb on lwt $1"
229 install_test $1 test_cb
230 ping -c 3 $IPVETH1 || {
231 failure "test_cb ${1}: packets are dropped"
232 }
233 match_trace "$(get_trace)" "
234cb0: 0 cb1: 0 cb2: 0
235cb3: 0 cb4: 0
236cb0: 0 cb1: 0 cb2: 0
237cb3: 0 cb4: 0
238cb0: 0 cb1: 0 cb2: 0
239cb3: 0 cb4: 0" || exit 1
240 remove_prog $1
241}
242
243function test_cb_in {
244 test_start "test_cb on lwt in"
245 install_test in test_cb
246 ping -c 3 $IP_LOCAL || {
247 failure "test_cb in: packets are dropped"
248 }
249 # We will see both request & reply packets as the packets will
250 # be from $IP_LOCAL => $IP_LOCAL
251 match_trace "$(get_trace)" "
252cb0: 0 cb1: 0 cb2: 0
253cb3: 0 cb4: 0
254cb0: 0 cb1: 0 cb2: 0
255cb3: 0 cb4: 0
256cb0: 0 cb1: 0 cb2: 0
257cb3: 0 cb4: 0
258cb0: 0 cb1: 0 cb2: 0
259cb3: 0 cb4: 0
260cb0: 0 cb1: 0 cb2: 0
261cb3: 0 cb4: 0
262cb0: 0 cb1: 0 cb2: 0
263cb3: 0 cb4: 0" || exit 1
264 remove_prog in
265}
266
267function test_drop_all {
268 test_start "test_drop_all on lwt $1"
269 install_test $1 drop_all
270 ping -c 3 $IPVETH1 && {
271 failure "test_drop_all ${1}: Unexpected success of ping"
272 }
273 match_trace "$(get_trace)" "
274dropping with: 2
275dropping with: 2
276dropping with: 2" || exit 1
277 remove_prog $1
278}
279
280function test_drop_all_in {
281 test_start "test_drop_all on lwt in"
282 install_test in drop_all
283 ping -c 3 $IP_LOCAL && {
284 failure "test_drop_all in: Unexpected success of ping"
285 }
286 match_trace "$(get_trace)" "
287dropping with: 2
288dropping with: 2
289dropping with: 2" || exit 1
290 remove_prog in
291}
292
293function test_push_ll_and_redirect {
294 test_start "test_push_ll_and_redirect on lwt xmit"
295 install_test xmit push_ll_and_redirect
296 ping -c 3 $IPVETH1 || {
297 failure "Redirected packets appear to be dropped"
298 }
299 match_trace "$(get_trace)" "
300redirected to $DST_IFINDEX
301redirected to $DST_IFINDEX
302redirected to $DST_IFINDEX" || exit 1
303 remove_prog xmit
304}
305
306function test_no_l2_and_redirect {
307 test_start "test_no_l2_and_redirect on lwt xmit"
308 install_test xmit fill_garbage_and_redirect
309 ping -c 3 $IPVETH1 && {
310 failure "Unexpected success despite lack of L2 header"
311 }
312 match_trace "$(get_trace)" "
313redirected to $DST_IFINDEX
314redirected to $DST_IFINDEX
315redirected to $DST_IFINDEX" || exit 1
316 remove_prog xmit
317}
318
319function test_rewrite {
320 test_start "test_rewrite on lwt xmit"
321 install_test xmit test_rewrite
322 ping -c 3 $IPVETH1 || {
323 failure "Rewritten packets appear to be dropped"
324 }
325 match_trace "$(get_trace)" "
326out: rewriting from 2fea8c0 to 3fea8c0
327out: rewriting from 2fea8c0 to 3fea8c0
328out: rewriting from 2fea8c0 to 3fea8c0" || exit 1
329 remove_prog xmit
330}
331
332function test_fill_garbage {
333 test_start "test_fill_garbage on lwt xmit"
334 install_test xmit fill_garbage
335 ping -c 3 $IPVETH1 && {
336 failure "test_drop_all ${1}: Unexpected success of ping"
337 }
338 match_trace "$(get_trace)" "
339Set initial 96 bytes of header to FF
340Set initial 96 bytes of header to FF
341Set initial 96 bytes of header to FF" || exit 1
342 remove_prog xmit
343}
344
345function test_netperf_nop {
346 test_start "test_netperf_nop on lwt xmit"
347 install_test xmit nop
348 netperf -H $IPVETH1 -t TCP_STREAM || {
349 failure "packets appear to be dropped"
350 }
351 match_trace "$(get_trace)" "" || exit 1
352 remove_prog xmit
353}
354
355function test_netperf_redirect {
356 test_start "test_netperf_redirect on lwt xmit"
357 install_test xmit push_ll_and_redirect_silent
358 netperf -H $IPVETH1 -t TCP_STREAM || {
359 failure "Rewritten packets appear to be dropped"
360 }
361 match_trace "$(get_trace)" "" || exit 1
362 remove_prog xmit
363}
364
365cleanup
366setup_one_veth $NS1 $VETH0 $VETH1 $IPVETH0 $IPVETH1 $IPVETH1b
367setup_one_veth $NS2 $VETH2 $VETH3 $IPVETH2 $IPVETH3
368ip netns exec $NS1 netserver
369echo 1 > ${TRACE_ROOT}/tracing_on
370
371DST_MAC=$(lookup_mac $VETH1 $NS1)
372SRC_MAC=$(lookup_mac $VETH0)
373DST_IFINDEX=$(cat /sys/class/net/$VETH0/ifindex)
374
375CLANG_OPTS="-O2 -target bpf -I ../include/"
376CLANG_OPTS+=" -DSRC_MAC=$SRC_MAC -DDST_MAC=$DST_MAC -DDST_IFINDEX=$DST_IFINDEX"
377clang $CLANG_OPTS -c test_lwt_bpf.c -o test_lwt_bpf.o
378
379test_ctx_xmit
380test_ctx_out
381test_ctx_in
382test_data "xmit"
383test_data "out"
384test_data_in
385test_cb "xmit"
386test_cb "out"
387test_cb_in
388test_drop_all "xmit"
389test_drop_all "out"
390test_drop_all_in
391test_rewrite
392test_push_ll_and_redirect
393test_no_l2_and_redirect
394test_fill_garbage
395test_netperf_nop
396test_netperf_redirect
397
398cleanup
399echo 0 > ${TRACE_ROOT}/tracing_on
400exit 0
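A note on lookup_mac() above: the byte-swapped hex string it builds assumes a little-endian test host, so that the low six bytes of the SRC_MAC/DST_MAC constants sit in wire order in memory when __do_push_ll_and_redirect() does memcpy(&ehdr.h_source, &smac, 6). A small host-side sketch of that layout assumption:

/* Why lookup_mac() reverses the address string: on a little-endian
 * host the low 6 bytes of the u64 are stored first in memory, so a
 * 6-byte memcpy reproduces the original wire order.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	/* aa:bb:cc:dd:ee:ff encoded the way lookup_mac() emits it */
	uint64_t mac = 0xffeeddccbbaaULL;
	uint8_t hdr[6];
	int i;

	memcpy(hdr, &mac, 6);
	for (i = 0; i < 6; i++)
		printf("%02x%s", hdr[i], i < 5 ? ":" : "\n");
	return 0;
}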
diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map_kern.c
new file mode 100644
index 000000000..b0200c8ea
--- /dev/null
+++ b/samples/bpf/test_map_in_map_kern.c
@@ -0,0 +1,176 @@
1/*
2 * Copyright (c) 2017 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 */
8#define KBUILD_MODNAME "foo"
9#include <linux/ptrace.h>
10#include <linux/version.h>
11#include <uapi/linux/bpf.h>
12#include <uapi/linux/in6.h>
13#include <bpf/bpf_helpers.h>
14#include <bpf/bpf_tracing.h>
15#include <bpf/bpf_core_read.h>
16#include "trace_common.h"
17
18#define MAX_NR_PORTS 65536
19
20/* map #0 */
21struct inner_a {
22 __uint(type, BPF_MAP_TYPE_ARRAY);
23 __type(key, u32);
24 __type(value, int);
25 __uint(max_entries, MAX_NR_PORTS);
26} port_a SEC(".maps");
27
28/* map #1 */
29struct inner_h {
30 __uint(type, BPF_MAP_TYPE_HASH);
31 __type(key, u32);
32 __type(value, int);
33 __uint(max_entries, 1);
34} port_h SEC(".maps");
35
36/* map #2 */
37struct {
38 __uint(type, BPF_MAP_TYPE_HASH);
39 __type(key, u32);
40 __type(value, int);
41 __uint(max_entries, 1);
42} reg_result_h SEC(".maps");
43
44/* map #3 */
45struct {
46 __uint(type, BPF_MAP_TYPE_HASH);
47 __type(key, u32);
48 __type(value, int);
49 __uint(max_entries, 1);
50} inline_result_h SEC(".maps");
51
52/* map #4 */ /* Test case #0 */
53struct {
54 __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
55 __uint(max_entries, MAX_NR_PORTS);
56 __uint(key_size, sizeof(u32));
57 __array(values, struct inner_a); /* use inner_a as inner map */
58} a_of_port_a SEC(".maps");
59
60/* map #5 */ /* Test case #1 */
61struct {
62 __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
63 __uint(max_entries, 1);
64 __uint(key_size, sizeof(u32));
65 __array(values, struct inner_a); /* use inner_a as inner map */
66} h_of_port_a SEC(".maps");
67
68/* map #6 */ /* Test case #2 */
69struct {
70 __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
71 __uint(max_entries, 1);
72 __uint(key_size, sizeof(u32));
73 __array(values, struct inner_h); /* use inner_h as inner map */
74} h_of_port_h SEC(".maps");
75
76static __always_inline int do_reg_lookup(void *inner_map, u32 port)
77{
78 int *result;
79
80 result = bpf_map_lookup_elem(inner_map, &port);
81 return result ? *result : -ENOENT;
82}
83
84static __always_inline int do_inline_array_lookup(void *inner_map, u32 port)
85{
86 int *result;
87
88 if (inner_map != &port_a)
89 return -EINVAL;
90
91 result = bpf_map_lookup_elem(&port_a, &port);
92 return result ? *result : -ENOENT;
93}
94
95static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port)
96{
97 int *result;
98
99 if (inner_map != &port_h)
100 return -EINVAL;
101
102 result = bpf_map_lookup_elem(&port_h, &port);
103 return result ? *result : -ENOENT;
104}
105
106SEC("kprobe/__sys_connect")
107int trace_sys_connect(struct pt_regs *ctx)
108{
109 struct sockaddr_in6 *in6;
110 u16 test_case, port, dst6[8];
111 int addrlen, ret, inline_ret, ret_key = 0;
112 u32 port_key;
113 void *outer_map, *inner_map;
114 bool inline_hash = false;
115
116 in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(ctx);
117 addrlen = (int)PT_REGS_PARM3_CORE(ctx);
118
119 if (addrlen != sizeof(*in6))
120 return 0;
121
122 ret = bpf_probe_read_user(dst6, sizeof(dst6), &in6->sin6_addr);
123 if (ret) {
124 inline_ret = ret;
125 goto done;
126 }
127
128 if (dst6[0] != 0xdead || dst6[1] != 0xbeef)
129 return 0;
130
131 test_case = dst6[7];
132
133 ret = bpf_probe_read_user(&port, sizeof(port), &in6->sin6_port);
134 if (ret) {
135 inline_ret = ret;
136 goto done;
137 }
138
139 port_key = port;
140
141 ret = -ENOENT;
142 if (test_case == 0) {
143 outer_map = &a_of_port_a;
144 } else if (test_case == 1) {
145 outer_map = &h_of_port_a;
146 } else if (test_case == 2) {
147 outer_map = &h_of_port_h;
148 } else {
149 ret = __LINE__;
150 inline_ret = ret;
151 goto done;
152 }
153
154 inner_map = bpf_map_lookup_elem(outer_map, &port_key);
155 if (!inner_map) {
156 ret = __LINE__;
157 inline_ret = ret;
158 goto done;
159 }
160
161 ret = do_reg_lookup(inner_map, port_key);
162
163 if (test_case == 0 || test_case == 1)
164 inline_ret = do_inline_array_lookup(inner_map, port_key);
165 else
166 inline_ret = do_inline_hash_lookup(inner_map, port_key);
167
168done:
169 bpf_map_update_elem(&reg_result_h, &ret_key, &ret, BPF_ANY);
170 bpf_map_update_elem(&inline_result_h, &ret_key, &inline_ret, BPF_ANY);
171
172 return 0;
173}
174
175char _license[] SEC("license") = "GPL";
176u32 _version SEC("version") = LINUX_VERSION_CODE;
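trace_sys_connect() above keys off a magic destination: the address must begin dead:beef, word 7 of the IPv6 address selects the test case, and the raw port selects the inner-map slot. A minimal trigger sketch (the full harness is test_map_in_map_user.c below; nothing here sets up the maps):

/* Fire the __sys_connect kprobe with the magic address it decodes. */
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 };

	in6.sin6_addr.s6_addr16[0] = 0xdead;
	in6.sin6_addr.s6_addr16[1] = 0xbeef;
	in6.sin6_addr.s6_addr16[7] = 0;	/* dst6[7]: test case #0 */
	in6.sin6_port = 123;		/* port_key for the inner map */

	/* connect() on fd -1 fails with EBADF, but the kprobe at
	 * function entry still observes the sockaddr arguments */
	connect(-1, (struct sockaddr *)&in6, sizeof(in6));
	return 0;
}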
diff --git a/samples/bpf/test_map_in_map_user.c b/samples/bpf/test_map_in_map_user.c
new file mode 100644
index 000000000..98656de56
--- /dev/null
+++ b/samples/bpf/test_map_in_map_user.c
@@ -0,0 +1,173 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (c) 2017 Facebook
4 */
5#include <sys/resource.h>
6#include <sys/socket.h>
7#include <arpa/inet.h>
8#include <stdint.h>
9#include <assert.h>
10#include <errno.h>
11#include <stdlib.h>
12#include <stdio.h>
13#include <bpf/bpf.h>
14#include <bpf/libbpf.h>
15
16static int map_fd[7];
17
18#define PORT_A (map_fd[0])
19#define PORT_H (map_fd[1])
20#define REG_RESULT_H (map_fd[2])
21#define INLINE_RESULT_H (map_fd[3])
22#define A_OF_PORT_A (map_fd[4]) /* Test case #0 */
23#define H_OF_PORT_A (map_fd[5]) /* Test case #1 */
24#define H_OF_PORT_H (map_fd[6]) /* Test case #2 */
25
26static const char * const test_names[] = {
27 "Array of Array",
28 "Hash of Array",
29 "Hash of Hash",
30};
31
32#define NR_TESTS (sizeof(test_names) / sizeof(*test_names))
33
34static void check_map_id(int inner_map_fd, int map_in_map_fd, uint32_t key)
35{
36 struct bpf_map_info info = {};
37 uint32_t info_len = sizeof(info);
38 int ret, id;
39
40 ret = bpf_obj_get_info_by_fd(inner_map_fd, &info, &info_len);
41 assert(!ret);
42
43 ret = bpf_map_lookup_elem(map_in_map_fd, &key, &id);
44 assert(!ret);
45 assert(id == info.id);
46}
47
48static void populate_map(uint32_t port_key, int magic_result)
49{
50 int ret;
51
52 ret = bpf_map_update_elem(PORT_A, &port_key, &magic_result, BPF_ANY);
53 assert(!ret);
54
55 ret = bpf_map_update_elem(PORT_H, &port_key, &magic_result,
56 BPF_NOEXIST);
57 assert(!ret);
58
59 ret = bpf_map_update_elem(A_OF_PORT_A, &port_key, &PORT_A, BPF_ANY);
60 assert(!ret);
61 check_map_id(PORT_A, A_OF_PORT_A, port_key);
62
63 ret = bpf_map_update_elem(H_OF_PORT_A, &port_key, &PORT_A, BPF_NOEXIST);
64 assert(!ret);
65 check_map_id(PORT_A, H_OF_PORT_A, port_key);
66
67 ret = bpf_map_update_elem(H_OF_PORT_H, &port_key, &PORT_H, BPF_NOEXIST);
68 assert(!ret);
69 check_map_id(PORT_H, H_OF_PORT_H, port_key);
70}
71
72static void test_map_in_map(void)
73{
74 struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 };
75 uint32_t result_key = 0, port_key;
76 int result, inline_result;
77 int magic_result = 0xfaceb00c;
78 int ret;
79 int i;
80
81 port_key = rand() & 0x00FF;
82 populate_map(port_key, magic_result);
83
84 in6.sin6_addr.s6_addr16[0] = 0xdead;
85 in6.sin6_addr.s6_addr16[1] = 0xbeef;
86 in6.sin6_port = port_key;
87
88 for (i = 0; i < NR_TESTS; i++) {
89 printf("%s: ", test_names[i]);
90
91 in6.sin6_addr.s6_addr16[7] = i;
92 ret = connect(-1, (struct sockaddr *)&in6, sizeof(in6));
93 assert(ret == -1 && errno == EBADF);
94
95 ret = bpf_map_lookup_elem(REG_RESULT_H, &result_key, &result);
96 assert(!ret);
97
98 ret = bpf_map_lookup_elem(INLINE_RESULT_H, &result_key,
99 &inline_result);
100 assert(!ret);
101
102 if (result != magic_result || inline_result != magic_result) {
103 printf("Error. result:%d inline_result:%d\n",
104 result, inline_result);
105 exit(1);
106 }
107
108 bpf_map_delete_elem(REG_RESULT_H, &result_key);
109 bpf_map_delete_elem(INLINE_RESULT_H, &result_key);
110
111 printf("Pass\n");
112 }
113}
114
115int main(int argc, char **argv)
116{
117 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
118 struct bpf_link *link = NULL;
119 struct bpf_program *prog;
120 struct bpf_object *obj;
121 char filename[256];
122
123 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
124 perror("setrlimit(RLIMIT_MEMLOCK)");
125 return 1;
126 }
127
128 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
129 obj = bpf_object__open_file(filename, NULL);
130 if (libbpf_get_error(obj)) {
131 fprintf(stderr, "ERROR: opening BPF object file failed\n");
132 return 1;
133 }
134
135 prog = bpf_object__find_program_by_name(obj, "trace_sys_connect");
136 if (!prog) {
137 printf("finding a prog in obj file failed\n");
138 goto cleanup;
139 }
140
141 /* load BPF program */
142 if (bpf_object__load(obj)) {
143 fprintf(stderr, "ERROR: loading BPF object file failed\n");
144 goto cleanup;
145 }
146
147 map_fd[0] = bpf_object__find_map_fd_by_name(obj, "port_a");
148 map_fd[1] = bpf_object__find_map_fd_by_name(obj, "port_h");
149 map_fd[2] = bpf_object__find_map_fd_by_name(obj, "reg_result_h");
150 map_fd[3] = bpf_object__find_map_fd_by_name(obj, "inline_result_h");
151 map_fd[4] = bpf_object__find_map_fd_by_name(obj, "a_of_port_a");
152 map_fd[5] = bpf_object__find_map_fd_by_name(obj, "h_of_port_a");
153 map_fd[6] = bpf_object__find_map_fd_by_name(obj, "h_of_port_h");
154 if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0 ||
155 map_fd[3] < 0 || map_fd[4] < 0 || map_fd[5] < 0 || map_fd[6] < 0) {
156 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
157 goto cleanup;
158 }
159
160 link = bpf_program__attach(prog);
161 if (libbpf_get_error(link)) {
162 fprintf(stderr, "ERROR: bpf_program__attach failed\n");
163 link = NULL;
164 goto cleanup;
165 }
166
167 test_map_in_map();
168
169cleanup:
170 bpf_link__destroy(link);
171 bpf_object__close(obj);
172 return 0;
173}
diff --git a/samples/bpf/test_overhead_kprobe_kern.c b/samples/bpf/test_overhead_kprobe_kern.c
new file mode 100644
index 000000000..f6d593e47
--- /dev/null
+++ b/samples/bpf/test_overhead_kprobe_kern.c
@@ -0,0 +1,48 @@
1/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/version.h>
8#include <linux/ptrace.h>
9#include <uapi/linux/bpf.h>
10#include <bpf/bpf_helpers.h>
11#include <bpf/bpf_tracing.h>
12
13#define _(P) \
14 ({ \
15 typeof(P) val = 0; \
16 bpf_probe_read_kernel(&val, sizeof(val), &(P)); \
17 val; \
18 })
19
20SEC("kprobe/__set_task_comm")
21int prog(struct pt_regs *ctx)
22{
23 struct signal_struct *signal;
24 struct task_struct *tsk;
25 char oldcomm[16] = {};
26 char newcomm[16] = {};
27 u16 oom_score_adj;
28 u32 pid;
29
30 tsk = (void *)PT_REGS_PARM1(ctx);
31
32 pid = _(tsk->pid);
33 bpf_probe_read_kernel(oldcomm, sizeof(oldcomm), &tsk->comm);
34 bpf_probe_read_kernel(newcomm, sizeof(newcomm),
35 (void *)PT_REGS_PARM2(ctx));
36 signal = _(tsk->signal);
37 oom_score_adj = _(signal->oom_score_adj);
38 return 0;
39}
40
41SEC("kprobe/urandom_read")
42int prog2(struct pt_regs *ctx)
43{
44 return 0;
45}
46
47char _license[] SEC("license") = "GPL";
48u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_overhead_raw_tp_kern.c b/samples/bpf/test_overhead_raw_tp_kern.c
new file mode 100644
index 000000000..8763181a3
--- /dev/null
+++ b/samples/bpf/test_overhead_raw_tp_kern.c
@@ -0,0 +1,17 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (c) 2018 Facebook */
3#include <uapi/linux/bpf.h>
4#include <bpf/bpf_helpers.h>
5
6SEC("raw_tracepoint/task_rename")
7int prog(struct bpf_raw_tracepoint_args *ctx)
8{
9 return 0;
10}
11
12SEC("raw_tracepoint/urandom_read")
13int prog2(struct bpf_raw_tracepoint_args *ctx)
14{
15 return 0;
16}
17char _license[] SEC("license") = "GPL";
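The two empty programs above isolate raw-tracepoint attach and dispatch overhead. For contrast, a sketch of a program that actually consumes the task_rename arguments; unlike the typed tracepoint context in test_overhead_tp_kern.c below, raw arguments arrive as untyped u64s. The (task, comm) ordering is an assumption based on the tracepoint's TP_PROTO:

/* Sketch: reading the task_rename arguments the benchmark ignores. */
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("raw_tracepoint/task_rename")
int prog_args(struct bpf_raw_tracepoint_args *ctx)
{
	/* assumed TP_PROTO(struct task_struct *task, const char *comm) */
	const char *comm = (const char *)ctx->args[1];
	char buf[16] = {};

	bpf_probe_read_kernel_str(buf, sizeof(buf), comm);
	return 0;
}

char _license[] SEC("license") = "GPL";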
diff --git a/samples/bpf/test_overhead_tp_kern.c b/samples/bpf/test_overhead_tp_kern.c
new file mode 100644
index 000000000..eaa32693f
--- /dev/null
+++ b/samples/bpf/test_overhead_tp_kern.c
@@ -0,0 +1,36 @@
1/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <uapi/linux/bpf.h>
8#include <bpf/bpf_helpers.h>
9
10/* from /sys/kernel/debug/tracing/events/task/task_rename/format */
11struct task_rename {
12 __u64 pad;
13 __u32 pid;
14 char oldcomm[16];
15 char newcomm[16];
16 __u16 oom_score_adj;
17};
18SEC("tracepoint/task/task_rename")
19int prog(struct task_rename *ctx)
20{
21 return 0;
22}
23
24/* from /sys/kernel/debug/tracing/events/random/urandom_read/format */
25struct urandom_read {
26 __u64 pad;
27 int got_bits;
28 int pool_left;
29 int input_left;
30};
31SEC("tracepoint/random/urandom_read")
32int prog2(struct urandom_read *ctx)
33{
34 return 0;
35}
36char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c
new file mode 100644
index 000000000..94f74112a
--- /dev/null
+++ b/samples/bpf/test_overhead_user.c
@@ -0,0 +1,182 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2016 Facebook
3 */
4#define _GNU_SOURCE
5#include <sched.h>
6#include <errno.h>
7#include <stdio.h>
8#include <sys/types.h>
9#include <asm/unistd.h>
10#include <fcntl.h>
11#include <unistd.h>
12#include <assert.h>
13#include <sys/wait.h>
14#include <stdlib.h>
15#include <signal.h>
16#include <linux/bpf.h>
17#include <string.h>
18#include <time.h>
19#include <sys/resource.h>
20#include <bpf/bpf.h>
21#include "bpf_load.h"
22
23#define MAX_CNT 1000000
24
25static __u64 time_get_ns(void)
26{
27 struct timespec ts;
28
29 clock_gettime(CLOCK_MONOTONIC, &ts);
30 return ts.tv_sec * 1000000000ull + ts.tv_nsec;
31}
32
33static void test_task_rename(int cpu)
34{
35 __u64 start_time;
36 char buf[] = "test\n";
37 int i, fd;
38
39 fd = open("/proc/self/comm", O_WRONLY|O_TRUNC);
40 if (fd < 0) {
41 printf("couldn't open /proc\n");
42 exit(1);
43 }
44 start_time = time_get_ns();
45 for (i = 0; i < MAX_CNT; i++) {
46 if (write(fd, buf, sizeof(buf)) < 0) {
47 printf("task rename failed: %s\n", strerror(errno));
48 close(fd);
49 return;
50 }
51 }
52 printf("task_rename:%d: %lld events per sec\n",
53 cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
54 close(fd);
55}
56
57static void test_urandom_read(int cpu)
58{
59 __u64 start_time;
60 char buf[4];
61 int i, fd;
62
63 fd = open("/dev/urandom", O_RDONLY);
64 if (fd < 0) {
65 printf("couldn't open /dev/urandom\n");
66 exit(1);
67 }
68 start_time = time_get_ns();
69 for (i = 0; i < MAX_CNT; i++) {
70 if (read(fd, buf, sizeof(buf)) < 0) {
71 printf("failed to read from /dev/urandom: %s\n", strerror(errno));
72 close(fd);
73 return;
74 }
75 }
76 printf("urandom_read:%d: %lld events per sec\n",
77 cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
78 close(fd);
79}
80
81static void loop(int cpu, int flags)
82{
83 cpu_set_t cpuset;
84
85 CPU_ZERO(&cpuset);
86 CPU_SET(cpu, &cpuset);
87 sched_setaffinity(0, sizeof(cpuset), &cpuset);
88
89 if (flags & 1)
90 test_task_rename(cpu);
91 if (flags & 2)
92 test_urandom_read(cpu);
93}
94
95static void run_perf_test(int tasks, int flags)
96{
97 pid_t pid[tasks];
98 int i;
99
100 for (i = 0; i < tasks; i++) {
101 pid[i] = fork();
102 if (pid[i] == 0) {
103 loop(i, flags);
104 exit(0);
105 } else if (pid[i] == -1) {
106 printf("couldn't spawn #%d process\n", i);
107 exit(1);
108 }
109 }
110 for (i = 0; i < tasks; i++) {
111 int status;
112
113 assert(waitpid(pid[i], &status, 0) == pid[i]);
114 assert(status == 0);
115 }
116}
117
118static void unload_progs(void)
119{
120 close(prog_fd[0]);
121 close(prog_fd[1]);
122 close(event_fd[0]);
123 close(event_fd[1]);
124}
125
126int main(int argc, char **argv)
127{
128 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
129 char filename[256];
130 int num_cpu = 8;
131 int test_flags = ~0;
132
133 setrlimit(RLIMIT_MEMLOCK, &r);
134
135 if (argc > 1)
136 test_flags = atoi(argv[1]) ? : test_flags;
137 if (argc > 2)
138 num_cpu = atoi(argv[2]) ? : num_cpu;
139
140 if (test_flags & 0x3) {
141 printf("BASE\n");
142 run_perf_test(num_cpu, test_flags);
143 }
144
145 if (test_flags & 0xC) {
146 snprintf(filename, sizeof(filename),
147 "%s_kprobe_kern.o", argv[0]);
148 if (load_bpf_file(filename)) {
149 printf("%s", bpf_log_buf);
150 return 1;
151 }
152 printf("w/KPROBE\n");
153 run_perf_test(num_cpu, test_flags >> 2);
154 unload_progs();
155 }
156
157 if (test_flags & 0x30) {
158 snprintf(filename, sizeof(filename),
159 "%s_tp_kern.o", argv[0]);
160 if (load_bpf_file(filename)) {
161 printf("%s", bpf_log_buf);
162 return 1;
163 }
164 printf("w/TRACEPOINT\n");
165 run_perf_test(num_cpu, test_flags >> 4);
166 unload_progs();
167 }
168
169 if (test_flags & 0xC0) {
170 snprintf(filename, sizeof(filename),
171 "%s_raw_tp_kern.o", argv[0]);
172 if (load_bpf_file(filename)) {
173 printf("%s", bpf_log_buf);
174 return 1;
175 }
176 printf("w/RAW_TRACEPOINT\n");
177 run_perf_test(num_cpu, test_flags >> 6);
178 unload_progs();
179 }
180
181 return 0;
182}
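main() above packs two bits per benchmark mode into test_flags: within each pair, bit 0 enables the task-rename loop and bit 1 the urandom-read loop, and each mode shifts its pair down before handing it to run_perf_test(). A small decode sketch, derived from the masks and shifts in main():

/* Show which loops a given test_flags value enables. */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	static const char * const mode[] = {
		"BASE", "KPROBE", "TRACEPOINT", "RAW_TRACEPOINT" };
	int flags = argc > 1 ? atoi(argv[1]) : ~0;
	int i;

	for (i = 0; i < 4; i++) {
		int pair = (flags >> (2 * i)) & 0x3;

		printf("%-15s rename:%d urandom:%d\n",
		       mode[i], pair & 1, !!(pair & 2));
	}
	return 0;
}

For example, "./test_overhead 4 1" runs only the kprobe task-rename loop on one CPU, since 4 >> 2 == 1 and loop() maps bit 0 to test_task_rename().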
diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh
new file mode 100755
index 000000000..35db26f73
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,16 @@
1#!/bin/bash
2
3rm -rf tmpmnt
4rm -f testfile.img
5dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
6DEVICE=$(losetup --show -f testfile.img)
7mkfs.btrfs -f $DEVICE
8mkdir tmpmnt
9./tracex7 $DEVICE
10if [ $? -eq 0 ]
11then
12 echo "SUCCESS!"
13else
14 echo "FAILED!"
15fi
16losetup -d $DEVICE
diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c
new file mode 100644
index 000000000..220a96438
--- /dev/null
+++ b/samples/bpf/test_probe_write_user_kern.c
@@ -0,0 +1,56 @@
1/* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me>
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/skbuff.h>
8#include <linux/netdevice.h>
9#include <uapi/linux/bpf.h>
10#include <linux/version.h>
11#include <bpf/bpf_helpers.h>
12#include <bpf/bpf_tracing.h>
13#include <bpf/bpf_core_read.h>
14#include "trace_common.h"
15
16struct {
17 __uint(type, BPF_MAP_TYPE_HASH);
18 __type(key, struct sockaddr_in);
19 __type(value, struct sockaddr_in);
20 __uint(max_entries, 256);
21} dnat_map SEC(".maps");
22
23/* kprobe is NOT a stable ABI
24 * kernel functions can be removed, renamed or completely change semantics.
25 * Number of arguments and their positions can change, etc.
26 * In such a case, this bpf+kprobe example will no longer be meaningful.
27 *
28 * This example sits on a syscall, whose ABI is relatively stable;
29 * even so, across platforms and over time, the ABI may change.
30 */
31SEC("kprobe/" SYSCALL(sys_connect))
32int bpf_prog1(struct pt_regs *ctx)
33{
34 struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx);
35 void *sockaddr_arg = (void *)PT_REGS_PARM2_CORE(real_regs);
36 int sockaddr_len = (int)PT_REGS_PARM3_CORE(real_regs);
37 struct sockaddr_in new_addr, orig_addr = {};
38 struct sockaddr_in *mapped_addr;
39
40 if (sockaddr_len > sizeof(orig_addr))
41 return 0;
42
43 if (bpf_probe_read_user(&orig_addr, sizeof(orig_addr), sockaddr_arg) != 0)
44 return 0;
45
46 mapped_addr = bpf_map_lookup_elem(&dnat_map, &orig_addr);
47 if (mapped_addr != NULL) {
48 memcpy(&new_addr, mapped_addr, sizeof(new_addr));
49 bpf_probe_write_user(sockaddr_arg, &new_addr,
50 sizeof(new_addr));
51 }
52 return 0;
53}
54
55char _license[] SEC("license") = "GPL";
56u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_probe_write_user_user.c b/samples/bpf/test_probe_write_user_user.c
new file mode 100644
index 000000000..00ccfb834
--- /dev/null
+++ b/samples/bpf/test_probe_write_user_user.c
@@ -0,0 +1,108 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <stdio.h>
3#include <assert.h>
4#include <unistd.h>
5#include <bpf/bpf.h>
6#include <bpf/libbpf.h>
7#include <sys/socket.h>
8#include <netinet/in.h>
9#include <arpa/inet.h>
10
11int main(int ac, char **argv)
12{
13 struct sockaddr_in *serv_addr_in, *mapped_addr_in, *tmp_addr_in;
14 struct sockaddr serv_addr, mapped_addr, tmp_addr;
15 int serverfd, serverconnfd, clientfd, map_fd;
16 struct bpf_link *link = NULL;
17 struct bpf_program *prog;
18 struct bpf_object *obj;
19 socklen_t sockaddr_len;
20 char filename[256];
21 char *ip;
22
23 serv_addr_in = (struct sockaddr_in *)&serv_addr;
24 mapped_addr_in = (struct sockaddr_in *)&mapped_addr;
25 tmp_addr_in = (struct sockaddr_in *)&tmp_addr;
26
27 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
28 obj = bpf_object__open_file(filename, NULL);
29 if (libbpf_get_error(obj)) {
30 fprintf(stderr, "ERROR: opening BPF object file failed\n");
31 return 1;
32 }
33
34 prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
35 if (libbpf_get_error(prog)) {
36 fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
37 goto cleanup;
38 }
39
40 /* load BPF program */
41 if (bpf_object__load(obj)) {
42 fprintf(stderr, "ERROR: loading BPF object file failed\n");
43 goto cleanup;
44 }
45
46 map_fd = bpf_object__find_map_fd_by_name(obj, "dnat_map");
47 if (map_fd < 0) {
48 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
49 goto cleanup;
50 }
51
52 link = bpf_program__attach(prog);
53 if (libbpf_get_error(link)) {
54 fprintf(stderr, "ERROR: bpf_program__attach failed\n");
55 link = NULL;
56 goto cleanup;
57 }
58
59 assert((serverfd = socket(AF_INET, SOCK_STREAM, 0)) > 0);
60 assert((clientfd = socket(AF_INET, SOCK_STREAM, 0)) > 0);
61
62 /* Bind server to ephemeral port on lo */
63 memset(&serv_addr, 0, sizeof(serv_addr));
64 serv_addr_in->sin_family = AF_INET;
65 serv_addr_in->sin_port = 0;
66 serv_addr_in->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
67
68 assert(bind(serverfd, &serv_addr, sizeof(serv_addr)) == 0);
69
70 sockaddr_len = sizeof(serv_addr);
71 assert(getsockname(serverfd, &serv_addr, &sockaddr_len) == 0);
72 ip = inet_ntoa(serv_addr_in->sin_addr);
73 printf("Server bound to: %s:%d\n", ip, ntohs(serv_addr_in->sin_port));
74
75 memset(&mapped_addr, 0, sizeof(mapped_addr));
76 mapped_addr_in->sin_family = AF_INET;
77 mapped_addr_in->sin_port = htons(5555);
78 mapped_addr_in->sin_addr.s_addr = inet_addr("255.255.255.255");
79
80 assert(!bpf_map_update_elem(map_fd, &mapped_addr, &serv_addr, BPF_ANY));
81
82 assert(listen(serverfd, 5) == 0);
83
84 ip = inet_ntoa(mapped_addr_in->sin_addr);
85 printf("Client connecting to: %s:%d\n",
86 ip, ntohs(mapped_addr_in->sin_port));
87 assert(connect(clientfd, &mapped_addr, sizeof(mapped_addr)) == 0);
88
89 sockaddr_len = sizeof(tmp_addr);
90 assert((serverconnfd = accept(serverfd, &tmp_addr, &sockaddr_len)) > 0);
91 ip = inet_ntoa(tmp_addr_in->sin_addr);
92 printf("Server received connection from: %s:%d\n",
93 ip, ntohs(tmp_addr_in->sin_port));
94
95 sockaddr_len = sizeof(tmp_addr);
96 assert(getpeername(clientfd, &tmp_addr, &sockaddr_len) == 0);
97 ip = inet_ntoa(tmp_addr_in->sin_addr);
98 printf("Client's peer address: %s:%d\n",
99 ip, ntohs(tmp_addr_in->sin_port));
100
101 /* Is the server's getsockname = the socket getpeername */
102 assert(memcmp(&serv_addr, &tmp_addr, sizeof(struct sockaddr_in)) == 0);
103
104cleanup:
105 bpf_link__destroy(link);
106 bpf_object__close(obj);
107 return 0;
108}
diff --git a/samples/bpf/trace_common.h b/samples/bpf/trace_common.h
new file mode 100644
index 000000000..8cb5400ae
--- /dev/null
+++ b/samples/bpf/trace_common.h
@@ -0,0 +1,13 @@
1// SPDX-License-Identifier: GPL-2.0
2#ifndef __TRACE_COMMON_H
3#define __TRACE_COMMON_H
4
5#ifdef __x86_64__
6#define SYSCALL(SYS) "__x64_" __stringify(SYS)
7#elif defined(__s390x__)
8#define SYSCALL(SYS) "__s390x_" __stringify(SYS)
9#else
10#define SYSCALL(SYS) __stringify(SYS)
11#endif
12
13#endif
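Since v4.17, x86-64 (and s390x) kernels wrap syscalls in arch-prefixed functions, so a kprobe must target e.g. __x64_sys_connect rather than sys_connect; SYSCALL() builds that name at compile time. __stringify() comes from the kernel headers the *_kern.c files include; a user-space expansion sketch with the stringify helpers re-defined locally:

/* Demo of the SYSCALL() name mangling above. */
#include <stdio.h>

#define __stringify_1(x)	#x
#define __stringify(x)		__stringify_1(x)

#ifdef __x86_64__
#define SYSCALL(SYS) "__x64_" __stringify(SYS)
#elif defined(__s390x__)
#define SYSCALL(SYS) "__s390x_" __stringify(SYS)
#else
#define SYSCALL(SYS) __stringify(SYS)
#endif

int main(void)
{
	/* prints "kprobe/__x64_sys_connect" on x86_64 */
	printf("%s\n", "kprobe/" SYSCALL(sys_connect));
	return 0;
}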
diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c
new file mode 100644
index 000000000..7d3c66fb3
--- /dev/null
+++ b/samples/bpf/trace_event_kern.c
@@ -0,0 +1,80 @@
1/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/ptrace.h>
8#include <linux/version.h>
9#include <uapi/linux/bpf.h>
10#include <uapi/linux/bpf_perf_event.h>
11#include <uapi/linux/perf_event.h>
12#include <bpf/bpf_helpers.h>
13#include <bpf/bpf_tracing.h>
14
15struct key_t {
16 char comm[TASK_COMM_LEN];
17 u32 kernstack;
18 u32 userstack;
19};
20
21struct {
22 __uint(type, BPF_MAP_TYPE_HASH);
23 __type(key, struct key_t);
24 __type(value, u64);
25 __uint(max_entries, 10000);
26} counts SEC(".maps");
27
28struct {
29 __uint(type, BPF_MAP_TYPE_STACK_TRACE);
30 __uint(key_size, sizeof(u32));
31 __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
32 __uint(max_entries, 10000);
33} stackmap SEC(".maps");
34
35#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
36#define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK)
37
38SEC("perf_event")
39int bpf_prog1(struct bpf_perf_event_data *ctx)
40{
41 char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu";
42 char time_fmt2[] = "Get Time Failed, ErrCode: %d";
43 char addr_fmt[] = "Address recorded on event: %llx";
44 char fmt[] = "CPU-%d period %lld ip %llx";
45 u32 cpu = bpf_get_smp_processor_id();
46 struct bpf_perf_event_value value_buf;
47 struct key_t key;
48 u64 *val, one = 1;
49 int ret;
50
51 if (ctx->sample_period < 10000)
52 /* ignore warmup */
53 return 0;
54 bpf_get_current_comm(&key.comm, sizeof(key.comm));
55 key.kernstack = bpf_get_stackid(ctx, &stackmap, KERN_STACKID_FLAGS);
56 key.userstack = bpf_get_stackid(ctx, &stackmap, USER_STACKID_FLAGS);
57 if ((int)key.kernstack < 0 && (int)key.userstack < 0) {
58 bpf_trace_printk(fmt, sizeof(fmt), cpu, ctx->sample_period,
59 PT_REGS_IP(&ctx->regs));
60 return 0;
61 }
62
63 ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value));
64 if (!ret)
65 bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, value_buf.running);
66 else
67 bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret);
68
69 if (ctx->addr != 0)
70 bpf_trace_printk(addr_fmt, sizeof(addr_fmt), ctx->addr);
71
72 val = bpf_map_lookup_elem(&counts, &key);
73 if (val)
74 (*val)++;
75 else
76 bpf_map_update_elem(&counts, &key, &one, BPF_NOEXIST);
77 return 0;
78}
79
80char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c
new file mode 100644
index 000000000..ac1ba3681
--- /dev/null
+++ b/samples/bpf/trace_event_user.c
@@ -0,0 +1,354 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2016 Facebook
3 */
4#include <stdio.h>
5#include <unistd.h>
6#include <stdlib.h>
7#include <stdbool.h>
8#include <string.h>
9#include <linux/perf_event.h>
10#include <linux/bpf.h>
11#include <signal.h>
12#include <errno.h>
13#include <sys/resource.h>
14#include <bpf/bpf.h>
15#include <bpf/libbpf.h>
16#include "perf-sys.h"
17#include "trace_helpers.h"
18
19#define SAMPLE_FREQ 50
20
21static int pid;
22/* counts, stackmap */
23static int map_fd[2];
24struct bpf_program *prog;
25static bool sys_read_seen, sys_write_seen;
26
27static void print_ksym(__u64 addr)
28{
29 struct ksym *sym;
30
31 if (!addr)
32 return;
33 sym = ksym_search(addr);
34 if (!sym) {
35 printf("ksym not found. Is kallsyms loaded?\n");
36 return;
37 }
38
39 printf("%s;", sym->name);
40 if (strstr(sym->name, "sys_read"))
41 sys_read_seen = true;
42 else if (strstr(sym->name, "sys_write"))
43 sys_write_seen = true;
44}
45
46static void print_addr(__u64 addr)
47{
48 if (!addr)
49 return;
50 printf("%llx;", addr);
51}
52
53#define TASK_COMM_LEN 16
54
55struct key_t {
56 char comm[TASK_COMM_LEN];
57 __u32 kernstack;
58 __u32 userstack;
59};
60
61static void print_stack(struct key_t *key, __u64 count)
62{
63 __u64 ip[PERF_MAX_STACK_DEPTH] = {};
64 static bool warned;
65 int i;
66
67 printf("%3lld %s;", count, key->comm);
68 if (bpf_map_lookup_elem(map_fd[1], &key->kernstack, ip) != 0) {
69 printf("---;");
70 } else {
71 for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--)
72 print_ksym(ip[i]);
73 }
74 printf("-;");
75 if (bpf_map_lookup_elem(map_fd[1], &key->userstack, ip) != 0) {
76 printf("---;");
77 } else {
78 for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--)
79 print_addr(ip[i]);
80 }
81 if (count < 6)
82 printf("\r");
83 else
84 printf("\n");
85
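 /* bpf_get_stackid() returns -EEXIST when the hash bucket already
  * holds a different stack (a collision under BPF_F_FAST_STACK_CMP),
  * i.e. the trace for this key was dropped rather than stored.
  */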
86 if (key->kernstack == -EEXIST && !warned) {
87 printf("stackmap collisions seen. Consider increasing size\n");
88 warned = true;
89 } else if ((int)key->kernstack < 0 && (int)key->userstack < 0) {
90 printf("err stackid %d %d\n", key->kernstack, key->userstack);
91 }
92}
93
94static void err_exit(int err)
95{
96 kill(pid, SIGKILL);
97 exit(err);
98}
99
100static void print_stacks(void)
101{
102 struct key_t key = {}, next_key;
103 __u64 value;
104 __u32 stackid = 0, next_id;
105 int error = 1, fd = map_fd[0], stack_map = map_fd[1];
106
107 sys_read_seen = sys_write_seen = false;
108 while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
109 bpf_map_lookup_elem(fd, &next_key, &value);
110 print_stack(&next_key, value);
111 bpf_map_delete_elem(fd, &next_key);
112 key = next_key;
113 }
114 printf("\n");
115 if (!sys_read_seen || !sys_write_seen) {
116 printf("BUG kernel stack doesn't contain sys_read() and sys_write()\n");
117 err_exit(error);
118 }
119
120 /* clear stack map */
121 while (bpf_map_get_next_key(stack_map, &stackid, &next_id) == 0) {
122 bpf_map_delete_elem(stack_map, &next_id);
123 stackid = next_id;
124 }
125}
126
127static inline int generate_load(void)
128{
129 if (system("dd if=/dev/zero of=/dev/null count=5000k status=none") < 0) {
130 printf("failed to generate some load with dd: %s\n", strerror(errno));
131 return -1;
132 }
133
134 return 0;
135}
136
137static void test_perf_event_all_cpu(struct perf_event_attr *attr)
138{
139 int nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
140 struct bpf_link **links = calloc(nr_cpus, sizeof(struct bpf_link *));
141 int i, pmu_fd, error = 1;
142
143 if (!links) {
144 printf("malloc of links failed\n");
145 goto err;
146 }
147
148 /* system wide perf event, no need to inherit */
149 attr->inherit = 0;
150
151 /* open perf_event on all cpus */
152 for (i = 0; i < nr_cpus; i++) {
153 pmu_fd = sys_perf_event_open(attr, -1, i, -1, 0);
154 if (pmu_fd < 0) {
155 printf("sys_perf_event_open failed\n");
156 goto all_cpu_err;
157 }
158 links[i] = bpf_program__attach_perf_event(prog, pmu_fd);
159 if (libbpf_get_error(links[i])) {
160 printf("bpf_program__attach_perf_event failed\n");
161 links[i] = NULL;
162 close(pmu_fd);
163 goto all_cpu_err;
164 }
165 }
166
167 if (generate_load() < 0)
168 goto all_cpu_err;
169
170 print_stacks();
171 error = 0;
172all_cpu_err:
173 for (i--; i >= 0; i--)
174 bpf_link__destroy(links[i]);
175err:
176 free(links);
177 if (error)
178 err_exit(error);
179}
180
181static void test_perf_event_task(struct perf_event_attr *attr)
182{
183 struct bpf_link *link = NULL;
184 int pmu_fd, error = 1;
185
186 /* per task perf event, enable inherit so the "dd ..." command can be traced properly.
187 * Enabling inherit will cause the bpf_perf_prog_read_value() helper to fail.
188 */
189 attr->inherit = 1;
190
191 /* open task bound event */
192 pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0);
193 if (pmu_fd < 0) {
194 printf("sys_perf_event_open failed\n");
195 goto err;
196 }
197 link = bpf_program__attach_perf_event(prog, pmu_fd);
198 if (libbpf_get_error(link)) {
199 printf("bpf_program__attach_perf_event failed\n");
200 link = NULL;
201 close(pmu_fd);
202 goto err;
203 }
204
205 if (generate_load() < 0)
206 goto err;
207
208 print_stacks();
209 error = 0;
210err:
211 bpf_link__destroy(link);
212 if (error)
213 err_exit(error);
214}
215
216static void test_bpf_perf_event(void)
217{
218 struct perf_event_attr attr_type_hw = {
219 .sample_freq = SAMPLE_FREQ,
220 .freq = 1,
221 .type = PERF_TYPE_HARDWARE,
222 .config = PERF_COUNT_HW_CPU_CYCLES,
223 };
224 struct perf_event_attr attr_type_sw = {
225 .sample_freq = SAMPLE_FREQ,
226 .freq = 1,
227 .type = PERF_TYPE_SOFTWARE,
228 .config = PERF_COUNT_SW_CPU_CLOCK,
229 };
230 struct perf_event_attr attr_hw_cache_l1d = {
231 .sample_freq = SAMPLE_FREQ,
232 .freq = 1,
233 .type = PERF_TYPE_HW_CACHE,
234 .config =
235 PERF_COUNT_HW_CACHE_L1D |
236 (PERF_COUNT_HW_CACHE_OP_READ << 8) |
237 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
238 };
239 struct perf_event_attr attr_hw_cache_branch_miss = {
240 .sample_freq = SAMPLE_FREQ,
241 .freq = 1,
242 .type = PERF_TYPE_HW_CACHE,
243 .config =
244 PERF_COUNT_HW_CACHE_BPU |
245 (PERF_COUNT_HW_CACHE_OP_READ << 8) |
246 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
247 };
248 struct perf_event_attr attr_type_raw = {
249 .sample_freq = SAMPLE_FREQ,
250 .freq = 1,
251 .type = PERF_TYPE_RAW,
252 /* Intel Instruction Retired */
253 .config = 0xc0,
254 };
255 struct perf_event_attr attr_type_raw_lock_load = {
256 .sample_freq = SAMPLE_FREQ,
257 .freq = 1,
258 .type = PERF_TYPE_RAW,
259 /* Intel MEM_UOPS_RETIRED.LOCK_LOADS */
260 .config = 0x21d0,
261 /* Request to record lock address from PEBS */
262 .sample_type = PERF_SAMPLE_ADDR,
263 /* Record address value requires precise event */
264 .precise_ip = 2,
265 };
266
267 printf("Test HW_CPU_CYCLES\n");
268 test_perf_event_all_cpu(&attr_type_hw);
269 test_perf_event_task(&attr_type_hw);
270
271 printf("Test SW_CPU_CLOCK\n");
272 test_perf_event_all_cpu(&attr_type_sw);
273 test_perf_event_task(&attr_type_sw);
274
275 printf("Test HW_CACHE_L1D\n");
276 test_perf_event_all_cpu(&attr_hw_cache_l1d);
277 test_perf_event_task(&attr_hw_cache_l1d);
278
279 printf("Test HW_CACHE_BPU\n");
280 test_perf_event_all_cpu(&attr_hw_cache_branch_miss);
281 test_perf_event_task(&attr_hw_cache_branch_miss);
282
283 printf("Test Instruction Retired\n");
284 test_perf_event_all_cpu(&attr_type_raw);
285 test_perf_event_task(&attr_type_raw);
286
287 printf("Test Lock Load\n");
288 test_perf_event_all_cpu(&attr_type_raw_lock_load);
289 test_perf_event_task(&attr_type_raw_lock_load);
290
291 printf("*** PASS ***\n");
292}
293
294
295int main(int argc, char **argv)
296{
297 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
298 struct bpf_object *obj = NULL;
299 char filename[256];
300 int error = 1;
301
302 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
303 setrlimit(RLIMIT_MEMLOCK, &r);
304
305 signal(SIGINT, err_exit);
306 signal(SIGTERM, err_exit);
307
308 if (load_kallsyms()) {
309 printf("failed to process /proc/kallsyms\n");
310 goto cleanup;
311 }
312
313 obj = bpf_object__open_file(filename, NULL);
314 if (libbpf_get_error(obj)) {
315 printf("opening BPF object file failed\n");
316 obj = NULL;
317 goto cleanup;
318 }
319
320 prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
321 if (!prog) {
322 printf("finding a prog in obj file failed\n");
323 goto cleanup;
324 }
325
326 /* load BPF program */
327 if (bpf_object__load(obj)) {
328 printf("loading BPF object file failed\n");
329 goto cleanup;
330 }
331
332 map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counts");
333 map_fd[1] = bpf_object__find_map_fd_by_name(obj, "stackmap");
334 if (map_fd[0] < 0 || map_fd[1] < 0) {
335 printf("finding a counts/stackmap map in obj file failed\n");
336 goto cleanup;
337 }
338
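 /* the child drains trace_pipe so bpf_trace_printk() output from the
  * kernel program is printed while the parent runs the tests
  */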
339 pid = fork();
340 if (pid == 0) {
341 read_trace_pipe();
342 return 0;
343 } else if (pid == -1) {
344 printf("couldn't spawn process\n");
345 goto cleanup;
346 }
347
348 test_bpf_perf_event();
349 error = 0;
350
351cleanup:
352 bpf_object__close(obj);
353 err_exit(error);
354}
diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output_kern.c
new file mode 100644
index 000000000..b64815af0
--- /dev/null
+++ b/samples/bpf/trace_output_kern.c
@@ -0,0 +1,31 @@
1#include <linux/ptrace.h>
2#include <linux/version.h>
3#include <uapi/linux/bpf.h>
4#include <bpf/bpf_helpers.h>
5#include "trace_common.h"
6
7struct {
8 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
9 __uint(key_size, sizeof(int));
10 __uint(value_size, sizeof(u32));
11 __uint(max_entries, 2);
12} my_map SEC(".maps");
13
14SEC("kprobe/" SYSCALL(sys_write))
15int bpf_prog1(struct pt_regs *ctx)
16{
17 struct S {
18 u64 pid;
19 u64 cookie;
20 } data;
21
22 data.pid = bpf_get_current_pid_tgid();
23 data.cookie = 0x12345678;
24
25 bpf_perf_event_output(ctx, &my_map, 0, &data, sizeof(data));
26
27 return 0;
28}
29
30char _license[] SEC("license") = "GPL";
31u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
new file mode 100644
index 000000000..364b98764
--- /dev/null
+++ b/samples/bpf/trace_output_user.c
@@ -0,0 +1,107 @@
1// SPDX-License-Identifier: GPL-2.0-only
2#include <stdio.h>
3#include <fcntl.h>
4#include <poll.h>
5#include <time.h>
6#include <signal.h>
7#include <bpf/libbpf.h>
8
9static __u64 time_get_ns(void)
10{
11 struct timespec ts;
12
13 clock_gettime(CLOCK_MONOTONIC, &ts);
14 return ts.tv_sec * 1000000000ull + ts.tv_nsec;
15}
16
17static __u64 start_time;
18static __u64 cnt;
19
20#define MAX_CNT 100000ll
21
22static void print_bpf_output(void *ctx, int cpu, void *data, __u32 size)
23{
24 struct {
25 __u64 pid;
26 __u64 cookie;
27 } *e = data;
28
29 if (e->cookie != 0x12345678) {
30 printf("BUG pid %llx cookie %llx sized %d\n",
31 e->pid, e->cookie, size);
32 return;
33 }
34
35 cnt++;
36
37 if (cnt == MAX_CNT) {
38 printf("recv %lld events per sec\n",
39 MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
40 return;
41 }
42}
43
44int main(int argc, char **argv)
45{
46 struct perf_buffer_opts pb_opts = {};
47 struct bpf_link *link = NULL;
48 struct bpf_program *prog;
49 struct perf_buffer *pb;
50 struct bpf_object *obj;
51 int map_fd, ret = 0;
52 char filename[256];
53 FILE *f;
54
55 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
56 obj = bpf_object__open_file(filename, NULL);
57 if (libbpf_get_error(obj)) {
58 fprintf(stderr, "ERROR: opening BPF object file failed\n");
59 return 0;
60 }
61
62 /* load BPF program */
63 if (bpf_object__load(obj)) {
64 fprintf(stderr, "ERROR: loading BPF object file failed\n");
65 goto cleanup;
66 }
67
68 map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
69 if (map_fd < 0) {
70 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
71 goto cleanup;
72 }
73
74 prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
75 if (!prog) {
76 fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
77 goto cleanup;
78 }
79
80 link = bpf_program__attach(prog);
81 if (libbpf_get_error(link)) {
82 fprintf(stderr, "ERROR: bpf_program__attach failed\n");
83 link = NULL;
84 goto cleanup;
85 }
86
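 /* 8 pages of perf ring buffer per CPU; print_bpf_output() is invoked
  * for every record the kernel side emits via bpf_perf_event_output()
  */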
87 pb_opts.sample_cb = print_bpf_output;
88 pb = perf_buffer__new(map_fd, 8, &pb_opts);
89 ret = libbpf_get_error(pb);
90 if (ret) {
91 printf("failed to setup perf_buffer: %d\n", ret);
92 return 1;
93 }
94
95 f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r");
96 (void) f;
97
98 start_time = time_get_ns();
99 while ((ret = perf_buffer__poll(pb, 1000)) >= 0 && cnt < MAX_CNT) {
100 }
101 kill(0, SIGINT);
102
103cleanup:
104 bpf_link__destroy(link);
105 bpf_object__close(obj);
106 return ret;
107}
diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
new file mode 100644
index 000000000..ef30d2b35
--- /dev/null
+++ b/samples/bpf/tracex1_kern.c
@@ -0,0 +1,54 @@
1/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/skbuff.h>
8#include <linux/netdevice.h>
9#include <uapi/linux/bpf.h>
10#include <linux/version.h>
11#include <bpf/bpf_helpers.h>
12#include <bpf/bpf_tracing.h>
13
14#define _(P) \
15 ({ \
16 typeof(P) val = 0; \
17 bpf_probe_read_kernel(&val, sizeof(val), &(P)); \
18 val; \
19 })
20
21/* kprobe is NOT a stable ABI
22 * kernel functions can be removed, renamed or completely change semantics.
23 * Number of arguments and their positions can change, etc.
24 * In such case this bpf+kprobe example will no longer be meaningful
25 */
26SEC("kprobe/__netif_receive_skb_core")
27int bpf_prog1(struct pt_regs *ctx)
28{
29 /* attaches to kprobe __netif_receive_skb_core,
30 * looks for packets on the loopback device and prints them
31 */
32 char devname[IFNAMSIZ];
33 struct net_device *dev;
34 struct sk_buff *skb;
35 int len;
36
37 /* non-portable! works for the given kernel only */
38 bpf_probe_read_kernel(&skb, sizeof(skb), (void *)PT_REGS_PARM1(ctx));
39 dev = _(skb->dev);
40 len = _(skb->len);
41
42 bpf_probe_read_kernel(devname, sizeof(devname), dev->name);
43
44 if (devname[0] == 'l' && devname[1] == 'o') {
45 char fmt[] = "skb %p len %d\n";
46 /* using bpf_trace_printk() for DEBUG ONLY */
47 bpf_trace_printk(fmt, sizeof(fmt), skb, len);
48 }
49
50 return 0;
51}
52
53char _license[] SEC("license") = "GPL";
54u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000..9d4adb7fd
--- /dev/null
+++ b/samples/bpf/tracex1_user.c
@@ -0,0 +1,50 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <stdio.h>
3#include <unistd.h>
4#include <bpf/libbpf.h>
5#include "trace_helpers.h"
6
7int main(int ac, char **argv)
8{
9 struct bpf_link *link = NULL;
10 struct bpf_program *prog;
11 struct bpf_object *obj;
12 char filename[256];
13 FILE *f;
14
15 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
16 obj = bpf_object__open_file(filename, NULL);
17 if (libbpf_get_error(obj)) {
18 fprintf(stderr, "ERROR: opening BPF object file failed\n");
19 return 0;
20 }
21
22 prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
23 if (!prog) {
24 fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
25 goto cleanup;
26 }
27
28 /* load BPF program */
29 if (bpf_object__load(obj)) {
30 fprintf(stderr, "ERROR: loading BPF object file failed\n");
31 goto cleanup;
32 }
33
34 link = bpf_program__attach(prog);
35 if (libbpf_get_error(link)) {
36 fprintf(stderr, "ERROR: bpf_program__attach failed\n");
37 link = NULL;
38 goto cleanup;
39 }
40
41 f = popen("taskset 1 ping -c5 localhost", "r");
42 (void) f;
43
44 read_trace_pipe();
45
46cleanup:
47 bpf_link__destroy(link);
48 bpf_object__close(obj);
49 return 0;
50}
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
new file mode 100644
index 000000000..5bc696bac
--- /dev/null
+++ b/samples/bpf/tracex2_kern.c
@@ -0,0 +1,102 @@
1/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/skbuff.h>
8#include <linux/netdevice.h>
9#include <linux/version.h>
10#include <uapi/linux/bpf.h>
11#include <bpf/bpf_helpers.h>
12#include <bpf/bpf_tracing.h>
13#include "trace_common.h"
14
15struct {
16 __uint(type, BPF_MAP_TYPE_HASH);
17 __type(key, long);
18 __type(value, long);
19 __uint(max_entries, 1024);
20} my_map SEC(".maps");
21
22/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
23 * example will no longer be meaningful
24 */
25SEC("kprobe/kfree_skb")
26int bpf_prog2(struct pt_regs *ctx)
27{
28 long loc = 0;
29 long init_val = 1;
30 long *value;
31
32 /* read ip of kfree_skb caller.
33 * non-portable version of __builtin_return_address(0)
34 */
35 BPF_KPROBE_READ_RET_IP(loc, ctx);
36
37 value = bpf_map_lookup_elem(&my_map, &loc);
38 if (value)
39 *value += 1;
40 else
41 bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY);
42 return 0;
43}
44
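/* branchless integer log2: each step tests the top half of the remaining
 * bits and folds the shift amount into the result
 */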
45static unsigned int log2(unsigned int v)
46{
47 unsigned int r;
48 unsigned int shift;
49
50 r = (v > 0xFFFF) << 4; v >>= r;
51 shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
52 shift = (v > 0xF) << 2; v >>= shift; r |= shift;
53 shift = (v > 0x3) << 1; v >>= shift; r |= shift;
54 r |= (v >> 1);
55 return r;
56}
57
58static unsigned int log2l(unsigned long v)
59{
60 unsigned int hi = v >> 32;
61 if (hi)
62 return log2(hi) + 32;
63 else
64 return log2(v);
65}
66
67struct hist_key {
68 char comm[16];
69 u64 pid_tgid;
70 u64 uid_gid;
71 u64 index;
72};
73
74struct {
75 __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
76 __uint(key_size, sizeof(struct hist_key));
77 __uint(value_size, sizeof(long));
78 __uint(max_entries, 1024);
79} my_hist_map SEC(".maps");
80
81SEC("kprobe/" SYSCALL(sys_write))
82int bpf_prog3(struct pt_regs *ctx)
83{
84 long write_size = PT_REGS_PARM3(ctx);
85 long init_val = 1;
86 long *value;
87 struct hist_key key;
88
89 key.index = log2l(write_size);
90 key.pid_tgid = bpf_get_current_pid_tgid();
91 key.uid_gid = bpf_get_current_uid_gid();
92 bpf_get_current_comm(&key.comm, sizeof(key.comm));
93
94 value = bpf_map_lookup_elem(&my_hist_map, &key);
95 if (value)
96 __sync_fetch_and_add(value, 1);
97 else
98 bpf_map_update_elem(&my_hist_map, &key, &init_val, BPF_ANY);
99 return 0;
100}
101char _license[] SEC("license") = "GPL";
102u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
new file mode 100644
index 000000000..3d6eab711
--- /dev/null
+++ b/samples/bpf/tracex2_user.c
@@ -0,0 +1,193 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <stdio.h>
3#include <unistd.h>
4#include <stdlib.h>
5#include <signal.h>
6#include <string.h>
7#include <sys/resource.h>
8
9#include <bpf/bpf.h>
10#include <bpf/libbpf.h>
11#include "bpf_util.h"
12
13#define MAX_INDEX 64
14#define MAX_STARS 38
15
16/* my_map, my_hist_map */
17static int map_fd[2];
18
19static void stars(char *str, long val, long max, int width)
20{
21 int i;
22
23 for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
24 str[i] = '*';
25 if (val > max)
26 str[i - 1] = '+';
27 str[i] = '\0';
28}
29
30struct task {
31 char comm[16];
32 __u64 pid_tgid;
33 __u64 uid_gid;
34};
35
36struct hist_key {
37 struct task t;
38 __u32 index;
39};
40
41#define SIZE sizeof(struct task)
42
43static void print_hist_for_pid(int fd, void *task)
44{
45 unsigned int nr_cpus = bpf_num_possible_cpus();
46 struct hist_key key = {}, next_key;
47 long values[nr_cpus];
48 char starstr[MAX_STARS];
49 long value;
50 long data[MAX_INDEX] = {};
51 int max_ind = -1;
52 long max_value = 0;
53 int i, ind;
54
55 while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
56 if (memcmp(&next_key, task, SIZE)) {
57 key = next_key;
58 continue;
59 }
60 bpf_map_lookup_elem(fd, &next_key, values);
61 value = 0;
62 for (i = 0; i < nr_cpus; i++)
63 value += values[i];
64 ind = next_key.index;
65 data[ind] = value;
66 if (value && ind > max_ind)
67 max_ind = ind;
68 if (value > max_value)
69 max_value = value;
70 key = next_key;
71 }
72
73 printf(" syscall write() stats\n");
74 printf(" byte_size : count distribution\n");
75 for (i = 1; i <= max_ind + 1; i++) {
76 stars(starstr, data[i - 1], max_value, MAX_STARS);
77 printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
78 (1l << i) >> 1, (1l << i) - 1, data[i - 1],
79 MAX_STARS, starstr);
80 }
81}
82
83static void print_hist(int fd)
84{
85 struct hist_key key = {}, next_key;
86 static struct task tasks[1024];
87 int task_cnt = 0;
88 int i;
89
90 while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
91 int found = 0;
92
93 for (i = 0; i < task_cnt; i++)
94 if (memcmp(&tasks[i], &next_key, SIZE) == 0)
95 found = 1;
96 if (!found)
97 memcpy(&tasks[task_cnt++], &next_key, SIZE);
98 key = next_key;
99 }
100
101 for (i = 0; i < task_cnt; i++) {
102 printf("\npid %d cmd %s uid %d\n",
103 (__u32) tasks[i].pid_tgid,
104 tasks[i].comm,
105 (__u32) tasks[i].uid_gid);
106 print_hist_for_pid(fd, &tasks[i]);
107 }
108
109}
110
111static void int_exit(int sig)
112{
113 print_hist(map_fd[1]);
114 exit(0);
115}
116
117int main(int ac, char **argv)
118{
119 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
120 long key, next_key, value;
121 struct bpf_link *links[2];
122 struct bpf_program *prog;
123 struct bpf_object *obj;
124 char filename[256];
125 int i, j = 0;
126 FILE *f;
127
128 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
129 perror("setrlimit(RLIMIT_MEMLOCK)");
130 return 1;
131 }
132
133 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
134 obj = bpf_object__open_file(filename, NULL);
135 if (libbpf_get_error(obj)) {
136 fprintf(stderr, "ERROR: opening BPF object file failed\n");
137 return 0;
138 }
139
140 /* load BPF program */
141 if (bpf_object__load(obj)) {
142 fprintf(stderr, "ERROR: loading BPF object file failed\n");
143 goto cleanup;
144 }
145
146 map_fd[0] = bpf_object__find_map_fd_by_name(obj, "my_map");
147 map_fd[1] = bpf_object__find_map_fd_by_name(obj, "my_hist_map");
148 if (map_fd[0] < 0 || map_fd[1] < 0) {
149 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
150 goto cleanup;
151 }
152
153 signal(SIGINT, int_exit);
154 signal(SIGTERM, int_exit);
155
156 /* start 'ping' in the background to have some kfree_skb events */
157 f = popen("ping -4 -c5 localhost", "r");
158 (void) f;
159
160 /* start 'dd' in the background to have plenty of 'write' syscalls */
161 f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
162 (void) f;
163
164 bpf_object__for_each_program(prog, obj) {
165 links[j] = bpf_program__attach(prog);
166 if (libbpf_get_error(links[j])) {
167 fprintf(stderr, "ERROR: bpf_program__attach failed\n");
168 links[j] = NULL;
169 goto cleanup;
170 }
171 j++;
172 }
173
174 for (i = 0; i < 5; i++) {
175 key = 0;
176 while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) {
177 bpf_map_lookup_elem(map_fd[0], &next_key, &value);
178 printf("location 0x%lx count %ld\n", next_key, value);
179 key = next_key;
180 }
181 if (key)
182 printf("\n");
183 sleep(1);
184 }
185 print_hist(map_fd[1]);
186
187cleanup:
188 for (j--; j >= 0; j--)
189 bpf_link__destroy(links[j]);
190
191 bpf_object__close(obj);
192 return 0;
193}
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
new file mode 100644
index 000000000..710a4410b
--- /dev/null
+++ b/samples/bpf/tracex3_kern.c
@@ -0,0 +1,90 @@
1/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/skbuff.h>
8#include <linux/netdevice.h>
9#include <linux/version.h>
10#include <uapi/linux/bpf.h>
11#include <bpf/bpf_helpers.h>
12#include <bpf/bpf_tracing.h>
13
14struct {
15 __uint(type, BPF_MAP_TYPE_HASH);
16 __type(key, long);
17 __type(value, u64);
18 __uint(max_entries, 4096);
19} my_map SEC(".maps");
20
21/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
22 * example will no longer be meaningful
23 */
24SEC("kprobe/blk_mq_start_request")
25int bpf_prog1(struct pt_regs *ctx)
26{
27 long rq = PT_REGS_PARM1(ctx);
28 u64 val = bpf_ktime_get_ns();
29
30 bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY);
31 return 0;
32}
33
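/* integer log2 by binary search over the bit width (yields -1, wrapping
 * to UINT_MAX, for n == 0)
 */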
34static unsigned int log2l(unsigned long long n)
35{
36#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
37 int i = -(n == 0);
38 S(32); S(16); S(8); S(4); S(2); S(1);
39 return i;
40#undef S
41}
42
43#define SLOTS 100
44
45struct {
46 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
47 __uint(key_size, sizeof(u32));
48 __uint(value_size, sizeof(u64));
49 __uint(max_entries, SLOTS);
50} lat_map SEC(".maps");
51
52SEC("kprobe/blk_account_io_done")
53int bpf_prog2(struct pt_regs *ctx)
54{
55 long rq = PT_REGS_PARM1(ctx);
56 u64 *value, l, base;
57 u32 index;
58
59 value = bpf_map_lookup_elem(&my_map, &rq);
60 if (!value)
61 return 0;
62
63 u64 cur_time = bpf_ktime_get_ns();
64 u64 delta = cur_time - *value;
65
66 bpf_map_delete_elem(&my_map, &rq);
67
68 /* the lines below are computing index = log10(delta)*10
69 * using integer arithmetic
70 * index = 29 ~ 1 usec
71 * index = 59 ~ 1 msec
72 * index = 89 ~ 1 sec
73 * index = 99 ~ 10sec or more
74 * log10(x)*10 = log2(x)*10/log2(10) = log2(x)*3
75 */
76 l = log2l(delta);
77 base = 1ll << l;
78 index = (l * 64 + (delta - base) * 64 / base) * 3 / 64;
79
80 if (index >= SLOTS)
81 index = SLOTS - 1;
82
83 value = bpf_map_lookup_elem(&lat_map, &index);
84 if (value)
85 *value += 1;
86
87 return 0;
88}
89char _license[] SEC("license") = "GPL";
90u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
new file mode 100644
index 000000000..83e0fecbb
--- /dev/null
+++ b/samples/bpf/tracex3_user.c
@@ -0,0 +1,190 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
3 */
4#include <stdio.h>
5#include <stdlib.h>
6#include <signal.h>
7#include <unistd.h>
8#include <stdbool.h>
9#include <string.h>
10#include <sys/resource.h>
11
12#include <bpf/bpf.h>
13#include <bpf/libbpf.h>
14#include "bpf_util.h"
15
16#define SLOTS 100
17
18static void clear_stats(int fd)
19{
20 unsigned int nr_cpus = bpf_num_possible_cpus();
21 __u64 values[nr_cpus];
22 __u32 key;
23
24 memset(values, 0, sizeof(values));
25 for (key = 0; key < SLOTS; key++)
26 bpf_map_update_elem(fd, &key, values, BPF_ANY);
27}
28
29const char *color[] = {
30 "\033[48;5;255m",
31 "\033[48;5;252m",
32 "\033[48;5;250m",
33 "\033[48;5;248m",
34 "\033[48;5;246m",
35 "\033[48;5;244m",
36 "\033[48;5;242m",
37 "\033[48;5;240m",
38 "\033[48;5;238m",
39 "\033[48;5;236m",
40 "\033[48;5;234m",
41 "\033[48;5;232m",
42};
43const int num_colors = ARRAY_SIZE(color);
44
45const char nocolor[] = "\033[00m";
46
47const char *sym[] = {
48 " ",
49 " ",
50 ".",
51 ".",
52 "*",
53 "*",
54 "o",
55 "o",
56 "O",
57 "O",
58 "#",
59 "#",
60};
61
62bool full_range = false;
63bool text_only = false;
64
65static void print_banner(void)
66{
67 if (full_range)
68 printf("|1ns |10ns |100ns |1us |10us |100us"
69 " |1ms |10ms |100ms |1s |10s\n");
70 else
71 printf("|1us |10us |100us |1ms |10ms "
72 "|100ms |1s |10s\n");
73}
74
75static void print_hist(int fd)
76{
77 unsigned int nr_cpus = bpf_num_possible_cpus();
78 __u64 total_events = 0;
79 long values[nr_cpus];
80 __u64 max_cnt = 0;
81 __u64 cnt[SLOTS];
82 __u64 value;
83 __u32 key;
84 int i;
85
86 for (key = 0; key < SLOTS; key++) {
87 bpf_map_lookup_elem(fd, &key, values);
88 value = 0;
89 for (i = 0; i < nr_cpus; i++)
90 value += values[i];
91 cnt[key] = value;
92 total_events += value;
93 if (value > max_cnt)
94 max_cnt = value;
95 }
96 clear_stats(fd);
97 for (key = full_range ? 0 : 29; key < SLOTS; key++) {
98 int c = num_colors * cnt[key] / (max_cnt + 1);
99
100 if (text_only)
101 printf("%s", sym[c]);
102 else
103 printf("%s %s", color[c], nocolor);
104 }
105 printf(" # %lld\n", total_events);
106}
107
108int main(int ac, char **argv)
109{
110 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
111 struct bpf_link *links[2];
112 struct bpf_program *prog;
113 struct bpf_object *obj;
114 char filename[256];
115 int map_fd, i, j = 0;
116
117 for (i = 1; i < ac; i++) {
118 if (strcmp(argv[i], "-a") == 0) {
119 full_range = true;
120 } else if (strcmp(argv[i], "-t") == 0) {
121 text_only = true;
122 } else if (strcmp(argv[i], "-h") == 0) {
123 printf("Usage:\n"
124 " -a display wider latency range\n"
125 " -t text only\n");
126 return 1;
127 }
128 }
129
130 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
131 perror("setrlimit(RLIMIT_MEMLOCK)");
132 return 1;
133 }
134
135 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
136 obj = bpf_object__open_file(filename, NULL);
137 if (libbpf_get_error(obj)) {
138 fprintf(stderr, "ERROR: opening BPF object file failed\n");
139 return 0;
140 }
141
142 /* load BPF program */
143 if (bpf_object__load(obj)) {
144 fprintf(stderr, "ERROR: loading BPF object file failed\n");
145 goto cleanup;
146 }
147
148 map_fd = bpf_object__find_map_fd_by_name(obj, "lat_map");
149 if (map_fd < 0) {
150 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
151 goto cleanup;
152 }
153
154 bpf_object__for_each_program(prog, obj) {
155 links[j] = bpf_program__attach(prog);
156 if (libbpf_get_error(links[j])) {
157 fprintf(stderr, "ERROR: bpf_program__attach failed\n");
158 links[j] = NULL;
159 goto cleanup;
160 }
161 j++;
162 }
163
164 printf(" heatmap of IO latency\n");
165 if (text_only)
166 printf(" %s", sym[num_colors - 1]);
167 else
168 printf(" %s %s", color[num_colors - 1], nocolor);
169 printf(" - many events with this latency\n");
170
171 if (text_only)
172 printf(" %s", sym[0]);
173 else
174 printf(" %s %s", color[0], nocolor);
175 printf(" - few events\n");
176
177 for (i = 0; ; i++) {
178 if (i % 20 == 0)
179 print_banner();
180 print_hist(map_fd);
181 sleep(2);
182 }
183
184cleanup:
185 for (j--; j >= 0; j--)
186 bpf_link__destroy(links[j]);
187
188 bpf_object__close(obj);
189 return 0;
190}
diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c
new file mode 100644
index 000000000..eb0f8fdd1
--- /dev/null
+++ b/samples/bpf/tracex4_kern.c
@@ -0,0 +1,55 @@
1/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/ptrace.h>
8#include <linux/version.h>
9#include <uapi/linux/bpf.h>
10#include <bpf/bpf_helpers.h>
11#include <bpf/bpf_tracing.h>
12
13struct pair {
14 u64 val;
15 u64 ip;
16};
17
18struct {
19 __uint(type, BPF_MAP_TYPE_HASH);
20 __type(key, long);
21 __type(value, struct pair);
22 __uint(max_entries, 1000000);
23} my_map SEC(".maps");
24
25/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
26 * example will no longer be meaningful
27 */
28SEC("kprobe/kmem_cache_free")
29int bpf_prog1(struct pt_regs *ctx)
30{
31 long ptr = PT_REGS_PARM2(ctx);
32
33 bpf_map_delete_elem(&my_map, &ptr);
34 return 0;
35}
36
37SEC("kretprobe/kmem_cache_alloc_node")
38int bpf_prog2(struct pt_regs *ctx)
39{
40 long ptr = PT_REGS_RC(ctx);
41 long ip = 0;
42
43 /* get ip address of kmem_cache_alloc_node() caller */
44 BPF_KRETPROBE_READ_RET_IP(ip, ctx);
45
46 struct pair v = {
47 .val = bpf_ktime_get_ns(),
48 .ip = ip,
49 };
50
51 bpf_map_update_elem(&my_map, &ptr, &v, BPF_ANY);
52 return 0;
53}
54char _license[] SEC("license") = "GPL";
55u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
new file mode 100644
index 000000000..e8faf8f18
--- /dev/null
+++ b/samples/bpf/tracex4_user.c
@@ -0,0 +1,103 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
3 */
4#include <stdio.h>
5#include <stdlib.h>
6#include <signal.h>
7#include <unistd.h>
8#include <stdbool.h>
9#include <string.h>
10#include <time.h>
11#include <sys/resource.h>
12
13#include <bpf/bpf.h>
14#include <bpf/libbpf.h>
15
16struct pair {
17 long long val;
18 __u64 ip;
19};
20
21static __u64 time_get_ns(void)
22{
23 struct timespec ts;
24
25 clock_gettime(CLOCK_MONOTONIC, &ts);
26 return ts.tv_sec * 1000000000ull + ts.tv_nsec;
27}
28
29static void print_old_objects(int fd)
30{
31 long long val = time_get_ns();
32 __u64 key, next_key;
33 struct pair v;
34
35 key = write(1, "\e[1;1H\e[2J", 10); /* clear screen */
36
37 key = -1;
38 while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
39 bpf_map_lookup_elem(fd, &next_key, &v);
40 key = next_key;
41 if (val - v.val < 1000000000ll)
42 /* skip objects allocated less than 1 sec ago */
43 continue;
44 printf("obj 0x%llx is %2lldsec old was allocated at ip %llx\n",
45 next_key, (val - v.val) / 1000000000ll, v.ip);
46 }
47}
48
49int main(int ac, char **argv)
50{
51 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
52 struct bpf_link *links[2];
53 struct bpf_program *prog;
54 struct bpf_object *obj;
55 char filename[256];
56 int map_fd, i, j = 0;
57
58 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
59 perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
60 return 1;
61 }
62
63 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
64 obj = bpf_object__open_file(filename, NULL);
65 if (libbpf_get_error(obj)) {
66 fprintf(stderr, "ERROR: opening BPF object file failed\n");
67 return 0;
68 }
69
70 /* load BPF program */
71 if (bpf_object__load(obj)) {
72 fprintf(stderr, "ERROR: loading BPF object file failed\n");
73 goto cleanup;
74 }
75
76 map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
77 if (map_fd < 0) {
78 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
79 goto cleanup;
80 }
81
82 bpf_object__for_each_program(prog, obj) {
83 links[j] = bpf_program__attach(prog);
84 if (libbpf_get_error(links[j])) {
85 fprintf(stderr, "ERROR: bpf_program__attach failed\n");
86 links[j] = NULL;
87 goto cleanup;
88 }
89 j++;
90 }
91
92 for (i = 0; ; i++) {
93 print_old_objects(map_fd);
94 sleep(1);
95 }
96
97cleanup:
98 for (j--; j >= 0; j--)
99 bpf_link__destroy(links[j]);
100
101 bpf_object__close(obj);
102 return 0;
103}
diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c
new file mode 100644
index 000000000..64a1f7550
--- /dev/null
+++ b/samples/bpf/tracex5_kern.c
@@ -0,0 +1,93 @@
1/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/ptrace.h>
8#include <linux/version.h>
9#include <uapi/linux/bpf.h>
10#include <uapi/linux/seccomp.h>
11#include <uapi/linux/unistd.h>
12#include "syscall_nrs.h"
13#include <bpf/bpf_helpers.h>
14#include <bpf/bpf_tracing.h>
15
16#define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F
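/* PROG(SYS__NR_xxx) expands to a program in ELF section "kprobe/<nr>";
 * the loader parses the syscall number back out of the section name and
 * installs the program fd at that index in 'progs', which is what lets
 * bpf_prog1() tail-call directly on the syscall number.
 */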
17
18struct {
19 __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
20 __uint(key_size, sizeof(u32));
21 __uint(value_size, sizeof(u32));
22#ifdef __mips__
23 __uint(max_entries, 6000); /* MIPS n64 syscalls start at 5000 */
24#else
25 __uint(max_entries, 1024);
26#endif
27} progs SEC(".maps");
28
29SEC("kprobe/__seccomp_filter")
30int bpf_prog1(struct pt_regs *ctx)
31{
32 int sc_nr = (int)PT_REGS_PARM1(ctx);
33
34 /* dispatch into next BPF program depending on syscall number */
35 bpf_tail_call(ctx, &progs, sc_nr);
36
37 /* fall through -> unknown syscall */
38 if (sc_nr >= __NR_getuid && sc_nr <= __NR_getsid) {
39 char fmt[] = "syscall=%d (one of get/set uid/pid/gid)\n";
40 bpf_trace_printk(fmt, sizeof(fmt), sc_nr);
41 }
42 return 0;
43}
44
45/* we jump here when syscall number == __NR_write */
46PROG(SYS__NR_write)(struct pt_regs *ctx)
47{
48 struct seccomp_data sd;
49
50 bpf_probe_read_kernel(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx));
51 if (sd.args[2] == 512) {
52 char fmt[] = "write(fd=%d, buf=%p, size=%d)\n";
53 bpf_trace_printk(fmt, sizeof(fmt),
54 sd.args[0], sd.args[1], sd.args[2]);
55 }
56 return 0;
57}
58
59PROG(SYS__NR_read)(struct pt_regs *ctx)
60{
61 struct seccomp_data sd;
62
63 bpf_probe_read_kernel(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx));
64 if (sd.args[2] > 128 && sd.args[2] <= 1024) {
65 char fmt[] = "read(fd=%d, buf=%p, size=%d)\n";
66 bpf_trace_printk(fmt, sizeof(fmt),
67 sd.args[0], sd.args[1], sd.args[2]);
68 }
69 return 0;
70}
71
72#ifdef __NR_mmap2
73PROG(SYS__NR_mmap2)(struct pt_regs *ctx)
74{
75 char fmt[] = "mmap2\n";
76
77 bpf_trace_printk(fmt, sizeof(fmt));
78 return 0;
79}
80#endif
81
82#ifdef __NR_mmap
83PROG(SYS__NR_mmap)(struct pt_regs *ctx)
84{
85 char fmt[] = "mmap\n";
86
87 bpf_trace_printk(fmt, sizeof(fmt));
88 return 0;
89}
90#endif
91
92char _license[] SEC("license") = "GPL";
93u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c
new file mode 100644
index 000000000..c17d3fb5f
--- /dev/null
+++ b/samples/bpf/tracex5_user.c
@@ -0,0 +1,101 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <stdio.h>
3#include <stdlib.h>
4#include <unistd.h>
5#include <linux/filter.h>
6#include <linux/seccomp.h>
7#include <sys/prctl.h>
8#include <bpf/bpf.h>
9#include <bpf/libbpf.h>
10#include <sys/resource.h>
11#include "trace_helpers.h"
12
13#ifdef __mips__
14#define MAX_ENTRIES 6000 /* MIPS n64 syscalls start at 5000 */
15#else
16#define MAX_ENTRIES 1024
17#endif
18
19/* install fake seccomp program to enable seccomp code path inside the kernel,
20 * so that our kprobe attached to __seccomp_filter() can be triggered
21 */
22static void install_accept_all_seccomp(void)
23{
24 struct sock_filter filter[] = {
25 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
26 };
27 struct sock_fprog prog = {
28 .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
29 .filter = filter,
30 };
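 /* 2 == SECCOMP_MODE_FILTER */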
31 if (prctl(PR_SET_SECCOMP, 2, &prog))
32 perror("prctl");
33}
34
35int main(int ac, char **argv)
36{
37 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
38 struct bpf_link *link = NULL;
39 struct bpf_program *prog;
40 struct bpf_object *obj;
41 int key, fd, progs_fd;
42 const char *section;
43 char filename[256];
44 FILE *f;
45
46 setrlimit(RLIMIT_MEMLOCK, &r);
47
48 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
49 obj = bpf_object__open_file(filename, NULL);
50 if (libbpf_get_error(obj)) {
51 fprintf(stderr, "ERROR: opening BPF object file failed\n");
52 return 0;
53 }
54
55 prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
56 if (!prog) {
57 printf("finding a prog in obj file failed\n");
58 goto cleanup;
59 }
60
61 /* load BPF program */
62 if (bpf_object__load(obj)) {
63 fprintf(stderr, "ERROR: loading BPF object file failed\n");
64 goto cleanup;
65 }
66
67 link = bpf_program__attach(prog);
68 if (libbpf_get_error(link)) {
69 fprintf(stderr, "ERROR: bpf_program__attach failed\n");
70 link = NULL;
71 goto cleanup;
72 }
73
74 progs_fd = bpf_object__find_map_fd_by_name(obj, "progs");
75 if (progs_fd < 0) {
76 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
77 goto cleanup;
78 }
79
80 bpf_object__for_each_program(prog, obj) {
81 section = bpf_program__section_name(prog);
82 /* register only syscalls to PROG_ARRAY */
83 if (sscanf(section, "kprobe/%d", &key) != 1)
84 continue;
85
86 fd = bpf_program__fd(prog);
87 bpf_map_update_elem(progs_fd, &key, &fd, BPF_ANY);
88 }
89
90 install_accept_all_seccomp();
91
92 f = popen("dd if=/dev/zero of=/dev/null count=5", "r");
93 (void) f;
94
95 read_trace_pipe();
96
97cleanup:
98 bpf_link__destroy(link);
99 bpf_object__close(obj);
100 return 0;
101}
diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c
new file mode 100644
index 000000000..acad5712d
--- /dev/null
+++ b/samples/bpf/tracex6_kern.c
@@ -0,0 +1,69 @@
1#include <linux/ptrace.h>
2#include <linux/version.h>
3#include <uapi/linux/bpf.h>
4#include <bpf/bpf_helpers.h>
5
6struct {
7 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
8 __uint(key_size, sizeof(int));
9 __uint(value_size, sizeof(u32));
10 __uint(max_entries, 64);
11} counters SEC(".maps");
12
13struct {
14 __uint(type, BPF_MAP_TYPE_HASH);
15 __type(key, int);
16 __type(value, u64);
17 __uint(max_entries, 64);
18} values SEC(".maps");
19
20struct {
21 __uint(type, BPF_MAP_TYPE_HASH);
22 __type(key, int);
23 __type(value, struct bpf_perf_event_value);
24 __uint(max_entries, 64);
25} values2 SEC(".maps");
26
27SEC("kprobe/htab_map_get_next_key")
28int bpf_prog1(struct pt_regs *ctx)
29{
30 u32 key = bpf_get_smp_processor_id();
31 u64 count, *val;
32 s64 error;
33
34 count = bpf_perf_event_read(&counters, key);
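 /* bpf_perf_event_read() returns the counter value, or a negative
  * errno cast to u64 on failure; -2..-22 covers ENOENT..EINVAL
  */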
35 error = (s64)count;
36 if (error <= -2 && error >= -22)
37 return 0;
38
39 val = bpf_map_lookup_elem(&values, &key);
40 if (val)
41 *val = count;
42 else
43 bpf_map_update_elem(&values, &key, &count, BPF_NOEXIST);
44
45 return 0;
46}
47
48SEC("kprobe/htab_map_lookup_elem")
49int bpf_prog2(struct pt_regs *ctx)
50{
51 u32 key = bpf_get_smp_processor_id();
52 struct bpf_perf_event_value *val, buf;
53 int error;
54
55 error = bpf_perf_event_read_value(&counters, key, &buf, sizeof(buf));
56 if (error)
57 return 0;
58
59 val = bpf_map_lookup_elem(&values2, &key);
60 if (val)
61 *val = buf;
62 else
63 bpf_map_update_elem(&values2, &key, &buf, BPF_NOEXIST);
64
65 return 0;
66}
67
68char _license[] SEC("license") = "GPL";
69u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
new file mode 100644
index 000000000..33df97847
--- /dev/null
+++ b/samples/bpf/tracex6_user.c
@@ -0,0 +1,226 @@
1// SPDX-License-Identifier: GPL-2.0
2#define _GNU_SOURCE
3
4#include <assert.h>
5#include <fcntl.h>
6#include <linux/perf_event.h>
7#include <sched.h>
8#include <stdio.h>
9#include <stdlib.h>
10#include <sys/ioctl.h>
11#include <sys/resource.h>
12#include <sys/time.h>
13#include <sys/types.h>
14#include <sys/wait.h>
15#include <unistd.h>
16
17#include <bpf/bpf.h>
18#include <bpf/libbpf.h>
19#include "perf-sys.h"
20
21#define SAMPLE_PERIOD 0x7fffffffffffffffULL
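/* longest possible period: the events are used as free-running counters
 * here, so they should never actually overflow and sample
 */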
22
23/* counters, values, values2 */
24static int map_fd[3];
25
26static void check_on_cpu(int cpu, struct perf_event_attr *attr)
27{
28 struct bpf_perf_event_value value2;
29 int pmu_fd, error = 0;
30 cpu_set_t set;
31 __u64 value;
32
33 /* Move to target CPU */
34 CPU_ZERO(&set);
35 CPU_SET(cpu, &set);
36 assert(sched_setaffinity(0, sizeof(set), &set) == 0);
37 /* Open perf event and attach to the perf_event_array */
38 pmu_fd = sys_perf_event_open(attr, -1/*pid*/, cpu/*cpu*/, -1/*group_fd*/, 0);
39 if (pmu_fd < 0) {
40 fprintf(stderr, "sys_perf_event_open failed on CPU %d\n", cpu);
41 error = 1;
42 goto on_exit;
43 }
44 assert(bpf_map_update_elem(map_fd[0], &cpu, &pmu_fd, BPF_ANY) == 0);
45 assert(ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0) == 0);
46 /* Trigger the kprobe */
47 bpf_map_get_next_key(map_fd[1], &cpu, NULL);
48 /* Check the value */
49 if (bpf_map_lookup_elem(map_fd[1], &cpu, &value)) {
50 fprintf(stderr, "Value missing for CPU %d\n", cpu);
51 error = 1;
52 goto on_exit;
53 } else {
54 fprintf(stderr, "CPU %d: %llu\n", cpu, value);
55 }
56 /* The above bpf_map_lookup_elem should trigger the second kprobe */
57 if (bpf_map_lookup_elem(map_fd[2], &cpu, &value2)) {
58 fprintf(stderr, "Value2 missing for CPU %d\n", cpu);
59 error = 1;
60 goto on_exit;
61 } else {
62 fprintf(stderr, "CPU %d: counter: %llu, enabled: %llu, running: %llu\n", cpu,
63 value2.counter, value2.enabled, value2.running);
64 }
65
66on_exit:
67 assert(bpf_map_delete_elem(map_fd[0], &cpu) == 0 || error);
68 assert(ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE, 0) == 0 || error);
69 assert(close(pmu_fd) == 0 || error);
70 assert(bpf_map_delete_elem(map_fd[1], &cpu) == 0 || error);
71 exit(error);
72}
73
74static void test_perf_event_array(struct perf_event_attr *attr,
75 const char *name)
76{
77 int i, status, nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
78 pid_t pid[nr_cpus];
79 int err = 0;
80
81 printf("Test reading %s counters\n", name);
82
83 for (i = 0; i < nr_cpus; i++) {
84 pid[i] = fork();
85 assert(pid[i] >= 0);
86 if (pid[i] == 0) {
87 check_on_cpu(i, attr);
88 exit(1);
89 }
90 }
91
92 for (i = 0; i < nr_cpus; i++) {
93 assert(waitpid(pid[i], &status, 0) == pid[i]);
94 err |= status;
95 }
96
97 if (err)
98 printf("Test: %s FAILED\n", name);
99}
100
101static void test_bpf_perf_event(void)
102{
103 struct perf_event_attr attr_cycles = {
104 .freq = 0,
105 .sample_period = SAMPLE_PERIOD,
106 .inherit = 0,
107 .type = PERF_TYPE_HARDWARE,
108 .read_format = 0,
109 .sample_type = 0,
110 .config = PERF_COUNT_HW_CPU_CYCLES,
111 };
112 struct perf_event_attr attr_clock = {
113 .freq = 0,
114 .sample_period = SAMPLE_PERIOD,
115 .inherit = 0,
116 .type = PERF_TYPE_SOFTWARE,
117 .read_format = 0,
118 .sample_type = 0,
119 .config = PERF_COUNT_SW_CPU_CLOCK,
120 };
121 struct perf_event_attr attr_raw = {
122 .freq = 0,
123 .sample_period = SAMPLE_PERIOD,
124 .inherit = 0,
125 .type = PERF_TYPE_RAW,
126 .read_format = 0,
127 .sample_type = 0,
128 /* Intel Instruction Retired */
129 .config = 0xc0,
130 };
131 struct perf_event_attr attr_l1d_load = {
132 .freq = 0,
133 .sample_period = SAMPLE_PERIOD,
134 .inherit = 0,
135 .type = PERF_TYPE_HW_CACHE,
136 .read_format = 0,
137 .sample_type = 0,
138 .config =
139 PERF_COUNT_HW_CACHE_L1D |
140 (PERF_COUNT_HW_CACHE_OP_READ << 8) |
141 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
142 };
143 struct perf_event_attr attr_llc_miss = {
144 .freq = 0,
145 .sample_period = SAMPLE_PERIOD,
146 .inherit = 0,
147 .type = PERF_TYPE_HW_CACHE,
148 .read_format = 0,
149 .sample_type = 0,
150 .config =
151 PERF_COUNT_HW_CACHE_LL |
152 (PERF_COUNT_HW_CACHE_OP_READ << 8) |
153 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
154 };
155 struct perf_event_attr attr_msr_tsc = {
156 .freq = 0,
157 .sample_period = 0,
158 .inherit = 0,
159 /* From /sys/bus/event_source/devices/msr/ */
160 .type = 7,
161 .read_format = 0,
162 .sample_type = 0,
163 .config = 0,
164 };
165
166 test_perf_event_array(&attr_cycles, "HARDWARE-cycles");
167 test_perf_event_array(&attr_clock, "SOFTWARE-clock");
168 test_perf_event_array(&attr_raw, "RAW-instruction-retired");
169 test_perf_event_array(&attr_l1d_load, "HW_CACHE-L1D-load");
170
171 /* below tests may fail in qemu */
172 test_perf_event_array(&attr_llc_miss, "HW_CACHE-LLC-miss");
173 test_perf_event_array(&attr_msr_tsc, "Dynamic-msr-tsc");
174}
175
176int main(int argc, char **argv)
177{
178 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
179 struct bpf_link *links[2];
180 struct bpf_program *prog;
181 struct bpf_object *obj;
182 char filename[256];
183 int i = 0;
184
185 setrlimit(RLIMIT_MEMLOCK, &r);
186
187 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
188 obj = bpf_object__open_file(filename, NULL);
189 if (libbpf_get_error(obj)) {
190 fprintf(stderr, "ERROR: opening BPF object file failed\n");
191 return 0;
192 }
193
194 /* load BPF program */
195 if (bpf_object__load(obj)) {
196 fprintf(stderr, "ERROR: loading BPF object file failed\n");
197 goto cleanup;
198 }
199
200 map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counters");
201 map_fd[1] = bpf_object__find_map_fd_by_name(obj, "values");
202 map_fd[2] = bpf_object__find_map_fd_by_name(obj, "values2");
203 if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) {
204 fprintf(stderr, "ERROR: finding a map in obj file failed\n");
205 goto cleanup;
206 }
207
208 bpf_object__for_each_program(prog, obj) {
209 links[i] = bpf_program__attach(prog);
210 if (libbpf_get_error(links[i])) {
211 fprintf(stderr, "ERROR: bpf_program__attach failed\n");
212 links[i] = NULL;
213 goto cleanup;
214 }
215 i++;
216 }
217
218 test_bpf_perf_event();
219
220cleanup:
221 for (i--; i >= 0; i--)
222 bpf_link__destroy(links[i]);
223
224 bpf_object__close(obj);
225 return 0;
226}
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index 000000000..c5a92df8a
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,16 @@
1#include <uapi/linux/ptrace.h>
2#include <uapi/linux/bpf.h>
3#include <linux/version.h>
4#include <bpf/bpf_helpers.h>
5
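/* error-injection example: force btrfs open_ctree() to return -12
 * (-ENOMEM) via bpf_override_return(), making the mount in the user
 * program fail; requires a kernel built with CONFIG_BPF_KPROBE_OVERRIDE
 */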
6SEC("kprobe/open_ctree")
7int bpf_prog1(struct pt_regs *ctx)
8{
9 unsigned long rc = -12;
10
11 bpf_override_return(ctx, rc);
12 return 0;
13}
14
15char _license[] SEC("license") = "GPL";
16u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index 000000000..8be7ce18d
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,56 @@
1#define _GNU_SOURCE
2
3#include <stdio.h>
4#include <unistd.h>
5#include <bpf/libbpf.h>
6
7int main(int argc, char **argv)
8{
9 struct bpf_link *link = NULL;
10 struct bpf_program *prog;
11 struct bpf_object *obj;
12 char filename[256];
13 char command[256];
14 int ret = 0;
15 FILE *f;
16
17 if (!argv[1]) {
18 fprintf(stderr, "ERROR: Run with the btrfs device argument!\n");
19 return 0;
20 }
21
22 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
23 obj = bpf_object__open_file(filename, NULL);
24 if (libbpf_get_error(obj)) {
25 fprintf(stderr, "ERROR: opening BPF object file failed\n");
26 return 0;
27 }
28
29 prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
30 if (!prog) {
31 fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
32 goto cleanup;
33 }
34
35 /* load BPF program */
36 if (bpf_object__load(obj)) {
37 fprintf(stderr, "ERROR: loading BPF object file failed\n");
38 goto cleanup;
39 }
40
41 link = bpf_program__attach(prog);
42 if (libbpf_get_error(link)) {
43 fprintf(stderr, "ERROR: bpf_program__attach failed\n");
44 link = NULL;
45 goto cleanup;
46 }
47
48 snprintf(command, sizeof(command), "mount %s tmpmnt/", argv[1]);
49 f = popen(command, "r");
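 /* the mount is expected to fail since open_ctree() is overridden,
  * so a nonzero pclose() status is the passing outcome here
  */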
50 ret = pclose(f);
51
52cleanup:
53 bpf_link__destroy(link);
54 bpf_object__close(obj);
55 return ret ? 0 : 1;
56}
diff --git a/samples/bpf/xdp1_kern.c b/samples/bpf/xdp1_kern.c
new file mode 100644
index 000000000..34b64394e
--- /dev/null
+++ b/samples/bpf/xdp1_kern.c
@@ -0,0 +1,93 @@
1/* Copyright (c) 2016 PLUMgrid
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#define KBUILD_MODNAME "foo"
8#include <uapi/linux/bpf.h>
9#include <linux/in.h>
10#include <linux/if_ether.h>
11#include <linux/if_packet.h>
12#include <linux/if_vlan.h>
13#include <linux/ip.h>
14#include <linux/ipv6.h>
15#include <bpf/bpf_helpers.h>
16
17struct {
18 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
19 __type(key, u32);
20 __type(value, long);
21 __uint(max_entries, 256);
22} rxcnt SEC(".maps");
23
24static int parse_ipv4(void *data, u64 nh_off, void *data_end)
25{
26 struct iphdr *iph = data + nh_off;
27
28 if (iph + 1 > data_end)
29 return 0;
30 return iph->protocol;
31}
32
33static int parse_ipv6(void *data, u64 nh_off, void *data_end)
34{
35 struct ipv6hdr *ip6h = data + nh_off;
36
37 if (ip6h + 1 > data_end)
38 return 0;
39 return ip6h->nexthdr;
40}
41
42SEC("xdp1")
43int xdp_prog1(struct xdp_md *ctx)
44{
45 void *data_end = (void *)(long)ctx->data_end;
46 void *data = (void *)(long)ctx->data;
47 struct ethhdr *eth = data;
48 int rc = XDP_DROP;
49 long *value;
50 u16 h_proto;
51 u64 nh_off;
52 u32 ipproto;
53
54 nh_off = sizeof(*eth);
55 if (data + nh_off > data_end)
56 return rc;
57
58 h_proto = eth->h_proto;
59
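 /* strip up to two VLAN tags (802.1Q or QinQ/802.1ad) before reading
  * the encapsulated L3 protocol
  */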
60 if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
61 struct vlan_hdr *vhdr;
62
63 vhdr = data + nh_off;
64 nh_off += sizeof(struct vlan_hdr);
65 if (data + nh_off > data_end)
66 return rc;
67 h_proto = vhdr->h_vlan_encapsulated_proto;
68 }
69 if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
70 struct vlan_hdr *vhdr;
71
72 vhdr = data + nh_off;
73 nh_off += sizeof(struct vlan_hdr);
74 if (data + nh_off > data_end)
75 return rc;
76 h_proto = vhdr->h_vlan_encapsulated_proto;
77 }
78
79 if (h_proto == htons(ETH_P_IP))
80 ipproto = parse_ipv4(data, nh_off, data_end);
81 else if (h_proto == htons(ETH_P_IPV6))
82 ipproto = parse_ipv6(data, nh_off, data_end);
83 else
84 ipproto = 0;
85
86 value = bpf_map_lookup_elem(&rxcnt, &ipproto);
87 if (value)
88 *value += 1;
89
90 return rc;
91}
92
93char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c
new file mode 100644
index 000000000..c447ad9e3
--- /dev/null
+++ b/samples/bpf/xdp1_user.c
@@ -0,0 +1,167 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2016 PLUMgrid
3 */
4#include <linux/bpf.h>
5#include <linux/if_link.h>
6#include <assert.h>
7#include <errno.h>
8#include <signal.h>
9#include <stdio.h>
10#include <stdlib.h>
11#include <string.h>
12#include <unistd.h>
13#include <libgen.h>
14#include <sys/resource.h>
15#include <net/if.h>
16
17#include "bpf_util.h"
18#include <bpf/bpf.h>
19#include <bpf/libbpf.h>
20
21static int ifindex;
22static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
23static __u32 prog_id;
24
25static void int_exit(int sig)
26{
27 __u32 curr_prog_id = 0;
28
29 if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) {
30 printf("bpf_get_link_xdp_id failed\n");
31 exit(1);
32 }
33 if (prog_id == curr_prog_id)
34 bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
35 else if (!curr_prog_id)
36 printf("couldn't find a prog id on a given interface\n");
37 else
38 printf("program on interface changed, not removing\n");
39 exit(0);
40}
41
42/* simple per-protocol drop counter
43 */
44static void poll_stats(int map_fd, int interval)
45{
46 unsigned int nr_cpus = bpf_num_possible_cpus();
47 __u64 values[nr_cpus], prev[256] = { 0 };
48 int i;
49
50 while (1) {
51 __u32 key = UINT32_MAX;
52
53 sleep(interval);
54
55 while (bpf_map_get_next_key(map_fd, &key, &key) != -1) {
56 __u64 sum = 0;
57
58 assert(bpf_map_lookup_elem(map_fd, &key, values) == 0);
59 for (i = 0; i < nr_cpus; i++)
60 sum += values[i];
61 if (sum > prev[key])
62 printf("proto %u: %10llu pkt/s\n",
63 key, (sum - prev[key]) / interval);
64 prev[key] = sum;
65 }
66 }
67}
68
69static void usage(const char *prog)
70{
71 fprintf(stderr,
72 "usage: %s [OPTS] IFACE\n\n"
73 "OPTS:\n"
74 " -S use skb-mode\n"
75 " -N enforce native mode\n"
76 " -F force loading prog\n",
77 prog);
78}
79
80int main(int argc, char **argv)
81{
82 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
83 struct bpf_prog_load_attr prog_load_attr = {
84 .prog_type = BPF_PROG_TYPE_XDP,
85 };
86 struct bpf_prog_info info = {};
87 __u32 info_len = sizeof(info);
88 const char *optstr = "FSN";
89 int prog_fd, map_fd, opt;
90 struct bpf_object *obj;
91 struct bpf_map *map;
92 char filename[256];
93 int err;
94
95 while ((opt = getopt(argc, argv, optstr)) != -1) {
96 switch (opt) {
97 case 'S':
98 xdp_flags |= XDP_FLAGS_SKB_MODE;
99 break;
100 case 'N':
101 /* default, set below */
102 break;
103 case 'F':
104 xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
105 break;
106 default:
107 usage(basename(argv[0]));
108 return 1;
109 }
110 }
111
112 if (!(xdp_flags & XDP_FLAGS_SKB_MODE))
113 xdp_flags |= XDP_FLAGS_DRV_MODE;
114
115 if (optind == argc) {
116 usage(basename(argv[0]));
117 return 1;
118 }
119
120 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
121 perror("setrlimit(RLIMIT_MEMLOCK)");
122 return 1;
123 }
124
125 ifindex = if_nametoindex(argv[optind]);
126 if (!ifindex) {
127 perror("if_nametoindex");
128 return 1;
129 }
130
131 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
132 prog_load_attr.file = filename;
133
134 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
135 return 1;
136
137 map = bpf_map__next(NULL, obj);
138 if (!map) {
139 printf("finding a map in obj file failed\n");
140 return 1;
141 }
142 map_fd = bpf_map__fd(map);
143
144 if (!prog_fd) {
145 printf("bpf_prog_load_xattr: %s\n", strerror(errno));
146 return 1;
147 }
148
149 signal(SIGINT, int_exit);
150 signal(SIGTERM, int_exit);
151
152 if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
153 printf("link set xdp fd failed\n");
154 return 1;
155 }
156
157 err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
158 if (err) {
159 printf("can't get prog info - %s\n", strerror(errno));
160 return err;
161 }
162 prog_id = info.id;
163
164 poll_stats(map_fd, 2);
165
166 return 0;
167}
diff --git a/samples/bpf/xdp2_kern.c b/samples/bpf/xdp2_kern.c
new file mode 100644
index 000000000..c787f4b49
--- /dev/null
+++ b/samples/bpf/xdp2_kern.c
@@ -0,0 +1,114 @@
1/* Copyright (c) 2016 PLUMgrid
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#define KBUILD_MODNAME "foo"
8#include <uapi/linux/bpf.h>
9#include <linux/in.h>
10#include <linux/if_ether.h>
11#include <linux/if_packet.h>
12#include <linux/if_vlan.h>
13#include <linux/ip.h>
14#include <linux/ipv6.h>
15#include <bpf/bpf_helpers.h>
16
17struct {
18 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
19 __type(key, u32);
20 __type(value, long);
21 __uint(max_entries, 256);
22} rxcnt SEC(".maps");
23
24static void swap_src_dst_mac(void *data)
25{
26 unsigned short *p = data;
27 unsigned short dst[3];
28
29 dst[0] = p[0];
30 dst[1] = p[1];
31 dst[2] = p[2];
32 p[0] = p[3];
33 p[1] = p[4];
34 p[2] = p[5];
35 p[3] = dst[0];
36 p[4] = dst[1];
37 p[5] = dst[2];
38}
39
40static int parse_ipv4(void *data, u64 nh_off, void *data_end)
41{
42 struct iphdr *iph = data + nh_off;
43
44 if (iph + 1 > data_end)
45 return 0;
46 return iph->protocol;
47}
48
49static int parse_ipv6(void *data, u64 nh_off, void *data_end)
50{
51 struct ipv6hdr *ip6h = data + nh_off;
52
53 if (ip6h + 1 > data_end)
54 return 0;
55 return ip6h->nexthdr;
56}
57
58SEC("xdp1")
59int xdp_prog1(struct xdp_md *ctx)
60{
61 void *data_end = (void *)(long)ctx->data_end;
62 void *data = (void *)(long)ctx->data;
63 struct ethhdr *eth = data;
64 int rc = XDP_DROP;
65 long *value;
66 u16 h_proto;
67 u64 nh_off;
68 u32 ipproto;
69
70 nh_off = sizeof(*eth);
71 if (data + nh_off > data_end)
72 return rc;
73
74 h_proto = eth->h_proto;
75
76 if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
77 struct vlan_hdr *vhdr;
78
79 vhdr = data + nh_off;
80 nh_off += sizeof(struct vlan_hdr);
81 if (data + nh_off > data_end)
82 return rc;
83 h_proto = vhdr->h_vlan_encapsulated_proto;
84 }
85 if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
86 struct vlan_hdr *vhdr;
87
88 vhdr = data + nh_off;
89 nh_off += sizeof(struct vlan_hdr);
90 if (data + nh_off > data_end)
91 return rc;
92 h_proto = vhdr->h_vlan_encapsulated_proto;
93 }
94
95 if (h_proto == htons(ETH_P_IP))
96 ipproto = parse_ipv4(data, nh_off, data_end);
97 else if (h_proto == htons(ETH_P_IPV6))
98 ipproto = parse_ipv6(data, nh_off, data_end);
99 else
100 ipproto = 0;
101
102 value = bpf_map_lookup_elem(&rxcnt, &ipproto);
103 if (value)
104 *value += 1;
105
106 if (ipproto == IPPROTO_UDP) {
107 swap_src_dst_mac(data);
108 rc = XDP_TX;
109 }
110
111 return rc;
112}
113
114char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp2skb_meta.sh b/samples/bpf/xdp2skb_meta.sh
new file mode 100755
index 000000000..4bde9d066
--- /dev/null
+++ b/samples/bpf/xdp2skb_meta.sh
@@ -0,0 +1,220 @@
1#!/bin/bash
2#
3# SPDX-License-Identifier: GPL-2.0
4# Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc.
5#
6# Bash-shell example of using iproute2 tools 'tc' and 'ip' to load
7# eBPF programs, both for XDP and clsbpf. Shell script function
8# wrappers and even long options parsing is illustrated, for ease of
9# use.
10#
11# Related to samples/bpf/xdp2skb_meta_kern.c, which contains BPF-progs
12# that need to collaborate between XDP and TC hooks. Thus, it is
13# convenient that the same tool loads both programs that need to work
14# together.
15#
16BPF_FILE=xdp2skb_meta_kern.o
17DIR=$(dirname $0)
18
19[ -z "$TC" ] && TC=tc
20[ -z "$IP" ] && IP=ip
21
22function usage() {
23 echo ""
24 echo "Usage: $0 [-vfh] --dev ethX"
25 echo " -d | --dev : Network device (required)"
26 echo " --flush : Cleanup: flush TC and XDP progs"
27 echo " --list : (\$LIST) List TC and XDP progs"
28 echo " -v | --verbose : (\$VERBOSE) Verbose"
29 echo " --dry-run : (\$DRYRUN) Dry-run only (echo commands)"
30 echo ""
31}
32
33## -- General shell logging cmds --
34function err() {
35 local exitcode=$1
36 shift
37 echo "ERROR: $@" >&2
38 exit $exitcode
39}
40
41function info() {
42 if [[ -n "$VERBOSE" ]]; then
43 echo "# $@"
44 fi
45}
46
47## -- Helper function calls --
48
49# Wrapper call for TC and IP
50# - Will display the offending command on failure
51function _call_cmd() {
52 local cmd="$1"
53 local allow_fail="$2"
54 shift 2
55 if [[ -n "$VERBOSE" ]]; then
56 echo "$cmd $@"
57 fi
58 if [[ -n "$DRYRUN" ]]; then
59 return
60 fi
61 $cmd "$@"
62 local status=$?
63 if (( $status != 0 )); then
64 if [[ "$allow_fail" == "" ]]; then
65 err 2 "Exec error($status) occurred cmd: \"$cmd $@\""
66 fi
67 fi
68}
69function call_tc() {
70 _call_cmd "$TC" "" "$@"
71}
72function call_tc_allow_fail() {
73 _call_cmd "$TC" "allow_fail" "$@"
74}
75function call_ip() {
76 _call_cmd "$IP" "" "$@"
77}
78
79## --- Parse command line arguments / parameters ---
80# Using external program "getopt" to get --long-options
81OPTIONS=$(getopt -o vfhd: \
82 --long verbose,flush,help,list,dev:,dry-run -- "$@")
83if (( $? != 0 )); then
84 err 4 "Error calling getopt"
85fi
86eval set -- "$OPTIONS"
87
88unset DEV
89unset FLUSH
90while true; do
91 case "$1" in
92 -d | --dev ) # device
93 DEV=$2
94 info "Device set to: DEV=$DEV" >&2
95 shift 2
96 ;;
97 -v | --verbose)
98 VERBOSE=yes
99 # info "Verbose mode: VERBOSE=$VERBOSE" >&2
100 shift
101 ;;
102 --dry-run )
103 DRYRUN=yes
104 VERBOSE=yes
105 info "Dry-run mode: enable VERBOSE and don't call TC+IP" >&2
106 shift
107 ;;
108 -f | --flush )
109 FLUSH=yes
110 shift
111 ;;
112 --list )
113 LIST=yes
114 shift
115 ;;
116 -- )
117 shift
118 break
119 ;;
120 -h | --help )
121 usage;
122 exit 0
123 ;;
124 * )
125 shift
126 break
127 ;;
128 esac
129done
130
131FILE="$DIR/$BPF_FILE"
132if [[ ! -e $FILE ]]; then
133 err 3 "Missing BPF object file ($FILE)"
134fi
135
136if [[ -z $DEV ]]; then
137 usage
138 err 2 "Please specify network device -- required option --dev"
139fi
140
141## -- Function calls --
142
143function list_tc()
144{
145 local device="$1"
146 shift
147 info "Listing current TC ingress rules"
148 call_tc filter show dev $device ingress
149}
150
151function list_xdp()
152{
153 local device="$1"
154 shift
155 info "Listing current XDP device($device) setting"
156 call_ip link show dev $device | grep --color=auto xdp
157}
158
159function flush_tc()
160{
161 local device="$1"
162 shift
163 info "Flush TC on device: $device"
164 call_tc_allow_fail filter del dev $device ingress
165 call_tc_allow_fail qdisc del dev $device clsact
166}
167
168function flush_xdp()
169{
170 local device="$1"
171 shift
172 info "Flush XDP on device: $device"
173 call_ip link set dev $device xdp off
174}
175
176function attach_tc_mark()
177{
178 local device="$1"
179 local file="$2"
180 local prog="tc_mark"
181 shift 2
182
183 # Re-attach clsact to clear/flush existing rules
184 call_tc_allow_fail qdisc del dev $device clsact 2> /dev/null
185 call_tc qdisc add dev $device clsact
186
187 # Attach BPF prog
188 call_tc filter add dev $device ingress \
189 prio 1 handle 1 bpf da obj $file sec $prog
190}
191
192function attach_xdp_mark()
193{
194 local device="$1"
195 local file="$2"
196 local prog="xdp_mark"
197 shift 2
198
199 # Remove XDP prog in case it's already loaded
200 # TODO: Need ip-link option to override/replace existing XDP prog
201 flush_xdp $device
202
203 # Attach XDP/BPF prog
204 call_ip link set dev $device xdp obj $file sec $prog
205}
206
207if [[ -n $FLUSH ]]; then
208 flush_tc $DEV
209 flush_xdp $DEV
210 exit 0
211fi
212
213if [[ -n $LIST ]]; then
214 list_tc $DEV
215 list_xdp $DEV
216 exit 0
217fi
218
219attach_tc_mark $DEV $FILE
220attach_xdp_mark $DEV $FILE
diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c
new file mode 100644
index 000000000..9b783316e
--- /dev/null
+++ b/samples/bpf/xdp2skb_meta_kern.c
@@ -0,0 +1,105 @@
1/* SPDX-License-Identifier: GPL-2.0
2 * Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc.
3 *
4 * Example how to transfer info from XDP to SKB, e.g. skb->mark
5 * -----------------------------------------------------------
6 * This uses the XDP data_meta infrastructure, and is a cooperation
7 * between two bpf-programs (1) XDP and (2) clsact at TC-ingress hook.
8 *
9 * Notice: This example does not use the BPF C-loader (bpf_load.c),
10 * but instead relies on the iproute2 TC tool for loading BPF-objects.
11 */
12#include <uapi/linux/bpf.h>
13#include <uapi/linux/pkt_cls.h>
14
15#include <bpf/bpf_helpers.h>
16
17/*
18 * This struct is stored in the XDP 'data_meta' area, which is located
19 * just in front of the raw packet payload data. The meaning is
20 * specific to these two BPF programs that use it as a communication
21 * channel. XDP adjusts/increases the area via a bpf-helper, and TC
22 * uses boundary checks to see if data has been provided.
23 *
24 * The struct must be 4 byte aligned, which here is enforced by the
25 * struct __attribute__((aligned(4))).
26 */
27struct meta_info {
28 __u32 mark;
29} __attribute__((aligned(4)));
30
31SEC("xdp_mark")
32int _xdp_mark(struct xdp_md *ctx)
33{
34 struct meta_info *meta;
35 void *data, *data_end;
36 int ret;
37
38 /* Reserve space in front of the data pointer for our meta info.
39 * (Notice drivers not supporting data_meta will fail here!)
40 */
41 ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta));
42 if (ret < 0)
43 return XDP_ABORTED;
44
45 /* Notice: Kernel-side verifier requires that loading of
46 * ctx->data MUST happen _after_ helper bpf_xdp_adjust_meta(),
47 * as pkt-data pointers are invalidated. Helpers that require
48 * this are determined/marked by bpf_helper_changes_pkt_data()
49 */
50 data = (void *)(unsigned long)ctx->data;
51
52 /* Check data_meta has room for meta_info struct */
53 meta = (void *)(unsigned long)ctx->data_meta;
54 if (meta + 1 > data)
55 return XDP_ABORTED;
56
57 meta->mark = 42;
58
59 return XDP_PASS;
60}
61
62SEC("tc_mark")
63int _tc_mark(struct __sk_buff *ctx)
64{
65 void *data = (void *)(unsigned long)ctx->data;
66 void *data_end = (void *)(unsigned long)ctx->data_end;
67 void *data_meta = (void *)(unsigned long)ctx->data_meta;
68 struct meta_info *meta = data_meta;
69
70 /* Check XDP gave us some data_meta */
71 if (meta + 1 > data) {
72 ctx->mark = 41;
73 /* Skip "accept" if no data_meta is avail */
74 return TC_ACT_OK;
75 }
76
77 /* Hint: See func tc_cls_act_is_valid_access() for BPF_WRITE access */
78 ctx->mark = meta->mark; /* Transfer XDP-mark to SKB-mark */
79
80 return TC_ACT_OK;
81}
82
83/* Manually attaching these programs:
84export DEV=ixgbe2
85export FILE=xdp2skb_meta_kern.o
86
87# via TC command
88tc qdisc del dev $DEV clsact 2> /dev/null
89tc qdisc add dev $DEV clsact
90tc filter add dev $DEV ingress prio 1 handle 1 bpf da obj $FILE sec tc_mark
91tc filter show dev $DEV ingress
92
93# XDP via IP command:
94ip link set dev $DEV xdp off
95ip link set dev $DEV xdp obj $FILE sec xdp_mark
96
97# Use iptables to "see" if SKBs are marked
98iptables -I INPUT -p icmp -m mark --mark 41 # == 0x29
99iptables -I INPUT -p icmp -m mark --mark 42 # == 0x2a
100
101# Hint: catch XDP_ABORTED errors via
102perf record -e xdp:*
103perf script
104
105*/
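
The data_meta handshake above generalizes to more than one field. A minimal sketch under the same constraints (the extra fields are hypothetical; both programs must agree on the layout, and the struct must stay 4-byte aligned):

/* Hypothetical extension of struct meta_info: the XDP and TC programs
 * must share the same layout, and the struct must stay 4-byte aligned
 * for bpf_xdp_adjust_meta() to accept it.
 */
struct meta_info_ext {
	__u32 mark;
	__u16 rx_queue;	/* hypothetical extra field */
	__u16 flags;	/* explicit padding keeps 4-byte alignment */
} __attribute__((aligned(4)));

/* The XDP side then reserves the larger area the same way:
 *	ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(struct meta_info_ext));
 * and the boundary check is unchanged: meta + 1 > data means abort.
 */
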
diff --git a/samples/bpf/xdp_adjust_tail_kern.c b/samples/bpf/xdp_adjust_tail_kern.c
new file mode 100644
index 000000000..ffdd54862
--- /dev/null
+++ b/samples/bpf/xdp_adjust_tail_kern.c
@@ -0,0 +1,155 @@
1/* SPDX-License-Identifier: GPL-2.0
2 * Copyright (c) 2018 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program shows how to use bpf_xdp_adjust_tail() by
9 * generating ICMPv4 "packet too big" messages (strictly, "destination
10 * unreachable / fragmentation needed, DF bit set" in the IPv4 case)
11 * when receiving packets bigger than 600 bytes.
12 */
13#define KBUILD_MODNAME "foo"
14#include <uapi/linux/bpf.h>
15#include <linux/in.h>
16#include <linux/if_ether.h>
17#include <linux/if_packet.h>
18#include <linux/if_vlan.h>
19#include <linux/ip.h>
20#include <linux/icmp.h>
21#include <bpf/bpf_helpers.h>
22
23#define DEFAULT_TTL 64
24#define MAX_PCKT_SIZE 600
25#define ICMP_TOOBIG_SIZE 98
26#define ICMP_TOOBIG_PAYLOAD_SIZE 92
27
28/* volatile to prevent compiler optimizations */
29static volatile __u32 max_pcktsz = MAX_PCKT_SIZE;
30
31struct {
32 __uint(type, BPF_MAP_TYPE_ARRAY);
33 __type(key, __u32);
34 __type(value, __u64);
35 __uint(max_entries, 1);
36} icmpcnt SEC(".maps");
37
38static __always_inline void count_icmp(void)
39{
40 u64 key = 0;
41 u64 *icmp_count;
42
43 icmp_count = bpf_map_lookup_elem(&icmpcnt, &key);
44 if (icmp_count)
45 *icmp_count += 1;
46}
47
48static __always_inline void swap_mac(void *data, struct ethhdr *orig_eth)
49{
50 struct ethhdr *eth;
51
52 eth = data;
53 memcpy(eth->h_source, orig_eth->h_dest, ETH_ALEN);
54 memcpy(eth->h_dest, orig_eth->h_source, ETH_ALEN);
55 eth->h_proto = orig_eth->h_proto;
56}
57
58static __always_inline __u16 csum_fold_helper(__u32 csum)
59{
60 return ~((csum & 0xffff) + (csum >> 16));
61}
62
63static __always_inline void ipv4_csum(void *data_start, int data_size,
64 __u32 *csum)
65{
66 *csum = bpf_csum_diff(0, 0, data_start, data_size, *csum);
67 *csum = csum_fold_helper(*csum);
68}
69
70static __always_inline int send_icmp4_too_big(struct xdp_md *xdp)
71{
72 int headroom = (int)sizeof(struct iphdr) + (int)sizeof(struct icmphdr);
73
74 if (bpf_xdp_adjust_head(xdp, 0 - headroom))
75 return XDP_DROP;
76 void *data = (void *)(long)xdp->data;
77 void *data_end = (void *)(long)xdp->data_end;
78
79 if (data + (ICMP_TOOBIG_SIZE + headroom) > data_end)
80 return XDP_DROP;
81
82 struct iphdr *iph, *orig_iph;
83 struct icmphdr *icmp_hdr;
84 struct ethhdr *orig_eth;
85 __u32 csum = 0;
86 __u64 off = 0;
87
88 orig_eth = data + headroom;
89 swap_mac(data, orig_eth);
90 off += sizeof(struct ethhdr);
91 iph = data + off;
92 off += sizeof(struct iphdr);
93 icmp_hdr = data + off;
94 off += sizeof(struct icmphdr);
95 orig_iph = data + off;
96 icmp_hdr->type = ICMP_DEST_UNREACH;
97 icmp_hdr->code = ICMP_FRAG_NEEDED;
98 icmp_hdr->un.frag.mtu = htons(max_pcktsz - sizeof(struct ethhdr));
99 icmp_hdr->checksum = 0;
100 ipv4_csum(icmp_hdr, ICMP_TOOBIG_PAYLOAD_SIZE, &csum);
101 icmp_hdr->checksum = csum;
102 iph->ttl = DEFAULT_TTL;
103 iph->daddr = orig_iph->saddr;
104 iph->saddr = orig_iph->daddr;
105 iph->version = 4;
106 iph->ihl = 5;
107 iph->protocol = IPPROTO_ICMP;
108 iph->tos = 0;
109 iph->tot_len = htons(
110 ICMP_TOOBIG_SIZE + headroom - sizeof(struct ethhdr));
111 iph->check = 0;
112 csum = 0;
113 ipv4_csum(iph, sizeof(struct iphdr), &csum);
114 iph->check = csum;
115 count_icmp();
116 return XDP_TX;
117}
118
119
120static __always_inline int handle_ipv4(struct xdp_md *xdp)
121{
122 void *data_end = (void *)(long)xdp->data_end;
123 void *data = (void *)(long)xdp->data;
124 int pckt_size = data_end - data;
125 int offset;
126
127 if (pckt_size > max(max_pcktsz, ICMP_TOOBIG_SIZE)) {
128 offset = pckt_size - ICMP_TOOBIG_SIZE;
129 if (bpf_xdp_adjust_tail(xdp, 0 - offset))
130 return XDP_PASS;
131 return send_icmp4_too_big(xdp);
132 }
133 return XDP_PASS;
134}
135
136SEC("xdp_icmp")
137int _xdp_icmp(struct xdp_md *xdp)
138{
139 void *data_end = (void *)(long)xdp->data_end;
140 void *data = (void *)(long)xdp->data;
141 struct ethhdr *eth = data;
142 __u16 h_proto;
143
144 if (eth + 1 > data_end)
145 return XDP_DROP;
146
147 h_proto = eth->h_proto;
148
149 if (h_proto == htons(ETH_P_IP))
150 return handle_ipv4(xdp);
151 else
152 return XDP_PASS;
153}
154
155char _license[] SEC("license") = "GPL";
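
csum_fold_helper() above folds the 32-bit accumulator returned by bpf_csum_diff() into the final 16-bit one's-complement checksum. A standalone userspace sketch of the same arithmetic, with a second fold that makes the end-around-carry handling explicit (csum_fold is an illustrative name, not part of the sample):

#include <stdint.h>

/* One's-complement fold of a 32-bit checksum accumulator into 16 bits.
 * Folding twice guarantees any carry produced by the first fold is
 * absorbed before the final complement.
 */
static uint16_t csum_fold(uint32_t csum)
{
	csum = (csum & 0xffff) + (csum >> 16);	/* first fold */
	csum = (csum & 0xffff) + (csum >> 16);	/* absorb possible carry */
	return (uint16_t)~csum;
}
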
diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c
new file mode 100644
index 000000000..ba482dc3d
--- /dev/null
+++ b/samples/bpf/xdp_adjust_tail_user.c
@@ -0,0 +1,198 @@
1/* SPDX-License-Identifier: GPL-2.0
2 * Copyright (c) 2018 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 */
8#include <linux/bpf.h>
9#include <linux/if_link.h>
10#include <assert.h>
11#include <errno.h>
12#include <signal.h>
13#include <stdio.h>
14#include <stdlib.h>
15#include <string.h>
16#include <net/if.h>
17#include <sys/resource.h>
18#include <arpa/inet.h>
19#include <netinet/ether.h>
20#include <unistd.h>
21#include <time.h>
22#include <bpf/bpf.h>
23#include <bpf/libbpf.h>
24
25#define STATS_INTERVAL_S 2U
26#define MAX_PCKT_SIZE 600
27
28static int ifindex = -1;
29static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
30static __u32 prog_id;
31
32static void int_exit(int sig)
33{
34 __u32 curr_prog_id = 0;
35
36 if (ifindex > -1) {
37 if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) {
38 printf("bpf_get_link_xdp_id failed\n");
39 exit(1);
40 }
41 if (prog_id == curr_prog_id)
42 bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
43 else if (!curr_prog_id)
44 printf("couldn't find a prog id on a given iface\n");
45 else
46 printf("program on interface changed, not removing\n");
47 }
48 exit(0);
49}
50
51/* simple "icmp packet too big sent" counter
52 */
53static void poll_stats(unsigned int map_fd, unsigned int kill_after_s)
54{
55 time_t started_at = time(NULL);
56 __u64 value = 0;
57 int key = 0;
58
59
60 while (!kill_after_s || time(NULL) - started_at <= kill_after_s) {
61 sleep(STATS_INTERVAL_S);
62
63 assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
64
65 printf("icmp \"packet too big\" sent: %10llu pkts\n", value);
66 }
67}
68
69static void usage(const char *cmd)
70{
71 printf("Start a XDP prog which send ICMP \"packet too big\" \n"
72 "messages if ingress packet is bigger then MAX_SIZE bytes\n");
73 printf("Usage: %s [...]\n", cmd);
74 printf(" -i <ifname|ifindex> Interface\n");
75 printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n");
76 printf(" -P <MAX_PCKT_SIZE> Default: %u\n", MAX_PCKT_SIZE);
77 printf(" -S use skb-mode\n");
78 printf(" -N enforce native mode\n");
79 printf(" -F force loading prog\n");
80 printf(" -h Display this help\n");
81}
82
83int main(int argc, char **argv)
84{
85 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
86 struct bpf_prog_load_attr prog_load_attr = {
87 .prog_type = BPF_PROG_TYPE_XDP,
88 };
89 unsigned char opt_flags[256] = {};
90 const char *optstr = "i:T:P:SNFh";
91 struct bpf_prog_info info = {};
92 __u32 info_len = sizeof(info);
93 unsigned int kill_after_s = 0;
94 int i, prog_fd, map_fd, opt;
95 struct bpf_object *obj;
96 __u32 max_pckt_size = 0;
97 __u32 key = 0;
98 char filename[256];
99 int err;
100
101 for (i = 0; i < strlen(optstr); i++)
102 if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z')
103 opt_flags[(unsigned char)optstr[i]] = 1;
104
105 while ((opt = getopt(argc, argv, optstr)) != -1) {
106
107 switch (opt) {
108 case 'i':
109 ifindex = if_nametoindex(optarg);
110 if (!ifindex)
111 ifindex = atoi(optarg);
112 break;
113 case 'T':
114 kill_after_s = atoi(optarg);
115 break;
116 case 'P':
117 max_pckt_size = atoi(optarg);
118 break;
119 case 'S':
120 xdp_flags |= XDP_FLAGS_SKB_MODE;
121 break;
122 case 'N':
123 /* default, set below */
124 break;
125 case 'F':
126 xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
127 break;
128 default:
129 usage(argv[0]);
130 return 1;
131 }
132 opt_flags[opt] = 0;
133 }
134
135 if (!(xdp_flags & XDP_FLAGS_SKB_MODE))
136 xdp_flags |= XDP_FLAGS_DRV_MODE;
137
138 for (i = 0; i < strlen(optstr); i++) {
139 if (opt_flags[(unsigned int)optstr[i]]) {
140 fprintf(stderr, "Missing argument -%c\n", optstr[i]);
141 usage(argv[0]);
142 return 1;
143 }
144 }
145
146 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
147 perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
148 return 1;
149 }
150
151 if (!ifindex) {
152 fprintf(stderr, "Invalid ifname\n");
153 return 1;
154 }
155
156 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
157 prog_load_attr.file = filename;
158
159 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
160 return 1;
161
162 /* static global var 'max_pcktsz' is accessible via the .data section map */
163 if (max_pckt_size) {
164 map_fd = bpf_object__find_map_fd_by_name(obj, "xdp_adju.data");
165 if (map_fd < 0) {
166 printf("finding a max_pcktsz map in obj file failed\n");
167 return 1;
168 }
169 bpf_map_update_elem(map_fd, &key, &max_pckt_size, BPF_ANY);
170 }
171
172 /* fetch icmpcnt map */
173 map_fd = bpf_object__find_map_fd_by_name(obj, "icmpcnt");
174 if (map_fd < 0) {
175 printf("finding a icmpcnt map in obj file failed\n");
176 return 1;
177 }
178
179 signal(SIGINT, int_exit);
180 signal(SIGTERM, int_exit);
181
182 if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
183 printf("link set xdp fd failed\n");
184 return 1;
185 }
186
187 err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
188 if (err) {
189 printf("can't get prog info - %s\n", strerror(errno));
190 return 1;
191 }
192 prog_id = info.id;
193
194 poll_stats(map_fd, kill_after_s);
195 int_exit(0);
196
197 return 0;
198}
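
The "xdp_adju.data" lookup above works because libbpf (at this version) exposes static globals through an internal map named after the first characters of the object file name plus ".data"; "xdp_adjust_tail_kern.o" truncates to "xdp_adju.data". A minimal sketch of the same pattern, assuming a hypothetical object "example_kern.o" whose .data section holds a single __u32 tunable (so the map's value_size matches):

#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* Sketch: override a static global in a loaded BPF object's .data map.
 * Assumes the .data section contains only one __u32; "example_.data"
 * derives from the hypothetical object name "example_kern.o" under
 * libbpf's truncated naming scheme.
 */
static int set_tunable(struct bpf_object *obj, __u32 new_val)
{
	__u32 key = 0;	/* the .data map has one entry: the section blob */
	int map_fd = bpf_object__find_map_fd_by_name(obj, "example_.data");

	if (map_fd < 0)
		return -1;
	return bpf_map_update_elem(map_fd, &key, &new_val, BPF_ANY);
}
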
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
new file mode 100644
index 000000000..54c099cbd
--- /dev/null
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -0,0 +1,158 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#define KBUILD_MODNAME "foo"
14#include <uapi/linux/bpf.h>
15#include <linux/in.h>
16#include <linux/if_ether.h>
17#include <linux/if_packet.h>
18#include <linux/if_vlan.h>
19#include <linux/ip.h>
20#include <linux/ipv6.h>
21
22#include <bpf/bpf_helpers.h>
23
24#define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF)
25
26struct {
27 __uint(type, BPF_MAP_TYPE_DEVMAP);
28 __uint(key_size, sizeof(int));
29 __uint(value_size, sizeof(int));
30 __uint(max_entries, 64);
31} xdp_tx_ports SEC(".maps");
32
33/* from include/net/ip.h */
34static __always_inline int ip_decrease_ttl(struct iphdr *iph)
35{
36 u32 check = (__force u32)iph->check;
37
38 check += (__force u32)htons(0x0100);
39 iph->check = (__force __sum16)(check + (check >= 0xFFFF));
40 return --iph->ttl;
41}
42
43static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
44{
45 void *data_end = (void *)(long)ctx->data_end;
46 void *data = (void *)(long)ctx->data;
47 struct bpf_fib_lookup fib_params;
48 struct ethhdr *eth = data;
49 struct ipv6hdr *ip6h;
50 struct iphdr *iph;
51 u16 h_proto;
52 u64 nh_off;
53 int rc;
54
55 nh_off = sizeof(*eth);
56 if (data + nh_off > data_end)
57 return XDP_DROP;
58
59 __builtin_memset(&fib_params, 0, sizeof(fib_params));
60
61 h_proto = eth->h_proto;
62 if (h_proto == htons(ETH_P_IP)) {
63 iph = data + nh_off;
64
65 if (iph + 1 > data_end)
66 return XDP_DROP;
67
68 if (iph->ttl <= 1)
69 return XDP_PASS;
70
71 fib_params.family = AF_INET;
72 fib_params.tos = iph->tos;
73 fib_params.l4_protocol = iph->protocol;
74 fib_params.sport = 0;
75 fib_params.dport = 0;
76 fib_params.tot_len = ntohs(iph->tot_len);
77 fib_params.ipv4_src = iph->saddr;
78 fib_params.ipv4_dst = iph->daddr;
79 } else if (h_proto == htons(ETH_P_IPV6)) {
80 struct in6_addr *src = (struct in6_addr *) fib_params.ipv6_src;
81 struct in6_addr *dst = (struct in6_addr *) fib_params.ipv6_dst;
82
83 ip6h = data + nh_off;
84 if (ip6h + 1 > data_end)
85 return XDP_DROP;
86
87 if (ip6h->hop_limit <= 1)
88 return XDP_PASS;
89
90 fib_params.family = AF_INET6;
91 fib_params.flowinfo = *(__be32 *)ip6h & IPV6_FLOWINFO_MASK;
92 fib_params.l4_protocol = ip6h->nexthdr;
93 fib_params.sport = 0;
94 fib_params.dport = 0;
95 fib_params.tot_len = ntohs(ip6h->payload_len);
96 *src = ip6h->saddr;
97 *dst = ip6h->daddr;
98 } else {
99 return XDP_PASS;
100 }
101
102 fib_params.ifindex = ctx->ingress_ifindex;
103
104 rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
105 /*
106 * Some return codes (rc) from bpf_fib_lookup() are important for
107 * understanding how this XDP-prog interacts with the network stack.
108 *
109 * BPF_FIB_LKUP_RET_NO_NEIGH:
110 * Even if the route lookup succeeds, the MAC addresses are also
111 * needed. These come from the ARP/neighbour table, but if that table
112 * is (still) empty then BPF_FIB_LKUP_RET_NO_NEIGH is returned. To
113 * avoid doing the ARP lookup directly from XDP, send the packet to
114 * the normal network stack via XDP_PASS and let it do ARP resolution.
115 *
116 * BPF_FIB_LKUP_RET_FWD_DISABLED:
117 * bpf_fib_lookup respects the sysctl net.ipv{4,6}.conf.all.forwarding
118 * setting, and will return BPF_FIB_LKUP_RET_FWD_DISABLED if
119 * forwarding is not enabled on the ingress device.
120 */
121 if (rc == BPF_FIB_LKUP_RET_SUCCESS) {
122 /* Verify egress index has been configured as TX-port.
123 * (Note: User can still have inserted an egress ifindex that
124 * doesn't support XDP xmit, which will result in packet drops).
125 *
126 * Note: lookup in devmap supported since 0cdbb4b09a0.
127 * If not supported will fail with:
128 * cannot pass map_type 14 into func bpf_map_lookup_elem#1:
129 */
130 if (!bpf_map_lookup_elem(&xdp_tx_ports, &fib_params.ifindex))
131 return XDP_PASS;
132
133 if (h_proto == htons(ETH_P_IP))
134 ip_decrease_ttl(iph);
135 else if (h_proto == htons(ETH_P_IPV6))
136 ip6h->hop_limit--;
137
138 memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
139 memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
140 return bpf_redirect_map(&xdp_tx_ports, fib_params.ifindex, 0);
141 }
142
143 return XDP_PASS;
144}
145
146SEC("xdp_fwd")
147int xdp_fwd_prog(struct xdp_md *ctx)
148{
149 return xdp_fwd_flags(ctx, 0);
150}
151
152SEC("xdp_fwd_direct")
153int xdp_fwd_direct_prog(struct xdp_md *ctx)
154{
155 return xdp_fwd_flags(ctx, BPF_FIB_LOOKUP_DIRECT);
156}
157
158char _license[] SEC("license") = "GPL";
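
ip_decrease_ttl() above is the incremental IPv4 checksum update from include/net/ip.h: decrementing the TTL by one means adding 0x0100 (in network byte order) to the checksum and folding the end-around carry, in the style of RFC 1141. A standalone sketch of that arithmetic (ttl_dec_check is an illustrative name):

#include <arpa/inet.h>
#include <stdint.h>

/* Incremental IPv4 header checksum update for a TTL decrement of one:
 * add htons(0x0100) to the old checksum and fold the end-around carry,
 * exactly as the kernel helper does.
 */
static uint16_t ttl_dec_check(uint16_t check)
{
	uint32_t c = check;

	c += htons(0x0100);
	return (uint16_t)(c + (c >= 0xFFFF));
}
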
diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c
new file mode 100644
index 000000000..74a4583d0
--- /dev/null
+++ b/samples/bpf/xdp_fwd_user.c
@@ -0,0 +1,170 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13
14#include <linux/bpf.h>
15#include <linux/if_link.h>
16#include <linux/limits.h>
17#include <net/if.h>
18#include <errno.h>
19#include <stdio.h>
20#include <stdlib.h>
21#include <stdbool.h>
22#include <string.h>
23#include <unistd.h>
24#include <fcntl.h>
25#include <libgen.h>
26
27#include <bpf/libbpf.h>
28#include <bpf/bpf.h>
29
30static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
31
32static int do_attach(int idx, int prog_fd, int map_fd, const char *name)
33{
34 int err;
35
36 err = bpf_set_link_xdp_fd(idx, prog_fd, xdp_flags);
37 if (err < 0) {
38 printf("ERROR: failed to attach program to %s\n", name);
39 return err;
40 }
41
42 /* Adding ifindex as a possible egress TX port */
43 err = bpf_map_update_elem(map_fd, &idx, &idx, 0);
44 if (err)
45 printf("ERROR: failed using device %s as TX-port\n", name);
46
47 return err;
48}
49
50static int do_detach(int idx, const char *name)
51{
52 int err;
53
54 err = bpf_set_link_xdp_fd(idx, -1, xdp_flags);
55 if (err < 0)
56 printf("ERROR: failed to detach program from %s\n", name);
57
58 /* TODO: Remember to clean up the map when adding use of a shared map:
59 * bpf_map_delete_elem(map_fd, &idx);
60 */
61 return err;
62}
63
64static void usage(const char *prog)
65{
66 fprintf(stderr,
67 "usage: %s [OPTS] interface-list\n"
68 "\nOPTS:\n"
69 " -d detach program\n"
70 " -D direct table lookups (skip fib rules)\n",
71 prog);
72}
73
74int main(int argc, char **argv)
75{
76 struct bpf_prog_load_attr prog_load_attr = {
77 .prog_type = BPF_PROG_TYPE_XDP,
78 };
79 const char *prog_name = "xdp_fwd";
80 struct bpf_program *prog;
81 int prog_fd, map_fd = -1;
82 char filename[PATH_MAX];
83 struct bpf_object *obj;
84 int opt, i, idx, err;
85 int attach = 1;
86 int ret = 0;
87
88 while ((opt = getopt(argc, argv, ":dDSF")) != -1) {
89 switch (opt) {
90 case 'd':
91 attach = 0;
92 break;
93 case 'S':
94 xdp_flags |= XDP_FLAGS_SKB_MODE;
95 break;
96 case 'F':
97 xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
98 break;
99 case 'D':
100 prog_name = "xdp_fwd_direct";
101 break;
102 default:
103 usage(basename(argv[0]));
104 return 1;
105 }
106 }
107
108 if (!(xdp_flags & XDP_FLAGS_SKB_MODE))
109 xdp_flags |= XDP_FLAGS_DRV_MODE;
110
111 if (optind == argc) {
112 usage(basename(argv[0]));
113 return 1;
114 }
115
116 if (attach) {
117 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
118 prog_load_attr.file = filename;
119
120 if (access(filename, O_RDONLY) < 0) {
121 printf("error accessing file %s: %s\n",
122 filename, strerror(errno));
123 return 1;
124 }
125
126 err = bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd);
127 if (err) {
128 printf("Does kernel support devmap lookup?\n");
129 /* If not, the error message will be:
130 * "cannot pass map_type 14 into func bpf_map_lookup_elem#1"
131 */
132 return 1;
133 }
134
135 prog = bpf_object__find_program_by_title(obj, prog_name);
136 prog_fd = bpf_program__fd(prog);
137 if (prog_fd < 0) {
138 printf("program not found: %s\n", strerror(prog_fd));
139 return 1;
140 }
141 map_fd = bpf_map__fd(bpf_object__find_map_by_name(obj,
142 "xdp_tx_ports"));
143 if (map_fd < 0) {
144 printf("map not found: %s\n", strerror(map_fd));
145 return 1;
146 }
147 }
148
149 for (i = optind; i < argc; ++i) {
150 idx = if_nametoindex(argv[i]);
151 if (!idx)
152 idx = strtoul(argv[i], NULL, 0);
153
154 if (!idx) {
155 fprintf(stderr, "Invalid arg\n");
156 return 1;
157 }
158 if (!attach) {
159 err = do_detach(idx, argv[i]);
160 if (err)
161 ret = err;
162 } else {
163 err = do_attach(idx, prog_fd, map_fd, argv[i]);
164 if (err)
165 ret = err;
166 }
167 }
168
169 return ret;
170}
diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
new file mode 100644
index 000000000..5c955b812
--- /dev/null
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -0,0 +1,257 @@
1/* SPDX-License-Identifier: GPL-2.0
2 * Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc.
3 *
4 * XDP monitor tool, based on tracepoints
5 */
6#include <uapi/linux/bpf.h>
7#include <bpf/bpf_helpers.h>
8
9struct {
10 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
11 __type(key, u32);
12 __type(value, u64);
13 __uint(max_entries, 2);
14 /* TODO: have entries for all possible errno's */
15} redirect_err_cnt SEC(".maps");
16
17#define XDP_UNKNOWN XDP_REDIRECT + 1
18struct {
19 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
20 __type(key, u32);
21 __type(value, u64);
22 __uint(max_entries, XDP_UNKNOWN + 1);
23} exception_cnt SEC(".maps");
24
25/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
26 * Code in: kernel/include/trace/events/xdp.h
27 */
28struct xdp_redirect_ctx {
29 u64 __pad; // First 8 bytes are not accessible by bpf code
30 int prog_id; // offset:8; size:4; signed:1;
31 u32 act; // offset:12 size:4; signed:0;
32 int ifindex; // offset:16 size:4; signed:1;
33 int err; // offset:20 size:4; signed:1;
34 int to_ifindex; // offset:24 size:4; signed:1;
35 u32 map_id; // offset:28 size:4; signed:0;
36 int map_index; // offset:32 size:4; signed:1;
37}; // offset:36
38
39enum {
40 XDP_REDIRECT_SUCCESS = 0,
41 XDP_REDIRECT_ERROR = 1
42};
43
44static __always_inline
45int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
46{
47 u32 key = XDP_REDIRECT_ERROR;
48 int err = ctx->err;
49 u64 *cnt;
50
51 if (!err)
52 key = XDP_REDIRECT_SUCCESS;
53
54 cnt = bpf_map_lookup_elem(&redirect_err_cnt, &key);
55 if (!cnt)
56 return 1;
57 *cnt += 1;
58
59 return 0; /* Indicate event was filtered (no further processing) */
60 /*
61 * Returning 1 here would allow e.g. a perf-record tracepoint
62 * to see and record these events, but it doesn't work well
63 * in practice, as stopping perf-record also unloads this
64 * bpf_prog. Plus, there is additional overhead of doing so.
65 */
66}
67
68SEC("tracepoint/xdp/xdp_redirect_err")
69int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
70{
71 return xdp_redirect_collect_stat(ctx);
72}
73
74
75SEC("tracepoint/xdp/xdp_redirect_map_err")
76int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
77{
78 return xdp_redirect_collect_stat(ctx);
79}
80
81/* Likely unloaded when prog starts */
82SEC("tracepoint/xdp/xdp_redirect")
83int trace_xdp_redirect(struct xdp_redirect_ctx *ctx)
84{
85 return xdp_redirect_collect_stat(ctx);
86}
87
88/* Likely unloaded when prog starts */
89SEC("tracepoint/xdp/xdp_redirect_map")
90int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx)
91{
92 return xdp_redirect_collect_stat(ctx);
93}
94
95/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
96 * Code in: kernel/include/trace/events/xdp.h
97 */
98struct xdp_exception_ctx {
99 u64 __pad; // First 8 bytes are not accessible by bpf code
100 int prog_id; // offset:8; size:4; signed:1;
101 u32 act; // offset:12; size:4; signed:0;
102 int ifindex; // offset:16; size:4; signed:1;
103};
104
105SEC("tracepoint/xdp/xdp_exception")
106int trace_xdp_exception(struct xdp_exception_ctx *ctx)
107{
108 u64 *cnt;
109 u32 key;
110
111 key = ctx->act;
112 if (key > XDP_REDIRECT)
113 key = XDP_UNKNOWN;
114
115 cnt = bpf_map_lookup_elem(&exception_cnt, &key);
116 if (!cnt)
117 return 1;
118 *cnt += 1;
119
120 return 0;
121}
122
123/* Common stats data record shared with _user.c */
124struct datarec {
125 u64 processed;
126 u64 dropped;
127 u64 info;
128 u64 err;
129};
130#define MAX_CPUS 64
131
132struct {
133 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
134 __type(key, u32);
135 __type(value, struct datarec);
136 __uint(max_entries, MAX_CPUS);
137} cpumap_enqueue_cnt SEC(".maps");
138
139struct {
140 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
141 __type(key, u32);
142 __type(value, struct datarec);
143 __uint(max_entries, 1);
144} cpumap_kthread_cnt SEC(".maps");
145
146/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
147 * Code in: kernel/include/trace/events/xdp.h
148 */
149struct cpumap_enqueue_ctx {
150 u64 __pad; // First 8 bytes are not accessible by bpf code
151 int map_id; // offset:8; size:4; signed:1;
152 u32 act; // offset:12; size:4; signed:0;
153 int cpu; // offset:16; size:4; signed:1;
154 unsigned int drops; // offset:20; size:4; signed:0;
155 unsigned int processed; // offset:24; size:4; signed:0;
156 int to_cpu; // offset:28; size:4; signed:1;
157};
158
159SEC("tracepoint/xdp/xdp_cpumap_enqueue")
160int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
161{
162 u32 to_cpu = ctx->to_cpu;
163 struct datarec *rec;
164
165 if (to_cpu >= MAX_CPUS)
166 return 1;
167
168 rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
169 if (!rec)
170 return 0;
171 rec->processed += ctx->processed;
172 rec->dropped += ctx->drops;
173
174 /* Record bulk events, then userspace can calc average bulk size */
175 if (ctx->processed > 0)
176 rec->info += 1;
177
178 return 0;
179}
180
181/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
182 * Code in: kernel/include/trace/events/xdp.h
183 */
184struct cpumap_kthread_ctx {
185 u64 __pad; // First 8 bytes are not accessible by bpf code
186 int map_id; // offset:8; size:4; signed:1;
187 u32 act; // offset:12; size:4; signed:0;
188 int cpu; // offset:16; size:4; signed:1;
189 unsigned int drops; // offset:20; size:4; signed:0;
190 unsigned int processed; // offset:24; size:4; signed:0;
191 int sched; // offset:28; size:4; signed:1;
192};
193
194SEC("tracepoint/xdp/xdp_cpumap_kthread")
195int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
196{
197 struct datarec *rec;
198 u32 key = 0;
199
200 rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
201 if (!rec)
202 return 0;
203 rec->processed += ctx->processed;
204 rec->dropped += ctx->drops;
205
206 /* Count times kthread yielded CPU via schedule call */
207 if (ctx->sched)
208 rec->info++;
209
210 return 0;
211}
212
213struct {
214 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
215 __type(key, u32);
216 __type(value, struct datarec);
217 __uint(max_entries, 1);
218} devmap_xmit_cnt SEC(".maps");
219
220/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format
221 * Code in: kernel/include/trace/events/xdp.h
222 */
223struct devmap_xmit_ctx {
224 u64 __pad; // First 8 bytes are not accessible by bpf code
225 int from_ifindex; // offset:8; size:4; signed:1;
226 u32 act; // offset:12; size:4; signed:0;
227 int to_ifindex; // offset:16; size:4; signed:1;
228 int drops; // offset:20; size:4; signed:1;
229 int sent; // offset:24; size:4; signed:1;
230 int err; // offset:28; size:4; signed:1;
231};
232
233SEC("tracepoint/xdp/xdp_devmap_xmit")
234int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx)
235{
236 struct datarec *rec;
237 u32 key = 0;
238
239 rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key);
240 if (!rec)
241 return 0;
242 rec->processed += ctx->sent;
243 rec->dropped += ctx->drops;
244
245 /* Record bulk events, then userspace can calc average bulk size */
246 rec->info += 1;
247
248 /* Record error cases, where no frames were sent */
249 if (ctx->err)
250 rec->err++;
251
252 /* Catch API error where drv ndo_xdp_xmit sent more than count */
253 if (ctx->drops < 0)
254 rec->err++;
255
256 return 1;
257}
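
Each SEC("tracepoint/<category>/<name>") program above can be attached purely from its section title; the companion _user.c does this generically via bpf_object__for_each_program()/bpf_program__attach(). A minimal single-program sketch of the same mechanism (assumed object file name; bpf_program__attach() parses the section name itself):

#include <bpf/libbpf.h>

/* Sketch: open/load xdp_monitor_kern.o and attach one tracepoint
 * program by section title.
 */
static struct bpf_link *attach_exception_tp(void)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	struct bpf_link *link;

	obj = bpf_object__open_file("xdp_monitor_kern.o", NULL);
	if (libbpf_get_error(obj))
		return NULL;
	if (bpf_object__load(obj))
		return NULL;

	prog = bpf_object__find_program_by_title(obj,
					"tracepoint/xdp/xdp_exception");
	if (!prog)
		return NULL;

	link = bpf_program__attach(prog);
	return libbpf_get_error(link) ? NULL : link;
}
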
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
new file mode 100644
index 000000000..03d0a1829
--- /dev/null
+++ b/samples/bpf/xdp_monitor_user.c
@@ -0,0 +1,792 @@
1/* SPDX-License-Identifier: GPL-2.0
2 * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
3 */
4static const char *__doc__=
5 "XDP monitor tool, based on tracepoints\n"
6;
7
8static const char *__doc_err_only__=
9 " NOTICE: Only tracking XDP redirect errors\n"
10 " Enable TX success stats via '--stats'\n"
11 " (which comes with a per packet processing overhead)\n"
12;
13
14#include <errno.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <stdbool.h>
18#include <stdint.h>
19#include <string.h>
20#include <ctype.h>
21#include <unistd.h>
22#include <locale.h>
23
24#include <sys/resource.h>
25#include <getopt.h>
26#include <net/if.h>
27#include <time.h>
28
29#include <signal.h>
30#include <bpf/bpf.h>
31#include <bpf/libbpf.h>
32#include "bpf_util.h"
33
34enum map_type {
35 REDIRECT_ERR_CNT,
36 EXCEPTION_CNT,
37 CPUMAP_ENQUEUE_CNT,
38 CPUMAP_KTHREAD_CNT,
39 DEVMAP_XMIT_CNT,
40};
41
42static const char *const map_type_strings[] = {
43 [REDIRECT_ERR_CNT] = "redirect_err_cnt",
44 [EXCEPTION_CNT] = "exception_cnt",
45 [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt",
46 [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt",
47 [DEVMAP_XMIT_CNT] = "devmap_xmit_cnt",
48};
49
50#define NUM_MAP 5
51#define NUM_TP 8
52
53static int tp_cnt;
54static int map_cnt;
55static int verbose = 1;
56static bool debug = false;
57struct bpf_map *map_data[NUM_MAP] = {};
58struct bpf_link *tp_links[NUM_TP] = {};
59struct bpf_object *obj;
60
61static const struct option long_options[] = {
62 {"help", no_argument, NULL, 'h' },
63 {"debug", no_argument, NULL, 'D' },
64 {"stats", no_argument, NULL, 'S' },
65 {"sec", required_argument, NULL, 's' },
66 {0, 0, NULL, 0 }
67};
68
69static void int_exit(int sig)
70{
71 /* Detach tracepoints */
72 while (tp_cnt)
73 bpf_link__destroy(tp_links[--tp_cnt]);
74
75 bpf_object__close(obj);
76 exit(0);
77}
78
79/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */
80#define EXIT_FAIL_MEM 5
81
82static void usage(char *argv[])
83{
84 int i;
85 printf("\nDOCUMENTATION:\n%s\n", __doc__);
86 printf("\n");
87 printf(" Usage: %s (options-see-below)\n",
88 argv[0]);
89 printf(" Listing options:\n");
90 for (i = 0; long_options[i].name != 0; i++) {
91 printf(" --%-15s", long_options[i].name);
92 if (long_options[i].flag != NULL)
93 printf(" flag (internal value:%d)",
94 *long_options[i].flag);
95 else
96 printf("short-option: -%c",
97 long_options[i].val);
98 printf("\n");
99 }
100 printf("\n");
101}
102
103#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
104static __u64 gettime(void)
105{
106 struct timespec t;
107 int res;
108
109 res = clock_gettime(CLOCK_MONOTONIC, &t);
110 if (res < 0) {
111 fprintf(stderr, "Error with gettimeofday! (%i)\n", res);
112 exit(EXIT_FAILURE);
113 }
114 return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
115}
116
117enum {
118 REDIR_SUCCESS = 0,
119 REDIR_ERROR = 1,
120};
121#define REDIR_RES_MAX 2
122static const char *redir_names[REDIR_RES_MAX] = {
123 [REDIR_SUCCESS] = "Success",
124 [REDIR_ERROR] = "Error",
125};
126static const char *err2str(int err)
127{
128 if (err < REDIR_RES_MAX)
129 return redir_names[err];
130 return NULL;
131}
132/* enum xdp_action */
133#define XDP_UNKNOWN XDP_REDIRECT + 1
134#define XDP_ACTION_MAX (XDP_UNKNOWN + 1)
135static const char *xdp_action_names[XDP_ACTION_MAX] = {
136 [XDP_ABORTED] = "XDP_ABORTED",
137 [XDP_DROP] = "XDP_DROP",
138 [XDP_PASS] = "XDP_PASS",
139 [XDP_TX] = "XDP_TX",
140 [XDP_REDIRECT] = "XDP_REDIRECT",
141 [XDP_UNKNOWN] = "XDP_UNKNOWN",
142};
143static const char *action2str(int action)
144{
145 if (action < XDP_ACTION_MAX)
146 return xdp_action_names[action];
147 return NULL;
148}
149
150/* Common stats data record shared with _kern.c */
151struct datarec {
152 __u64 processed;
153 __u64 dropped;
154 __u64 info;
155 __u64 err;
156};
157#define MAX_CPUS 64
158
159/* Userspace structs for collection of stats from maps */
160struct record {
161 __u64 timestamp;
162 struct datarec total;
163 struct datarec *cpu;
164};
165struct u64rec {
166 __u64 processed;
167};
168struct record_u64 {
169 /* record for _kern side __u64 values */
170 __u64 timestamp;
171 struct u64rec total;
172 struct u64rec *cpu;
173};
174
175struct stats_record {
176 struct record_u64 xdp_redirect[REDIR_RES_MAX];
177 struct record_u64 xdp_exception[XDP_ACTION_MAX];
178 struct record xdp_cpumap_kthread;
179 struct record xdp_cpumap_enqueue[MAX_CPUS];
180 struct record xdp_devmap_xmit;
181};
182
183static bool map_collect_record(int fd, __u32 key, struct record *rec)
184{
185 /* For percpu maps, userspace gets a value per possible CPU */
186 unsigned int nr_cpus = bpf_num_possible_cpus();
187 struct datarec values[nr_cpus];
188 __u64 sum_processed = 0;
189 __u64 sum_dropped = 0;
190 __u64 sum_info = 0;
191 __u64 sum_err = 0;
192 int i;
193
194 if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
195 fprintf(stderr,
196 "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
197 return false;
198 }
199 /* Get time as close as possible to reading map contents */
200 rec->timestamp = gettime();
201
202 /* Record and sum values from each CPU */
203 for (i = 0; i < nr_cpus; i++) {
204 rec->cpu[i].processed = values[i].processed;
205 sum_processed += values[i].processed;
206 rec->cpu[i].dropped = values[i].dropped;
207 sum_dropped += values[i].dropped;
208 rec->cpu[i].info = values[i].info;
209 sum_info += values[i].info;
210 rec->cpu[i].err = values[i].err;
211 sum_err += values[i].err;
212 }
213 rec->total.processed = sum_processed;
214 rec->total.dropped = sum_dropped;
215 rec->total.info = sum_info;
216 rec->total.err = sum_err;
217 return true;
218}
219
220static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec)
221{
222 /* For percpu maps, userspace gets a value per possible CPU */
223 unsigned int nr_cpus = bpf_num_possible_cpus();
224 struct u64rec values[nr_cpus];
225 __u64 sum_total = 0;
226 int i;
227
228 if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
229 fprintf(stderr,
230 "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
231 return false;
232 }
233 /* Get time as close as possible to reading map contents */
234 rec->timestamp = gettime();
235
236 /* Record and sum values from each CPU */
237 for (i = 0; i < nr_cpus; i++) {
238 rec->cpu[i].processed = values[i].processed;
239 sum_total += values[i].processed;
240 }
241 rec->total.processed = sum_total;
242 return true;
243}
244
245static double calc_period(struct record *r, struct record *p)
246{
247 double period_ = 0;
248 __u64 period = 0;
249
250 period = r->timestamp - p->timestamp;
251 if (period > 0)
252 period_ = ((double) period / NANOSEC_PER_SEC);
253
254 return period_;
255}
256
257static double calc_period_u64(struct record_u64 *r, struct record_u64 *p)
258{
259 double period_ = 0;
260 __u64 period = 0;
261
262 period = r->timestamp - p->timestamp;
263 if (period > 0)
264 period_ = ((double) period / NANOSEC_PER_SEC);
265
266 return period_;
267}
268
269static double calc_pps(struct datarec *r, struct datarec *p, double period)
270{
271 __u64 packets = 0;
272 double pps = 0;
273
274 if (period > 0) {
275 packets = r->processed - p->processed;
276 pps = packets / period;
277 }
278 return pps;
279}
280
281static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period)
282{
283 __u64 packets = 0;
284 double pps = 0;
285
286 if (period > 0) {
287 packets = r->processed - p->processed;
288 pps = packets / period;
289 }
290 return pps;
291}
292
293static double calc_drop(struct datarec *r, struct datarec *p, double period)
294{
295 __u64 packets = 0;
296 double pps = 0;
297
298 if (period > 0) {
299 packets = r->dropped - p->dropped;
300 pps = packets / period;
301 }
302 return pps;
303}
304
305static double calc_info(struct datarec *r, struct datarec *p, double period)
306{
307 __u64 packets = 0;
308 double pps = 0;
309
310 if (period > 0) {
311 packets = r->info - p->info;
312 pps = packets / period;
313 }
314 return pps;
315}
316
317static double calc_err(struct datarec *r, struct datarec *p, double period)
318{
319 __u64 packets = 0;
320 double pps = 0;
321
322 if (period > 0) {
323 packets = r->err - p->err;
324 pps = packets / period;
325 }
326 return pps;
327}
328
329static void stats_print(struct stats_record *stats_rec,
330 struct stats_record *stats_prev,
331 bool err_only)
332{
333 unsigned int nr_cpus = bpf_num_possible_cpus();
334 int rec_i = 0, i, to_cpu;
335 double t = 0, pps = 0;
336
337 /* Header */
338 printf("%-15s %-7s %-12s %-12s %-9s\n",
339 "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info");
340
341 /* tracepoint: xdp:xdp_redirect_* */
342 if (err_only)
343 rec_i = REDIR_ERROR;
344
345 for (; rec_i < REDIR_RES_MAX; rec_i++) {
346 struct record_u64 *rec, *prev;
347 char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
348 char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
349
350 rec = &stats_rec->xdp_redirect[rec_i];
351 prev = &stats_prev->xdp_redirect[rec_i];
352 t = calc_period_u64(rec, prev);
353
354 for (i = 0; i < nr_cpus; i++) {
355 struct u64rec *r = &rec->cpu[i];
356 struct u64rec *p = &prev->cpu[i];
357
358 pps = calc_pps_u64(r, p, t);
359 if (pps > 0)
360 printf(fmt1, "XDP_REDIRECT", i,
361 rec_i ? 0.0: pps, rec_i ? pps : 0.0,
362 err2str(rec_i));
363 }
364 pps = calc_pps_u64(&rec->total, &prev->total, t);
365 printf(fmt2, "XDP_REDIRECT", "total",
366 rec_i ? 0.0: pps, rec_i ? pps : 0.0, err2str(rec_i));
367 }
368
369 /* tracepoint: xdp:xdp_exception */
370 for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) {
371 struct record_u64 *rec, *prev;
372 char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
373 char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
374
375 rec = &stats_rec->xdp_exception[rec_i];
376 prev = &stats_prev->xdp_exception[rec_i];
377 t = calc_period_u64(rec, prev);
378
379 for (i = 0; i < nr_cpus; i++) {
380 struct u64rec *r = &rec->cpu[i];
381 struct u64rec *p = &prev->cpu[i];
382
383 pps = calc_pps_u64(r, p, t);
384 if (pps > 0)
385 printf(fmt1, "Exception", i,
386 0.0, pps, action2str(rec_i));
387 }
388 pps = calc_pps_u64(&rec->total, &prev->total, t);
389 if (pps > 0)
390 printf(fmt2, "Exception", "total",
391 0.0, pps, action2str(rec_i));
392 }
393
394 /* cpumap enqueue stats */
395 for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) {
396 char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
397 char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
398 struct record *rec, *prev;
399 char *info_str = "";
400 double drop, info;
401
402 rec = &stats_rec->xdp_cpumap_enqueue[to_cpu];
403 prev = &stats_prev->xdp_cpumap_enqueue[to_cpu];
404 t = calc_period(rec, prev);
405 for (i = 0; i < nr_cpus; i++) {
406 struct datarec *r = &rec->cpu[i];
407 struct datarec *p = &prev->cpu[i];
408
409 pps = calc_pps(r, p, t);
410 drop = calc_drop(r, p, t);
411 info = calc_info(r, p, t);
412 if (info > 0) {
413 info_str = "bulk-average";
414 info = pps / info; /* calc average bulk size */
415 }
416 if (pps > 0)
417 printf(fmt1, "cpumap-enqueue",
418 i, to_cpu, pps, drop, info, info_str);
419 }
420 pps = calc_pps(&rec->total, &prev->total, t);
421 if (pps > 0) {
422 drop = calc_drop(&rec->total, &prev->total, t);
423 info = calc_info(&rec->total, &prev->total, t);
424 if (info > 0) {
425 info_str = "bulk-average";
426 info = pps / info; /* calc average bulk size */
427 }
428 printf(fmt2, "cpumap-enqueue",
429 "sum", to_cpu, pps, drop, info, info_str);
430 }
431 }
432
433 /* cpumap kthread stats */
434 {
435 char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n";
436 char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n";
437 struct record *rec, *prev;
438 double drop, info;
439 char *i_str = "";
440
441 rec = &stats_rec->xdp_cpumap_kthread;
442 prev = &stats_prev->xdp_cpumap_kthread;
443 t = calc_period(rec, prev);
444 for (i = 0; i < nr_cpus; i++) {
445 struct datarec *r = &rec->cpu[i];
446 struct datarec *p = &prev->cpu[i];
447
448 pps = calc_pps(r, p, t);
449 drop = calc_drop(r, p, t);
450 info = calc_info(r, p, t);
451 if (info > 0)
452 i_str = "sched";
453 if (pps > 0 || drop > 0)
454 printf(fmt1, "cpumap-kthread",
455 i, pps, drop, info, i_str);
456 }
457 pps = calc_pps(&rec->total, &prev->total, t);
458 drop = calc_drop(&rec->total, &prev->total, t);
459 info = calc_info(&rec->total, &prev->total, t);
460 if (info > 0)
461 i_str = "sched-sum";
462 printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
463 }
464
465 /* devmap ndo_xdp_xmit stats */
466 {
467 char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n";
468 char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n";
469 struct record *rec, *prev;
470 double drop, info, err;
471 char *i_str = "";
472 char *err_str = "";
473
474 rec = &stats_rec->xdp_devmap_xmit;
475 prev = &stats_prev->xdp_devmap_xmit;
476 t = calc_period(rec, prev);
477 for (i = 0; i < nr_cpus; i++) {
478 struct datarec *r = &rec->cpu[i];
479 struct datarec *p = &prev->cpu[i];
480
481 pps = calc_pps(r, p, t);
482 drop = calc_drop(r, p, t);
483 info = calc_info(r, p, t);
484 err = calc_err(r, p, t);
485 if (info > 0) {
486 i_str = "bulk-average";
487 info = (pps+drop) / info; /* calc avg bulk */
488 }
489 if (err > 0)
490 err_str = "drv-err";
491 if (pps > 0 || drop > 0)
492 printf(fmt1, "devmap-xmit",
493 i, pps, drop, info, i_str, err_str);
494 }
495 pps = calc_pps(&rec->total, &prev->total, t);
496 drop = calc_drop(&rec->total, &prev->total, t);
497 info = calc_info(&rec->total, &prev->total, t);
498 err = calc_err(&rec->total, &prev->total, t);
499 if (info > 0) {
500 i_str = "bulk-average";
501 info = (pps+drop) / info; /* calc avg bulk */
502 }
503 if (err > 0)
504 err_str = "drv-err";
505 printf(fmt2, "devmap-xmit", "total", pps, drop,
506 info, i_str, err_str);
507 }
508
509 printf("\n");
510}
511
512static bool stats_collect(struct stats_record *rec)
513{
514 int fd;
515 int i;
516
517 /* TODO: Detect if someone unloaded the perf event_fd's, as
518 * this can happen when someone runs perf-record -e
519 */
520
521 fd = bpf_map__fd(map_data[REDIRECT_ERR_CNT]);
522 for (i = 0; i < REDIR_RES_MAX; i++)
523 map_collect_record_u64(fd, i, &rec->xdp_redirect[i]);
524
525 fd = bpf_map__fd(map_data[EXCEPTION_CNT]);
526 for (i = 0; i < XDP_ACTION_MAX; i++) {
527 map_collect_record_u64(fd, i, &rec->xdp_exception[i]);
528 }
529
530 fd = bpf_map__fd(map_data[CPUMAP_ENQUEUE_CNT]);
531 for (i = 0; i < MAX_CPUS; i++)
532 map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]);
533
534 fd = bpf_map__fd(map_data[CPUMAP_KTHREAD_CNT]);
535 map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
536
537 fd = bpf_map__fd(map_data[DEVMAP_XMIT_CNT]);
538 map_collect_record(fd, 0, &rec->xdp_devmap_xmit);
539
540 return true;
541}
542
543static void *alloc_rec_per_cpu(int record_size)
544{
545 unsigned int nr_cpus = bpf_num_possible_cpus();
546 void *array;
547
548 array = calloc(nr_cpus, record_size);
549 if (!array) {
550 fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
551 exit(EXIT_FAIL_MEM);
552 }
553 return array;
554}
555
556static struct stats_record *alloc_stats_record(void)
557{
558 struct stats_record *rec;
559 int rec_sz;
560 int i;
561
562 /* Alloc main stats_record structure */
563 rec = calloc(1, sizeof(*rec));
564 if (!rec) {
565 fprintf(stderr, "Mem alloc error\n");
566 exit(EXIT_FAIL_MEM);
567 }
568
569 /* Alloc stats stored per CPU for each record */
570 rec_sz = sizeof(struct u64rec);
571 for (i = 0; i < REDIR_RES_MAX; i++)
572 rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz);
573
574 for (i = 0; i < XDP_ACTION_MAX; i++)
575 rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz);
576
577 rec_sz = sizeof(struct datarec);
578 rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
579 rec->xdp_devmap_xmit.cpu = alloc_rec_per_cpu(rec_sz);
580
581 for (i = 0; i < MAX_CPUS; i++)
582 rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
583
584 return rec;
585}
586
587static void free_stats_record(struct stats_record *r)
588{
589 int i;
590
591 for (i = 0; i < REDIR_RES_MAX; i++)
592 free(r->xdp_redirect[i].cpu);
593
594 for (i = 0; i < XDP_ACTION_MAX; i++)
595 free(r->xdp_exception[i].cpu);
596
597 free(r->xdp_cpumap_kthread.cpu);
598 free(r->xdp_devmap_xmit.cpu);
599
600 for (i = 0; i < MAX_CPUS; i++)
601 free(r->xdp_cpumap_enqueue[i].cpu);
602
603 free(r);
604}
605
606/* Pointer swap trick */
607static inline void swap(struct stats_record **a, struct stats_record **b)
608{
609 struct stats_record *tmp;
610
611 tmp = *a;
612 *a = *b;
613 *b = tmp;
614}
615
616static void stats_poll(int interval, bool err_only)
617{
618 struct stats_record *rec, *prev;
619
620 rec = alloc_stats_record();
621 prev = alloc_stats_record();
622 stats_collect(rec);
623
624 if (err_only)
625 printf("\n%s\n", __doc_err_only__);
626
627 /* Trick: to pretty-print with thousands separators, use %' */
628 setlocale(LC_NUMERIC, "en_US");
629
630 /* Header */
631 if (verbose)
632 printf("\n%s", __doc__);
633
634 /* TODO Need more advanced stats on error types */
635 if (verbose) {
636 printf(" - Stats map0: %s\n", bpf_map__name(map_data[0]));
637 printf(" - Stats map1: %s\n", bpf_map__name(map_data[1]));
638 printf("\n");
639 }
640 fflush(stdout);
641
642 while (1) {
643 swap(&prev, &rec);
644 stats_collect(rec);
645 stats_print(rec, prev, err_only);
646 fflush(stdout);
647 sleep(interval);
648 }
649
650 free_stats_record(rec);
651 free_stats_record(prev);
652}
653
654static void print_bpf_prog_info(void)
655{
656 struct bpf_program *prog;
657 struct bpf_map *map;
658 int i = 0;
659
660 /* Prog info */
661 printf("Loaded BPF prog have %d bpf program(s)\n", tp_cnt);
662 bpf_object__for_each_program(prog, obj) {
663 printf(" - prog_fd[%d] = fd(%d)\n", i, bpf_program__fd(prog));
664 i++;
665 }
666
667 i = 0;
668 /* Maps info */
669 printf("Loaded BPF prog have %d map(s)\n", map_cnt);
670 bpf_object__for_each_map(map, obj) {
671 const char *name = bpf_map__name(map);
672 int fd = bpf_map__fd(map);
673
674 printf(" - map_data[%d] = fd(%d) name:%s\n", i, fd, name);
675 i++;
676 }
677
678 /* Event info */
679 printf("Searching for (max:%d) event file descriptor(s)\n", tp_cnt);
680 for (i = 0; i < tp_cnt; i++) {
681 int fd = bpf_link__fd(tp_links[i]);
682
683 if (fd != -1)
684 printf(" - event_fd[%d] = fd(%d)\n", i, fd);
685 }
686}
687
688int main(int argc, char **argv)
689{
690 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
691 struct bpf_program *prog;
692 int longindex = 0, opt;
693 int ret = EXIT_FAILURE;
694 enum map_type type;
695 char filename[256];
696
697 /* Default settings: */
698 bool errors_only = true;
699 int interval = 2;
700
701 /* Parse commands line args */
702 while ((opt = getopt_long(argc, argv, "hDSs:",
703 long_options, &longindex)) != -1) {
704 switch (opt) {
705 case 'D':
706 debug = true;
707 break;
708 case 'S':
709 errors_only = false;
710 break;
711 case 's':
712 interval = atoi(optarg);
713 break;
714 case 'h':
715 default:
716 usage(argv);
717 return ret;
718 }
719 }
720
721 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
722 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
723 perror("setrlimit(RLIMIT_MEMLOCK)");
724 return ret;
725 }
726
727 /* Remove tracepoint program when program is interrupted or killed */
728 signal(SIGINT, int_exit);
729 signal(SIGTERM, int_exit);
730
731 obj = bpf_object__open_file(filename, NULL);
732 if (libbpf_get_error(obj)) {
733 printf("ERROR: opening BPF object file failed\n");
734 obj = NULL;
735 goto cleanup;
736 }
737
738 /* load BPF program */
739 if (bpf_object__load(obj)) {
740 printf("ERROR: loading BPF object file failed\n");
741 goto cleanup;
742 }
743
744 for (type = 0; type < NUM_MAP; type++) {
745 map_data[type] =
746 bpf_object__find_map_by_name(obj, map_type_strings[type]);
747
748 if (libbpf_get_error(map_data[type])) {
749 printf("ERROR: finding a map in obj file failed\n");
750 goto cleanup;
751 }
752 map_cnt++;
753 }
754
755 bpf_object__for_each_program(prog, obj) {
756 tp_links[tp_cnt] = bpf_program__attach(prog);
757 if (libbpf_get_error(tp_links[tp_cnt])) {
758 printf("ERROR: bpf_program__attach failed\n");
759 tp_links[tp_cnt] = NULL;
760 goto cleanup;
761 }
762 tp_cnt++;
763 }
764
765 if (debug) {
766 print_bpf_prog_info();
767 }
768
769 /* Unload/stop tracepoint event by closing bpf_link's */
770 if (errors_only) {
771 /* The tp_links[i] indexes depend on the order in which
772 * the functions were defined in _kern.c
773 */
774 bpf_link__destroy(tp_links[2]); /* tracepoint/xdp/xdp_redirect */
775 tp_links[2] = NULL;
776
777 bpf_link__destroy(tp_links[3]); /* tracepoint/xdp/xdp_redirect_map */
778 tp_links[3] = NULL;
779 }
780
781 stats_poll(interval, errors_only);
782
783 ret = EXIT_SUCCESS;
784
785cleanup:
786 /* Detach tracepoints */
787 while (tp_cnt)
788 bpf_link__destroy(tp_links[--tp_cnt]);
789
790 bpf_object__close(obj);
791 return ret;
792}
diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu_kern.c
new file mode 100644
index 000000000..8255025de
--- /dev/null
+++ b/samples/bpf/xdp_redirect_cpu_kern.c
@@ -0,0 +1,730 @@
1/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
2 *
3 * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
4 */
5#include <uapi/linux/if_ether.h>
6#include <uapi/linux/if_packet.h>
7#include <uapi/linux/if_vlan.h>
8#include <uapi/linux/ip.h>
9#include <uapi/linux/ipv6.h>
10#include <uapi/linux/in.h>
11#include <uapi/linux/tcp.h>
12#include <uapi/linux/udp.h>
13
14#include <uapi/linux/bpf.h>
15#include <bpf/bpf_helpers.h>
16#include "hash_func01.h"
17
18#define MAX_CPUS NR_CPUS
19
20/* Special map type that can XDP_REDIRECT frames to another CPU */
21struct {
22 __uint(type, BPF_MAP_TYPE_CPUMAP);
23 __uint(key_size, sizeof(u32));
24 __uint(value_size, sizeof(struct bpf_cpumap_val));
25 __uint(max_entries, MAX_CPUS);
26} cpu_map SEC(".maps");
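Entries in this cpumap are created from userspace. A minimal sketch, assuming an already-loaded bpf_object for this file and a kernel new enough to take struct bpf_cpumap_val as the map value; the helper name add_cpu_to_cpumap is hypothetical, and the matching _user.c further below does the same work in create_cpu_entry():

```c
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* Hypothetical helper: insert one CPU into the cpu_map above.
 * 'obj' is assumed to be an already-loaded bpf_object for this file.
 */
static int add_cpu_to_cpumap(struct bpf_object *obj, __u32 cpu, __u32 qsize)
{
	struct bpf_cpumap_val value = { .qsize = qsize };
	int fd = bpf_object__find_map_fd_by_name(obj, "cpu_map");

	if (fd < 0)
		return fd;
	/* Updating the element makes the kernel allocate the kthread
	 * and queue backing this cpumap slot.
	 */
	return bpf_map_update_elem(fd, &cpu, &value, 0);
}
```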
27
28/* Common stats data record to keep userspace more simple */
29struct datarec {
30 __u64 processed;
31 __u64 dropped;
32 __u64 issue;
33 __u64 xdp_pass;
34 __u64 xdp_drop;
35 __u64 xdp_redirect;
36};
37
38/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
39 * feedback. Redirect TX errors can be caught via a tracepoint.
40 */
41struct {
42 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
43 __type(key, u32);
44 __type(value, struct datarec);
45 __uint(max_entries, 1);
46} rx_cnt SEC(".maps");
47
48/* Used by trace point */
49struct {
50 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
51 __type(key, u32);
52 __type(value, struct datarec);
53 __uint(max_entries, 2);
54 /* TODO: have entries for all possible errno's */
55} redirect_err_cnt SEC(".maps");
56
57/* Used by trace point */
58struct {
59 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
60 __type(key, u32);
61 __type(value, struct datarec);
62 __uint(max_entries, MAX_CPUS);
63} cpumap_enqueue_cnt SEC(".maps");
64
65/* Used by trace point */
66struct {
67 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
68 __type(key, u32);
69 __type(value, struct datarec);
70 __uint(max_entries, 1);
71} cpumap_kthread_cnt SEC(".maps");
72
73/* Set of maps controlling available CPUs, and for iterating through
74 * selectable redirect CPUs.
75 */
76struct {
77 __uint(type, BPF_MAP_TYPE_ARRAY);
78 __type(key, u32);
79 __type(value, u32);
80 __uint(max_entries, MAX_CPUS);
81} cpus_available SEC(".maps");
82struct {
83 __uint(type, BPF_MAP_TYPE_ARRAY);
84 __type(key, u32);
85 __type(value, u32);
86 __uint(max_entries, 1);
87} cpus_count SEC(".maps");
88struct {
89 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
90 __type(key, u32);
91 __type(value, u32);
92 __uint(max_entries, 1);
93} cpus_iterator SEC(".maps");
94
95/* Used by trace point */
96struct {
97 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
98 __type(key, u32);
99 __type(value, struct datarec);
100 __uint(max_entries, 1);
101} exception_cnt SEC(".maps");
102
103/* Helper parse functions */
104
105/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
106 *
107 * Returns false on error or for unsupported ether-types
108 */
109struct vlan_hdr {
110 __be16 h_vlan_TCI;
111 __be16 h_vlan_encapsulated_proto;
112};
113
114static __always_inline
115bool parse_eth(struct ethhdr *eth, void *data_end,
116 u16 *eth_proto, u64 *l3_offset)
117{
118 u16 eth_type;
119 u64 offset;
120
121 offset = sizeof(*eth);
122 if ((void *)eth + offset > data_end)
123 return false;
124
125 eth_type = eth->h_proto;
126
127 /* Skip non 802.3 Ethertypes */
128 if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
129 return false;
130
131 /* Handle VLAN tagged packet */
132 if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
133 struct vlan_hdr *vlan_hdr;
134
135 vlan_hdr = (void *)eth + offset;
136 offset += sizeof(*vlan_hdr);
137 if ((void *)eth + offset > data_end)
138 return false;
139 eth_type = vlan_hdr->h_vlan_encapsulated_proto;
140 }
141 /* Handle double VLAN tagged packet */
142 if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
143 struct vlan_hdr *vlan_hdr;
144
145 vlan_hdr = (void *)eth + offset;
146 offset += sizeof(*vlan_hdr);
147 if ((void *)eth + offset > data_end)
148 return false;
149 eth_type = vlan_hdr->h_vlan_encapsulated_proto;
150 }
151
152 *eth_proto = ntohs(eth_type);
153 *l3_offset = offset;
154 return true;
155}
156
157static __always_inline
158u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
159{
160 void *data_end = (void *)(long)ctx->data_end;
161 void *data = (void *)(long)ctx->data;
162 struct iphdr *iph = data + nh_off;
163 struct udphdr *udph;
164 u16 dport;
165
166 if (iph + 1 > data_end)
167 return 0;
168 if (!(iph->protocol == IPPROTO_UDP))
169 return 0;
170
171 udph = (void *)(iph + 1);
172 if (udph + 1 > data_end)
173 return 0;
174
175 dport = ntohs(udph->dest);
176 return dport;
177}
178
179static __always_inline
180int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
181{
182 void *data_end = (void *)(long)ctx->data_end;
183 void *data = (void *)(long)ctx->data;
184 struct iphdr *iph = data + nh_off;
185
186 if (iph + 1 > data_end)
187 return 0;
188 return iph->protocol;
189}
190
191static __always_inline
192int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
193{
194 void *data_end = (void *)(long)ctx->data_end;
195 void *data = (void *)(long)ctx->data;
196 struct ipv6hdr *ip6h = data + nh_off;
197
198 if (ip6h + 1 > data_end)
199 return 0;
200 return ip6h->nexthdr;
201}
202
203SEC("xdp_cpu_map0")
204int xdp_prognum0_no_touch(struct xdp_md *ctx)
205{
206 void *data_end = (void *)(long)ctx->data_end;
207 void *data = (void *)(long)ctx->data;
208 struct datarec *rec;
209 u32 *cpu_selected;
210 u32 cpu_dest;
211 u32 key = 0;
212
213 /* Only use first entry in cpus_available */
214 cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
215 if (!cpu_selected)
216 return XDP_ABORTED;
217 cpu_dest = *cpu_selected;
218
219 /* Count RX packet in map */
220 rec = bpf_map_lookup_elem(&rx_cnt, &key);
221 if (!rec)
222 return XDP_ABORTED;
223 rec->processed++;
224
225 if (cpu_dest >= MAX_CPUS) {
226 rec->issue++;
227 return XDP_ABORTED;
228 }
229
230 return bpf_redirect_map(&cpu_map, cpu_dest, 0);
231}
232
233SEC("xdp_cpu_map1_touch_data")
234int xdp_prognum1_touch_data(struct xdp_md *ctx)
235{
236 void *data_end = (void *)(long)ctx->data_end;
237 void *data = (void *)(long)ctx->data;
238 struct ethhdr *eth = data;
239 struct datarec *rec;
240 u32 *cpu_selected;
241 u32 cpu_dest;
242 u16 eth_type;
243 u32 key = 0;
244
245 /* Only use first entry in cpus_available */
246 cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
247 if (!cpu_selected)
248 return XDP_ABORTED;
249 cpu_dest = *cpu_selected;
250
251 /* Validate packet length is minimum Eth header size */
252 if (eth + 1 > data_end)
253 return XDP_ABORTED;
254
255 /* Count RX packet in map */
256 rec = bpf_map_lookup_elem(&rx_cnt, &key);
257 if (!rec)
258 return XDP_ABORTED;
259 rec->processed++;
260
261 /* Read packet data, and use it (drop non 802.3 Ethertypes) */
262 eth_type = eth->h_proto;
263 if (ntohs(eth_type) < ETH_P_802_3_MIN) {
264 rec->dropped++;
265 return XDP_DROP;
266 }
267
268 if (cpu_dest >= MAX_CPUS) {
269 rec->issue++;
270 return XDP_ABORTED;
271 }
272
273 return bpf_redirect_map(&cpu_map, cpu_dest, 0);
274}
275
276SEC("xdp_cpu_map2_round_robin")
277int xdp_prognum2_round_robin(struct xdp_md *ctx)
278{
279 void *data_end = (void *)(long)ctx->data_end;
280 void *data = (void *)(long)ctx->data;
281 struct ethhdr *eth = data;
282 struct datarec *rec;
283 u32 cpu_dest;
284 u32 *cpu_lookup;
285 u32 key0 = 0;
286
287 u32 *cpu_selected;
288 u32 *cpu_iterator;
289 u32 *cpu_max;
290 u32 cpu_idx;
291
292 cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
293 if (!cpu_max)
294 return XDP_ABORTED;
295
296 cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
297 if (!cpu_iterator)
298 return XDP_ABORTED;
299 cpu_idx = *cpu_iterator;
300
301 *cpu_iterator += 1;
302 if (*cpu_iterator == *cpu_max)
303 *cpu_iterator = 0;
304
305 cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
306 if (!cpu_selected)
307 return XDP_ABORTED;
308 cpu_dest = *cpu_selected;
309
310 /* Count RX packet in map */
311 rec = bpf_map_lookup_elem(&rx_cnt, &key0);
312 if (!rec)
313 return XDP_ABORTED;
314 rec->processed++;
315
316 if (cpu_dest >= MAX_CPUS) {
317 rec->issue++;
318 return XDP_ABORTED;
319 }
320
321 return bpf_redirect_map(&cpu_map, cpu_dest, 0);
322}
323
324SEC("xdp_cpu_map3_proto_separate")
325int xdp_prognum3_proto_separate(struct xdp_md *ctx)
326{
327 void *data_end = (void *)(long)ctx->data_end;
328 void *data = (void *)(long)ctx->data;
329 struct ethhdr *eth = data;
330 u8 ip_proto = IPPROTO_UDP;
331 struct datarec *rec;
332 u16 eth_proto = 0;
333 u64 l3_offset = 0;
334 u32 cpu_dest = 0;
335 u32 cpu_idx = 0;
336 u32 *cpu_lookup;
337 u32 key = 0;
338
339 /* Count RX packet in map */
340 rec = bpf_map_lookup_elem(&rx_cnt, &key);
341 if (!rec)
342 return XDP_ABORTED;
343 rec->processed++;
344
345 if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
346 return XDP_PASS; /* Just skip */
347
348 /* Extract L4 protocol */
349 switch (eth_proto) {
350 case ETH_P_IP:
351 ip_proto = get_proto_ipv4(ctx, l3_offset);
352 break;
353 case ETH_P_IPV6:
354 ip_proto = get_proto_ipv6(ctx, l3_offset);
355 break;
356 case ETH_P_ARP:
357 cpu_idx = 0; /* ARP packet handled on separate CPU */
358 break;
359 default:
360 cpu_idx = 0;
361 }
362
363 /* Choose CPU based on L4 protocol */
364 switch (ip_proto) {
365 case IPPROTO_ICMP:
366 case IPPROTO_ICMPV6:
367 cpu_idx = 2;
368 break;
369 case IPPROTO_TCP:
370 cpu_idx = 0;
371 break;
372 case IPPROTO_UDP:
373 cpu_idx = 1;
374 break;
375 default:
376 cpu_idx = 0;
377 }
378
379 cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
380 if (!cpu_lookup)
381 return XDP_ABORTED;
382 cpu_dest = *cpu_lookup;
383
384 if (cpu_dest >= MAX_CPUS) {
385 rec->issue++;
386 return XDP_ABORTED;
387 }
388
389 return bpf_redirect_map(&cpu_map, cpu_dest, 0);
390}
391
392SEC("xdp_cpu_map4_ddos_filter_pktgen")
393int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
394{
395 void *data_end = (void *)(long)ctx->data_end;
396 void *data = (void *)(long)ctx->data;
397 struct ethhdr *eth = data;
398 u8 ip_proto = IPPROTO_UDP;
399 struct datarec *rec;
400 u16 eth_proto = 0;
401 u64 l3_offset = 0;
402 u32 cpu_dest = 0;
403 u32 cpu_idx = 0;
404 u16 dest_port;
405 u32 *cpu_lookup;
406 u32 key = 0;
407
408 /* Count RX packet in map */
409 rec = bpf_map_lookup_elem(&rx_cnt, &key);
410 if (!rec)
411 return XDP_ABORTED;
412 rec->processed++;
413
414 if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
415 return XDP_PASS; /* Just skip */
416
417 /* Extract L4 protocol */
418 switch (eth_proto) {
419 case ETH_P_IP:
420 ip_proto = get_proto_ipv4(ctx, l3_offset);
421 break;
422 case ETH_P_IPV6:
423 ip_proto = get_proto_ipv6(ctx, l3_offset);
424 break;
425 case ETH_P_ARP:
426 cpu_idx = 0; /* ARP packet handled on separate CPU */
427 break;
428 default:
429 cpu_idx = 0;
430 }
431
432 /* Choose CPU based on L4 protocol */
433 switch (ip_proto) {
434 case IPPROTO_ICMP:
435 case IPPROTO_ICMPV6:
436 cpu_idx = 2;
437 break;
438 case IPPROTO_TCP:
439 cpu_idx = 0;
440 break;
441 case IPPROTO_UDP:
442 cpu_idx = 1;
443 /* DDoS filter UDP port 9 (pktgen) */
444 dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
445 if (dest_port == 9) {
446 if (rec)
447 rec->dropped++;
448 return XDP_DROP;
449 }
450 break;
451 default:
452 cpu_idx = 0;
453 }
454
455 cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
456 if (!cpu_lookup)
457 return XDP_ABORTED;
458 cpu_dest = *cpu_lookup;
459
460 if (cpu_dest >= MAX_CPUS) {
461 rec->issue++;
462 return XDP_ABORTED;
463 }
464
465 return bpf_redirect_map(&cpu_map, cpu_dest, 0);
466}
467
468/* Hashing initval */
469#define INITVAL 15485863
470
471static __always_inline
472u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
473{
474 void *data_end = (void *)(long)ctx->data_end;
475 void *data = (void *)(long)ctx->data;
476 struct iphdr *iph = data + nh_off;
477 u32 cpu_hash;
478
479 if (iph + 1 > data_end)
480 return 0;
481
482 cpu_hash = iph->saddr + iph->daddr;
483 cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol);
484
485 return cpu_hash;
486}
487
488static __always_inline
489u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
490{
491 void *data_end = (void *)(long)ctx->data_end;
492 void *data = (void *)(long)ctx->data;
493 struct ipv6hdr *ip6h = data + nh_off;
494 u32 cpu_hash;
495
496 if (ip6h + 1 > data_end)
497 return 0;
498
499 cpu_hash = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0];
500 cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1];
501 cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2];
502 cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3];
503 cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr);
504
505 return cpu_hash;
506}
507
508/* Load-Balance traffic based on hashing IP-addrs + L4-proto. The
509 * hashing scheme is symmetric, meaning swapping IP src/dest still
510 * hits the same CPU.
511 */
512SEC("xdp_cpu_map5_lb_hash_ip_pairs")
513int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx)
514{
515 void *data_end = (void *)(long)ctx->data_end;
516 void *data = (void *)(long)ctx->data;
517 struct ethhdr *eth = data;
518 u8 ip_proto = IPPROTO_UDP;
519 struct datarec *rec;
520 u16 eth_proto = 0;
521 u64 l3_offset = 0;
522 u32 cpu_dest = 0;
523 u32 cpu_idx = 0;
524 u32 *cpu_lookup;
525 u32 *cpu_max;
526 u32 cpu_hash;
527 u32 key = 0;
528
529 /* Count RX packet in map */
530 rec = bpf_map_lookup_elem(&rx_cnt, &key);
531 if (!rec)
532 return XDP_ABORTED;
533 rec->processed++;
534
535 cpu_max = bpf_map_lookup_elem(&cpus_count, &key);
536 if (!cpu_max)
537 return XDP_ABORTED;
538
539 if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
540 return XDP_PASS; /* Just skip */
541
542 /* Hash for IPv4 and IPv6 */
543 switch (eth_proto) {
544 case ETH_P_IP:
545 cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset);
546 break;
547 case ETH_P_IPV6:
548 cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset);
549 break;
550 case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */
551 default:
552 cpu_hash = 0;
553 }
554
555 /* Choose CPU based on hash */
556 cpu_idx = cpu_hash % *cpu_max;
557
558 cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
559 if (!cpu_lookup)
560 return XDP_ABORTED;
561 cpu_dest = *cpu_lookup;
562
563 if (cpu_dest >= MAX_CPUS) {
564 rec->issue++;
565 return XDP_ABORTED;
566 }
567
568 return bpf_redirect_map(&cpu_map, cpu_dest, 0);
569}
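Why the scheme is symmetric: the IPv4 key fed to SuperFastHash is the plain sum of the two addresses (the IPv6 variant sums the address words the same way), and addition commutes. A tiny standalone check in ordinary userspace C, not part of the sample:

```c
#include <assert.h>
#include <stdint.h>

static uint32_t ip_pair_key(uint32_t saddr, uint32_t daddr)
{
	return saddr + daddr;	/* commutative, so src/dest order is irrelevant */
}

int main(void)
{
	uint32_t a = 0x0a000001;	/* 10.0.0.1 */
	uint32_t b = 0x0a000002;	/* 10.0.0.2 */

	assert(ip_pair_key(a, b) == ip_pair_key(b, a));
	return 0;
}
```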
570
571char _license[] SEC("license") = "GPL";
572
573/*** Trace point code ***/
574
575/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
576 * Code in: kernel/include/trace/events/xdp.h
577 */
578struct xdp_redirect_ctx {
579 u64 __pad; // First 8 bytes are not accessible by bpf code
580 int prog_id; // offset:8; size:4; signed:1;
581 u32 act; // offset:12 size:4; signed:0;
582 int ifindex; // offset:16 size:4; signed:1;
583 int err; // offset:20 size:4; signed:1;
584 int to_ifindex; // offset:24 size:4; signed:1;
585 u32 map_id; // offset:28 size:4; signed:0;
586 int map_index; // offset:32 size:4; signed:1;
587}; // offset:36
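The struct above hand-copies offsets from the tracepoint's format file. A small userspace sketch, assuming debugfs/tracefs is mounted at the path named in the comment, that dumps the file so the layout can be verified:

```c
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/kernel/debug/tracing/events/xdp/xdp_redirect/format";
	char buf[512];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");	/* likely tracefs not mounted, or no permission */
		return 1;
	}
	while (fgets(buf, sizeof(buf), f))
		fputs(buf, stdout);	/* field offsets should match the struct */
	fclose(f);
	return 0;
}
```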
588
589enum {
590 XDP_REDIRECT_SUCCESS = 0,
591 XDP_REDIRECT_ERROR = 1
592};
593
594static __always_inline
595int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
596{
597 u32 key = XDP_REDIRECT_ERROR;
598 struct datarec *rec;
599 int err = ctx->err;
600
601 if (!err)
602 key = XDP_REDIRECT_SUCCESS;
603
604 rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
605 if (!rec)
606 return 0;
607 rec->dropped += 1;
608
609 return 0; /* Indicate event was filtered (no further processing)*/
610 /*
611 * Returning 1 here would allow e.g. a perf-record tracepoint
612 * to see and record these events, but it doesn't work well
613	 * in practice, as stopping perf-record also unloads this
614	 * bpf_prog. Plus, there is the additional overhead of doing so.
615 */
616}
617
618SEC("tracepoint/xdp/xdp_redirect_err")
619int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
620{
621 return xdp_redirect_collect_stat(ctx);
622}
623
624SEC("tracepoint/xdp/xdp_redirect_map_err")
625int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
626{
627 return xdp_redirect_collect_stat(ctx);
628}
629
630/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
631 * Code in: kernel/include/trace/events/xdp.h
632 */
633struct xdp_exception_ctx {
634 u64 __pad; // First 8 bytes are not accessible by bpf code
635 int prog_id; // offset:8; size:4; signed:1;
636 u32 act; // offset:12; size:4; signed:0;
637 int ifindex; // offset:16; size:4; signed:1;
638};
639
640SEC("tracepoint/xdp/xdp_exception")
641int trace_xdp_exception(struct xdp_exception_ctx *ctx)
642{
643 struct datarec *rec;
644 u32 key = 0;
645
646 rec = bpf_map_lookup_elem(&exception_cnt, &key);
647 if (!rec)
648 return 1;
649 rec->dropped += 1;
650
651 return 0;
652}
653
654/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
655 * Code in: kernel/include/trace/events/xdp.h
656 */
657struct cpumap_enqueue_ctx {
658 u64 __pad; // First 8 bytes are not accessible by bpf code
659 int map_id; // offset:8; size:4; signed:1;
660 u32 act; // offset:12; size:4; signed:0;
661 int cpu; // offset:16; size:4; signed:1;
662 unsigned int drops; // offset:20; size:4; signed:0;
663 unsigned int processed; // offset:24; size:4; signed:0;
664 int to_cpu; // offset:28; size:4; signed:1;
665};
666
667SEC("tracepoint/xdp/xdp_cpumap_enqueue")
668int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
669{
670 u32 to_cpu = ctx->to_cpu;
671 struct datarec *rec;
672
673 if (to_cpu >= MAX_CPUS)
674 return 1;
675
676 rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
677 if (!rec)
678 return 0;
679 rec->processed += ctx->processed;
680 rec->dropped += ctx->drops;
681
682 /* Record bulk events, then userspace can calc average bulk size */
683 if (ctx->processed > 0)
684 rec->issue += 1;
685
686	/* Inception: It's possible to detect overload situations via
687 * this tracepoint. This can be used for creating a feedback
688 * loop to XDP, which can take appropriate actions to mitigate
689 * this overload situation.
690 */
691 return 0;
692}
693
694/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
695 * Code in: kernel/include/trace/events/xdp.h
696 */
697struct cpumap_kthread_ctx {
698 u64 __pad; // First 8 bytes are not accessible
699 int map_id; // offset:8; size:4; signed:1;
700 u32 act; // offset:12; size:4; signed:0;
701 int cpu; // offset:16; size:4; signed:1;
702 unsigned int drops; // offset:20; size:4; signed:0;
703 unsigned int processed; // offset:24; size:4; signed:0;
704 int sched; // offset:28; size:4; signed:1;
705 unsigned int xdp_pass; // offset:32; size:4; signed:0;
706 unsigned int xdp_drop; // offset:36; size:4; signed:0;
707 unsigned int xdp_redirect; // offset:40; size:4; signed:0;
708};
709
710SEC("tracepoint/xdp/xdp_cpumap_kthread")
711int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
712{
713 struct datarec *rec;
714 u32 key = 0;
715
716 rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
717 if (!rec)
718 return 0;
719 rec->processed += ctx->processed;
720 rec->dropped += ctx->drops;
721 rec->xdp_pass += ctx->xdp_pass;
722 rec->xdp_drop += ctx->xdp_drop;
723 rec->xdp_redirect += ctx->xdp_redirect;
724
725 /* Count times kthread yielded CPU via schedule call */
726 if (ctx->sched)
727 rec->issue++;
728
729 return 0;
730}
diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c
new file mode 100644
index 000000000..16eb839e7
--- /dev/null
+++ b/samples/bpf/xdp_redirect_cpu_user.c
@@ -0,0 +1,983 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
3 */
4static const char *__doc__ =
5 " XDP redirect with a CPU-map type \"BPF_MAP_TYPE_CPUMAP\"";
6
7#include <errno.h>
8#include <signal.h>
9#include <stdio.h>
10#include <stdlib.h>
11#include <stdbool.h>
12#include <string.h>
13#include <unistd.h>
14#include <locale.h>
15#include <sys/resource.h>
16#include <sys/sysinfo.h>
17#include <getopt.h>
18#include <net/if.h>
19#include <time.h>
20#include <linux/limits.h>
21
22#include <arpa/inet.h>
23#include <linux/if_link.h>
24
25/* How many xdp_progs are defined in _kern.c */
26#define MAX_PROG 6
27
28#include <bpf/bpf.h>
29#include <bpf/libbpf.h>
30
31#include "bpf_util.h"
32
33static int ifindex = -1;
34static char ifname_buf[IF_NAMESIZE];
35static char *ifname;
36static __u32 prog_id;
37
38static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
39static int n_cpus;
40
41enum map_type {
42 CPU_MAP,
43 RX_CNT,
44 REDIRECT_ERR_CNT,
45 CPUMAP_ENQUEUE_CNT,
46 CPUMAP_KTHREAD_CNT,
47 CPUS_AVAILABLE,
48 CPUS_COUNT,
49 CPUS_ITERATOR,
50 EXCEPTION_CNT,
51};
52
53static const char *const map_type_strings[] = {
54 [CPU_MAP] = "cpu_map",
55 [RX_CNT] = "rx_cnt",
56 [REDIRECT_ERR_CNT] = "redirect_err_cnt",
57 [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt",
58 [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt",
59 [CPUS_AVAILABLE] = "cpus_available",
60 [CPUS_COUNT] = "cpus_count",
61 [CPUS_ITERATOR] = "cpus_iterator",
62 [EXCEPTION_CNT] = "exception_cnt",
63};
64
65#define NUM_TP 5
66#define NUM_MAP 9
67struct bpf_link *tp_links[NUM_TP] = {};
68static int map_fds[NUM_MAP];
69static int tp_cnt = 0;
70
71/* Exit return codes */
72#define EXIT_OK 0
73#define EXIT_FAIL 1
74#define EXIT_FAIL_OPTION 2
75#define EXIT_FAIL_XDP 3
76#define EXIT_FAIL_BPF 4
77#define EXIT_FAIL_MEM 5
78
79static const struct option long_options[] = {
80 {"help", no_argument, NULL, 'h' },
81 {"dev", required_argument, NULL, 'd' },
82 {"skb-mode", no_argument, NULL, 'S' },
83 {"sec", required_argument, NULL, 's' },
84 {"progname", required_argument, NULL, 'p' },
85 {"qsize", required_argument, NULL, 'q' },
86 {"cpu", required_argument, NULL, 'c' },
87 {"stress-mode", no_argument, NULL, 'x' },
88 {"no-separators", no_argument, NULL, 'z' },
89 {"force", no_argument, NULL, 'F' },
90 {"mprog-disable", no_argument, NULL, 'n' },
91 {"mprog-name", required_argument, NULL, 'e' },
92 {"mprog-filename", required_argument, NULL, 'f' },
93 {"redirect-device", required_argument, NULL, 'r' },
94 {"redirect-map", required_argument, NULL, 'm' },
95 {0, 0, NULL, 0 }
96};
97
98static void int_exit(int sig)
99{
100 __u32 curr_prog_id = 0;
101
102 if (ifindex > -1) {
103 if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) {
104 printf("bpf_get_link_xdp_id failed\n");
105 exit(EXIT_FAIL);
106 }
107 if (prog_id == curr_prog_id) {
108 fprintf(stderr,
109 "Interrupted: Removing XDP program on ifindex:%d device:%s\n",
110 ifindex, ifname);
111 bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
112 } else if (!curr_prog_id) {
113 printf("couldn't find a prog id on a given iface\n");
114 } else {
115 printf("program on interface changed, not removing\n");
116 }
117 }
118 /* Detach tracepoints */
119 while (tp_cnt)
120 bpf_link__destroy(tp_links[--tp_cnt]);
121
122 exit(EXIT_OK);
123}
124
125static void print_avail_progs(struct bpf_object *obj)
126{
127 struct bpf_program *pos;
128
129 bpf_object__for_each_program(pos, obj) {
130 if (bpf_program__is_xdp(pos))
131 printf(" %s\n", bpf_program__section_name(pos));
132 }
133}
134
135static void usage(char *argv[], struct bpf_object *obj)
136{
137 int i;
138
139 printf("\nDOCUMENTATION:\n%s\n", __doc__);
140 printf("\n");
141 printf(" Usage: %s (options-see-below)\n", argv[0]);
142 printf(" Listing options:\n");
143 for (i = 0; long_options[i].name != 0; i++) {
144 printf(" --%-12s", long_options[i].name);
145 if (long_options[i].flag != NULL)
146 printf(" flag (internal value:%d)",
147 *long_options[i].flag);
148 else
149 printf(" short-option: -%c",
150 long_options[i].val);
151 printf("\n");
152 }
153 printf("\n Programs to be used for --progname:\n");
154 print_avail_progs(obj);
155 printf("\n");
156}
157
158/* gettime returns the current monotonic clock time in nanoseconds.
159 * Cost: clock_gettime (ns) => 26ns (CLOCK_MONOTONIC)
160 * clock_gettime (ns) => 9ns (CLOCK_MONOTONIC_COARSE)
161 */
162#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
163static __u64 gettime(void)
164{
165 struct timespec t;
166 int res;
167
168 res = clock_gettime(CLOCK_MONOTONIC, &t);
169 if (res < 0) {
170		fprintf(stderr, "Error with clock_gettime! (%i)\n", res);
171 exit(EXIT_FAIL);
172 }
173 return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
174}
175
176/* Common stats data record shared with _kern.c */
177struct datarec {
178 __u64 processed;
179 __u64 dropped;
180 __u64 issue;
181 __u64 xdp_pass;
182 __u64 xdp_drop;
183 __u64 xdp_redirect;
184};
185struct record {
186 __u64 timestamp;
187 struct datarec total;
188 struct datarec *cpu;
189};
190struct stats_record {
191 struct record rx_cnt;
192 struct record redir_err;
193 struct record kthread;
194 struct record exception;
195 struct record enq[];
196};
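/* Note: enq[] is a C99 flexible array member; alloc_stats_record()
 * below sizes it to n_cpus records, one per possible to_cpu.
 */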
197
198static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
199{
200 /* For percpu maps, userspace gets a value per possible CPU */
201 unsigned int nr_cpus = bpf_num_possible_cpus();
202 struct datarec values[nr_cpus];
203 __u64 sum_xdp_redirect = 0;
204 __u64 sum_xdp_pass = 0;
205 __u64 sum_xdp_drop = 0;
206 __u64 sum_processed = 0;
207 __u64 sum_dropped = 0;
208 __u64 sum_issue = 0;
209 int i;
210
211 if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
212 fprintf(stderr,
213 "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
214 return false;
215 }
216 /* Get time as close as possible to reading map contents */
217 rec->timestamp = gettime();
218
219 /* Record and sum values from each CPU */
220 for (i = 0; i < nr_cpus; i++) {
221 rec->cpu[i].processed = values[i].processed;
222 sum_processed += values[i].processed;
223 rec->cpu[i].dropped = values[i].dropped;
224 sum_dropped += values[i].dropped;
225 rec->cpu[i].issue = values[i].issue;
226 sum_issue += values[i].issue;
227 rec->cpu[i].xdp_pass = values[i].xdp_pass;
228 sum_xdp_pass += values[i].xdp_pass;
229 rec->cpu[i].xdp_drop = values[i].xdp_drop;
230 sum_xdp_drop += values[i].xdp_drop;
231 rec->cpu[i].xdp_redirect = values[i].xdp_redirect;
232 sum_xdp_redirect += values[i].xdp_redirect;
233 }
234 rec->total.processed = sum_processed;
235 rec->total.dropped = sum_dropped;
236 rec->total.issue = sum_issue;
237 rec->total.xdp_pass = sum_xdp_pass;
238 rec->total.xdp_drop = sum_xdp_drop;
239 rec->total.xdp_redirect = sum_xdp_redirect;
240 return true;
241}
242
243static struct datarec *alloc_record_per_cpu(void)
244{
245 unsigned int nr_cpus = bpf_num_possible_cpus();
246 struct datarec *array;
247
248 array = calloc(nr_cpus, sizeof(struct datarec));
249 if (!array) {
250 fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
251 exit(EXIT_FAIL_MEM);
252 }
253 return array;
254}
255
256static struct stats_record *alloc_stats_record(void)
257{
258 struct stats_record *rec;
259 int i, size;
260
261 size = sizeof(*rec) + n_cpus * sizeof(struct record);
262 rec = malloc(size);
263 if (!rec) {
264 fprintf(stderr, "Mem alloc error\n");
265 exit(EXIT_FAIL_MEM);
266 }
267 memset(rec, 0, size);
268 rec->rx_cnt.cpu = alloc_record_per_cpu();
269 rec->redir_err.cpu = alloc_record_per_cpu();
270 rec->kthread.cpu = alloc_record_per_cpu();
271 rec->exception.cpu = alloc_record_per_cpu();
272 for (i = 0; i < n_cpus; i++)
273 rec->enq[i].cpu = alloc_record_per_cpu();
274
275 return rec;
276}
277
278static void free_stats_record(struct stats_record *r)
279{
280 int i;
281
282 for (i = 0; i < n_cpus; i++)
283 free(r->enq[i].cpu);
284 free(r->exception.cpu);
285 free(r->kthread.cpu);
286 free(r->redir_err.cpu);
287 free(r->rx_cnt.cpu);
288 free(r);
289}
290
291static double calc_period(struct record *r, struct record *p)
292{
293 double period_ = 0;
294 __u64 period = 0;
295
296 period = r->timestamp - p->timestamp;
297 if (period > 0)
298 period_ = ((double) period / NANOSEC_PER_SEC);
299
300 return period_;
301}
302
303static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
304{
305 __u64 packets = 0;
306 __u64 pps = 0;
307
308 if (period_ > 0) {
309 packets = r->processed - p->processed;
310 pps = packets / period_;
311 }
312 return pps;
313}
314
315static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_)
316{
317 __u64 packets = 0;
318 __u64 pps = 0;
319
320 if (period_ > 0) {
321 packets = r->dropped - p->dropped;
322 pps = packets / period_;
323 }
324 return pps;
325}
326
327static __u64 calc_errs_pps(struct datarec *r,
328 struct datarec *p, double period_)
329{
330 __u64 packets = 0;
331 __u64 pps = 0;
332
333 if (period_ > 0) {
334 packets = r->issue - p->issue;
335 pps = packets / period_;
336 }
337 return pps;
338}
339
340static void calc_xdp_pps(struct datarec *r, struct datarec *p,
341 double *xdp_pass, double *xdp_drop,
342 double *xdp_redirect, double period_)
343{
344 *xdp_pass = 0, *xdp_drop = 0, *xdp_redirect = 0;
345 if (period_ > 0) {
346 *xdp_redirect = (r->xdp_redirect - p->xdp_redirect) / period_;
347 *xdp_pass = (r->xdp_pass - p->xdp_pass) / period_;
348 *xdp_drop = (r->xdp_drop - p->xdp_drop) / period_;
349 }
350}
351
352static void stats_print(struct stats_record *stats_rec,
353 struct stats_record *stats_prev,
354 char *prog_name, char *mprog_name, int mprog_fd)
355{
356 unsigned int nr_cpus = bpf_num_possible_cpus();
357 double pps = 0, drop = 0, err = 0;
358 bool mprog_enabled = false;
359 struct record *rec, *prev;
360 int to_cpu;
361 double t;
362 int i;
363
364 if (mprog_fd > 0)
365 mprog_enabled = true;
366
367 /* Header */
368 printf("Running XDP/eBPF prog_name:%s\n", prog_name);
369 printf("%-15s %-7s %-14s %-11s %-9s\n",
370 "XDP-cpumap", "CPU:to", "pps", "drop-pps", "extra-info");
371
372 /* XDP rx_cnt */
373 {
374 char *fmt_rx = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
375 char *fm2_rx = "%-15s %-7s %'-14.0f %'-11.0f\n";
376 char *errstr = "";
377
378 rec = &stats_rec->rx_cnt;
379 prev = &stats_prev->rx_cnt;
380 t = calc_period(rec, prev);
381 for (i = 0; i < nr_cpus; i++) {
382 struct datarec *r = &rec->cpu[i];
383 struct datarec *p = &prev->cpu[i];
384
385 pps = calc_pps(r, p, t);
386 drop = calc_drop_pps(r, p, t);
387 err = calc_errs_pps(r, p, t);
388 if (err > 0)
389 errstr = "cpu-dest/err";
390 if (pps > 0)
391 printf(fmt_rx, "XDP-RX",
392 i, pps, drop, err, errstr);
393 }
394 pps = calc_pps(&rec->total, &prev->total, t);
395 drop = calc_drop_pps(&rec->total, &prev->total, t);
396 err = calc_errs_pps(&rec->total, &prev->total, t);
397 printf(fm2_rx, "XDP-RX", "total", pps, drop);
398 }
399
400 /* cpumap enqueue stats */
401 for (to_cpu = 0; to_cpu < n_cpus; to_cpu++) {
402 char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
403 char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
404 char *errstr = "";
405
406 rec = &stats_rec->enq[to_cpu];
407 prev = &stats_prev->enq[to_cpu];
408 t = calc_period(rec, prev);
409 for (i = 0; i < nr_cpus; i++) {
410 struct datarec *r = &rec->cpu[i];
411 struct datarec *p = &prev->cpu[i];
412
413 pps = calc_pps(r, p, t);
414 drop = calc_drop_pps(r, p, t);
415 err = calc_errs_pps(r, p, t);
416 if (err > 0) {
417 errstr = "bulk-average";
418 err = pps / err; /* calc average bulk size */
419 }
420 if (pps > 0)
421 printf(fmt, "cpumap-enqueue",
422 i, to_cpu, pps, drop, err, errstr);
423 }
424 pps = calc_pps(&rec->total, &prev->total, t);
425 if (pps > 0) {
426 drop = calc_drop_pps(&rec->total, &prev->total, t);
427 err = calc_errs_pps(&rec->total, &prev->total, t);
428 if (err > 0) {
429 errstr = "bulk-average";
430 err = pps / err; /* calc average bulk size */
431 }
432 printf(fm2, "cpumap-enqueue",
433 "sum", to_cpu, pps, drop, err, errstr);
434 }
435 }
436
437 /* cpumap kthread stats */
438 {
439 char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
440 char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f %s\n";
441 char *e_str = "";
442
443 rec = &stats_rec->kthread;
444 prev = &stats_prev->kthread;
445 t = calc_period(rec, prev);
446 for (i = 0; i < nr_cpus; i++) {
447 struct datarec *r = &rec->cpu[i];
448 struct datarec *p = &prev->cpu[i];
449
450 pps = calc_pps(r, p, t);
451 drop = calc_drop_pps(r, p, t);
452 err = calc_errs_pps(r, p, t);
453 if (err > 0)
454 e_str = "sched";
455 if (pps > 0)
456 printf(fmt_k, "cpumap_kthread",
457 i, pps, drop, err, e_str);
458 }
459 pps = calc_pps(&rec->total, &prev->total, t);
460 drop = calc_drop_pps(&rec->total, &prev->total, t);
461 err = calc_errs_pps(&rec->total, &prev->total, t);
462 if (err > 0)
463 e_str = "sched-sum";
464 printf(fm2_k, "cpumap_kthread", "total", pps, drop, err, e_str);
465 }
466
467 /* XDP redirect err tracepoints (very unlikely) */
468 {
469 char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
470 char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";
471
472 rec = &stats_rec->redir_err;
473 prev = &stats_prev->redir_err;
474 t = calc_period(rec, prev);
475 for (i = 0; i < nr_cpus; i++) {
476 struct datarec *r = &rec->cpu[i];
477 struct datarec *p = &prev->cpu[i];
478
479 pps = calc_pps(r, p, t);
480 drop = calc_drop_pps(r, p, t);
481 if (pps > 0)
482 printf(fmt_err, "redirect_err", i, pps, drop);
483 }
484 pps = calc_pps(&rec->total, &prev->total, t);
485 drop = calc_drop_pps(&rec->total, &prev->total, t);
486 printf(fm2_err, "redirect_err", "total", pps, drop);
487 }
488
489 /* XDP general exception tracepoints */
490 {
491 char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
492 char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";
493
494 rec = &stats_rec->exception;
495 prev = &stats_prev->exception;
496 t = calc_period(rec, prev);
497 for (i = 0; i < nr_cpus; i++) {
498 struct datarec *r = &rec->cpu[i];
499 struct datarec *p = &prev->cpu[i];
500
501 pps = calc_pps(r, p, t);
502 drop = calc_drop_pps(r, p, t);
503 if (pps > 0)
504 printf(fmt_err, "xdp_exception", i, pps, drop);
505 }
506 pps = calc_pps(&rec->total, &prev->total, t);
507 drop = calc_drop_pps(&rec->total, &prev->total, t);
508 printf(fm2_err, "xdp_exception", "total", pps, drop);
509 }
510
511 /* CPUMAP attached XDP program that runs on remote/destination CPU */
512 if (mprog_enabled) {
513 char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f\n";
514 char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f\n";
515 double xdp_pass, xdp_drop, xdp_redirect;
516
517 printf("\n2nd remote XDP/eBPF prog_name: %s\n", mprog_name);
518 printf("%-15s %-7s %-14s %-11s %-9s\n",
519 "XDP-cpumap", "CPU:to", "xdp-pass", "xdp-drop", "xdp-redir");
520
521 rec = &stats_rec->kthread;
522 prev = &stats_prev->kthread;
523 t = calc_period(rec, prev);
524 for (i = 0; i < nr_cpus; i++) {
525 struct datarec *r = &rec->cpu[i];
526 struct datarec *p = &prev->cpu[i];
527
528 calc_xdp_pps(r, p, &xdp_pass, &xdp_drop,
529 &xdp_redirect, t);
530 if (xdp_pass > 0 || xdp_drop > 0 || xdp_redirect > 0)
531 printf(fmt_k, "xdp-in-kthread", i, xdp_pass, xdp_drop,
532 xdp_redirect);
533 }
534 calc_xdp_pps(&rec->total, &prev->total, &xdp_pass, &xdp_drop,
535 &xdp_redirect, t);
536 printf(fm2_k, "xdp-in-kthread", "total", xdp_pass, xdp_drop, xdp_redirect);
537 }
538
539 printf("\n");
540 fflush(stdout);
541}
542
543static void stats_collect(struct stats_record *rec)
544{
545 int fd, i;
546
547 fd = map_fds[RX_CNT];
548 map_collect_percpu(fd, 0, &rec->rx_cnt);
549
550 fd = map_fds[REDIRECT_ERR_CNT];
551 map_collect_percpu(fd, 1, &rec->redir_err);
552
553 fd = map_fds[CPUMAP_ENQUEUE_CNT];
554 for (i = 0; i < n_cpus; i++)
555 map_collect_percpu(fd, i, &rec->enq[i]);
556
557 fd = map_fds[CPUMAP_KTHREAD_CNT];
558 map_collect_percpu(fd, 0, &rec->kthread);
559
560 fd = map_fds[EXCEPTION_CNT];
561 map_collect_percpu(fd, 0, &rec->exception);
562}
563
564
565/* Pointer swap trick */
566static inline void swap(struct stats_record **a, struct stats_record **b)
567{
568 struct stats_record *tmp;
569
570 tmp = *a;
571 *a = *b;
572 *b = tmp;
573}
574
575static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value,
576 __u32 avail_idx, bool new)
577{
578 __u32 curr_cpus_count = 0;
579 __u32 key = 0;
580 int ret;
581
582	/* Add a CPU entry to cpumap, as this allocates a CPU entry in
583	 * the kernel for the CPU.
584	 */
585 ret = bpf_map_update_elem(map_fds[CPU_MAP], &cpu, value, 0);
586 if (ret) {
587 fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret);
588 exit(EXIT_FAIL_BPF);
589 }
590
591	/* Inform the bpf_progs that a new CPU is available to select
592	 * from, via the control maps.
593	 */
594 ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &avail_idx, &cpu, 0);
595 if (ret) {
596 fprintf(stderr, "Add to avail CPUs failed\n");
597 exit(EXIT_FAIL_BPF);
598 }
599
600	/* When adding a new entry (not replacing an existing one), bump the count */
601 ret = bpf_map_lookup_elem(map_fds[CPUS_COUNT], &key, &curr_cpus_count);
602 if (ret) {
603 fprintf(stderr, "Failed reading curr cpus_count\n");
604 exit(EXIT_FAIL_BPF);
605 }
606 if (new) {
607 curr_cpus_count++;
608 ret = bpf_map_update_elem(map_fds[CPUS_COUNT], &key,
609 &curr_cpus_count, 0);
610 if (ret) {
611 fprintf(stderr, "Failed write curr cpus_count\n");
612 exit(EXIT_FAIL_BPF);
613 }
614 }
615 /* map_fd[7] = cpus_iterator */
616 printf("%s CPU:%u as idx:%u qsize:%d prog_fd: %d (cpus_count:%u)\n",
617 new ? "Add-new":"Replace", cpu, avail_idx,
618 value->qsize, value->bpf_prog.fd, curr_cpus_count);
619
620 return 0;
621}
622
623/* CPUs are zero-indexed. Thus, add a special sentinel default value
624 * in map cpus_available to mark CPU indexes that are not configured
625 */
626static void mark_cpus_unavailable(void)
627{
628 __u32 invalid_cpu = n_cpus;
629 int ret, i;
630
631 for (i = 0; i < n_cpus; i++) {
632 ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &i,
633 &invalid_cpu, 0);
634 if (ret) {
635 fprintf(stderr, "Failed marking CPU unavailable\n");
636 exit(EXIT_FAIL_BPF);
637 }
638 }
639}
640
641/* Stress cpumap management code by concurrently changing underlying cpumap */
642static void stress_cpumap(struct bpf_cpumap_val *value)
643{
644	/* Changing qsize will cause the kernel to free and allocate a new
645 * bpf_cpu_map_entry, with an associated/complicated tear-down
646 * procedure.
647 */
648 value->qsize = 1024;
649 create_cpu_entry(1, value, 0, false);
650 value->qsize = 8;
651 create_cpu_entry(1, value, 0, false);
652 value->qsize = 16000;
653 create_cpu_entry(1, value, 0, false);
654}
655
656static void stats_poll(int interval, bool use_separators, char *prog_name,
657 char *mprog_name, struct bpf_cpumap_val *value,
658 bool stress_mode)
659{
660 struct stats_record *record, *prev;
661 int mprog_fd;
662
663 record = alloc_stats_record();
664 prev = alloc_stats_record();
665 stats_collect(record);
666
667	/* Trick to pretty-print with thousands separators: use the %' flag */
668 if (use_separators)
669 setlocale(LC_NUMERIC, "en_US");
670
671 while (1) {
672 swap(&prev, &record);
673 mprog_fd = value->bpf_prog.fd;
674 stats_collect(record);
675 stats_print(record, prev, prog_name, mprog_name, mprog_fd);
676 sleep(interval);
677 if (stress_mode)
678 stress_cpumap(value);
679 }
680
681 free_stats_record(record);
682 free_stats_record(prev);
683}
684
685static int init_tracepoints(struct bpf_object *obj)
686{
687 struct bpf_program *prog;
688
689 bpf_object__for_each_program(prog, obj) {
690 if (bpf_program__is_tracepoint(prog) != true)
691 continue;
692
693 tp_links[tp_cnt] = bpf_program__attach(prog);
694 if (libbpf_get_error(tp_links[tp_cnt])) {
695 tp_links[tp_cnt] = NULL;
696 return -EINVAL;
697 }
698 tp_cnt++;
699 }
700
701 return 0;
702}
703
704static int init_map_fds(struct bpf_object *obj)
705{
706 enum map_type type;
707
708 for (type = 0; type < NUM_MAP; type++) {
709 map_fds[type] =
710 bpf_object__find_map_fd_by_name(obj,
711 map_type_strings[type]);
712
713 if (map_fds[type] < 0)
714 return -ENOENT;
715 }
716
717 return 0;
718}
719
720static int load_cpumap_prog(char *file_name, char *prog_name,
721 char *redir_interface, char *redir_map)
722{
723 struct bpf_prog_load_attr prog_load_attr = {
724 .prog_type = BPF_PROG_TYPE_XDP,
725 .expected_attach_type = BPF_XDP_CPUMAP,
726 .file = file_name,
727 };
728 struct bpf_program *prog;
729 struct bpf_object *obj;
730 int fd;
731
732 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &fd))
733 return -1;
734
735 if (fd < 0) {
736 fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n",
737 strerror(errno));
738 return fd;
739 }
740
741 if (redir_interface && redir_map) {
742 int err, map_fd, ifindex_out, key = 0;
743
744 map_fd = bpf_object__find_map_fd_by_name(obj, redir_map);
745 if (map_fd < 0)
746 return map_fd;
747
748 ifindex_out = if_nametoindex(redir_interface);
749 if (!ifindex_out)
750 return -1;
751
752 err = bpf_map_update_elem(map_fd, &key, &ifindex_out, 0);
753 if (err < 0)
754 return err;
755 }
756
757 prog = bpf_object__find_program_by_title(obj, prog_name);
758 if (!prog) {
759 fprintf(stderr, "bpf_object__find_program_by_title failed\n");
760 return EXIT_FAIL;
761 }
762
763 return bpf_program__fd(prog);
764}
765
766int main(int argc, char **argv)
767{
768 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
769 char *prog_name = "xdp_cpu_map5_lb_hash_ip_pairs";
770 char *mprog_filename = "xdp_redirect_kern.o";
771 char *redir_interface = NULL, *redir_map = NULL;
772 char *mprog_name = "xdp_redirect_dummy";
773 bool mprog_disable = false;
774 struct bpf_prog_load_attr prog_load_attr = {
775 .prog_type = BPF_PROG_TYPE_UNSPEC,
776 };
777 struct bpf_prog_info info = {};
778 __u32 info_len = sizeof(info);
779 struct bpf_cpumap_val value;
780 bool use_separators = true;
781 bool stress_mode = false;
782 struct bpf_program *prog;
783 struct bpf_object *obj;
784 int err = EXIT_FAIL;
785 char filename[256];
786 int added_cpus = 0;
787 int longindex = 0;
788 int interval = 2;
789 int add_cpu = -1;
790 int opt, prog_fd;
791 int *cpu, i;
792 __u32 qsize;
793
794 n_cpus = get_nprocs_conf();
795
796	/* Notice: choosing the queue size is very important with the
797	 * ixgbe driver, because its driver page recycling trick is
798	 * dependent on pages being returned quickly. The number of
799	 * outstanding packets in the system must be less than 2x the
800	 * RX-ring size.
801	 */
802 qsize = 128+64;
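	/* Worked example, assuming the common ixgbe RX-ring default of
	 * 512 descriptors (a hypothetical figure for illustration):
	 * outstanding packets should stay below 2 * 512 = 1024, so the
	 * default qsize of 128 + 64 = 192 leaves ample headroom.
	 */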
803
804 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
805 prog_load_attr.file = filename;
806
807 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
808 perror("setrlimit(RLIMIT_MEMLOCK)");
809 return 1;
810 }
811
812 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
813 return err;
814
815 if (prog_fd < 0) {
816 fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n",
817 strerror(errno));
818 return err;
819 }
820
821 if (init_tracepoints(obj) < 0) {
822 fprintf(stderr, "ERR: bpf_program__attach failed\n");
823 return err;
824 }
825
826 if (init_map_fds(obj) < 0) {
827 fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n");
828 return err;
829 }
830 mark_cpus_unavailable();
831
832 cpu = malloc(n_cpus * sizeof(int));
833 if (!cpu) {
834 fprintf(stderr, "failed to allocate cpu array\n");
835 return err;
836 }
837 memset(cpu, 0, n_cpus * sizeof(int));
838
839	/* Parse command line args */
840 while ((opt = getopt_long(argc, argv, "hSd:s:p:q:c:xzFf:e:r:m:n",
841 long_options, &longindex)) != -1) {
842 switch (opt) {
843 case 'd':
844 if (strlen(optarg) >= IF_NAMESIZE) {
845 fprintf(stderr, "ERR: --dev name too long\n");
846 goto error;
847 }
848 ifname = (char *)&ifname_buf;
849 strncpy(ifname, optarg, IF_NAMESIZE);
850 ifindex = if_nametoindex(ifname);
851 if (ifindex == 0) {
852 fprintf(stderr,
853 "ERR: --dev name unknown err(%d):%s\n",
854 errno, strerror(errno));
855 goto error;
856 }
857 break;
858 case 's':
859 interval = atoi(optarg);
860 break;
861 case 'S':
862 xdp_flags |= XDP_FLAGS_SKB_MODE;
863 break;
864 case 'x':
865 stress_mode = true;
866 break;
867 case 'z':
868 use_separators = false;
869 break;
870 case 'p':
871 /* Selecting eBPF prog to load */
872 prog_name = optarg;
873 break;
874 case 'n':
875 mprog_disable = true;
876 break;
877 case 'f':
878 mprog_filename = optarg;
879 break;
880 case 'e':
881 mprog_name = optarg;
882 break;
883 case 'r':
884 redir_interface = optarg;
885 break;
886 case 'm':
887 redir_map = optarg;
888 break;
889 case 'c':
890 /* Add multiple CPUs */
891 add_cpu = strtoul(optarg, NULL, 0);
892 if (add_cpu >= n_cpus) {
893 fprintf(stderr,
894 "--cpu nr too large for cpumap err(%d):%s\n",
895 errno, strerror(errno));
896 goto error;
897 }
898 cpu[added_cpus++] = add_cpu;
899 break;
900 case 'q':
901 qsize = atoi(optarg);
902 break;
903 case 'F':
904 xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
905 break;
906 case 'h':
907 error:
908 default:
909 free(cpu);
910 usage(argv, obj);
911 return EXIT_FAIL_OPTION;
912 }
913 }
914
915 if (!(xdp_flags & XDP_FLAGS_SKB_MODE))
916 xdp_flags |= XDP_FLAGS_DRV_MODE;
917
918 /* Required option */
919 if (ifindex == -1) {
920 fprintf(stderr, "ERR: required option --dev missing\n");
921 usage(argv, obj);
922 err = EXIT_FAIL_OPTION;
923 goto out;
924 }
925 /* Required option */
926 if (add_cpu == -1) {
927 fprintf(stderr, "ERR: required option --cpu missing\n");
928 fprintf(stderr, " Specify multiple --cpu option to add more\n");
929 usage(argv, obj);
930 err = EXIT_FAIL_OPTION;
931 goto out;
932 }
933
934 value.bpf_prog.fd = 0;
935 if (!mprog_disable)
936 value.bpf_prog.fd = load_cpumap_prog(mprog_filename, mprog_name,
937 redir_interface, redir_map);
938 if (value.bpf_prog.fd < 0) {
939 err = value.bpf_prog.fd;
940 goto out;
941 }
942 value.qsize = qsize;
943
944 for (i = 0; i < added_cpus; i++)
945 create_cpu_entry(cpu[i], &value, i, true);
946
947 /* Remove XDP program when program is interrupted or killed */
948 signal(SIGINT, int_exit);
949 signal(SIGTERM, int_exit);
950
951 prog = bpf_object__find_program_by_title(obj, prog_name);
952 if (!prog) {
953 fprintf(stderr, "bpf_object__find_program_by_title failed\n");
954 goto out;
955 }
956
957 prog_fd = bpf_program__fd(prog);
958 if (prog_fd < 0) {
959 fprintf(stderr, "bpf_program__fd failed\n");
960 goto out;
961 }
962
963 if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
964 fprintf(stderr, "link set xdp fd failed\n");
965 err = EXIT_FAIL_XDP;
966 goto out;
967 }
968
969 err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
970 if (err) {
971 printf("can't get prog info - %s\n", strerror(errno));
972 goto out;
973 }
974 prog_id = info.id;
975
976 stats_poll(interval, use_separators, prog_name, mprog_name,
977 &value, stress_mode);
978
979 err = EXIT_OK;
980out:
981 free(cpu);
982 return err;
983}
diff --git a/samples/bpf/xdp_redirect_kern.c b/samples/bpf/xdp_redirect_kern.c
new file mode 100644
index 000000000..d26ec3aa2
--- /dev/null
+++ b/samples/bpf/xdp_redirect_kern.c
@@ -0,0 +1,90 @@
1/* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com>
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#define KBUILD_MODNAME "foo"
13#include <uapi/linux/bpf.h>
14#include <linux/in.h>
15#include <linux/if_ether.h>
16#include <linux/if_packet.h>
17#include <linux/if_vlan.h>
18#include <linux/ip.h>
19#include <linux/ipv6.h>
20#include <bpf/bpf_helpers.h>
21
22struct {
23 __uint(type, BPF_MAP_TYPE_ARRAY);
24 __type(key, int);
25 __type(value, int);
26 __uint(max_entries, 1);
27} tx_port SEC(".maps");
28
29/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
30 * feedback. Redirect TX errors can be caught via a tracepoint.
31 */
32struct {
33 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
34 __type(key, u32);
35 __type(value, long);
36 __uint(max_entries, 1);
37} rxcnt SEC(".maps");
38
39static void swap_src_dst_mac(void *data)
40{
41 unsigned short *p = data;
42 unsigned short dst[3];
43
44 dst[0] = p[0];
45 dst[1] = p[1];
46 dst[2] = p[2];
47 p[0] = p[3];
48 p[1] = p[4];
49 p[2] = p[5];
50 p[3] = dst[0];
51 p[4] = dst[1];
52 p[5] = dst[2];
53}
54
55SEC("xdp_redirect")
56int xdp_redirect_prog(struct xdp_md *ctx)
57{
58 void *data_end = (void *)(long)ctx->data_end;
59 void *data = (void *)(long)ctx->data;
60 struct ethhdr *eth = data;
61 int rc = XDP_DROP;
62 int *ifindex, port = 0;
63 long *value;
64 u32 key = 0;
65 u64 nh_off;
66
67 nh_off = sizeof(*eth);
68 if (data + nh_off > data_end)
69 return rc;
70
71 ifindex = bpf_map_lookup_elem(&tx_port, &port);
72 if (!ifindex)
73 return rc;
74
75 value = bpf_map_lookup_elem(&rxcnt, &key);
76 if (value)
77 *value += 1;
78
79 swap_src_dst_mac(data);
80 return bpf_redirect(*ifindex, 0);
81}
82
83/* Redirect requires an XDP bpf_prog loaded on the TX device */
84SEC("xdp_redirect_dummy")
85int xdp_redirect_dummy_prog(struct xdp_md *ctx)
86{
87 return XDP_PASS;
88}
89
90char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_redirect_map_kern.c b/samples/bpf/xdp_redirect_map_kern.c
new file mode 100644
index 000000000..6489352ab
--- /dev/null
+++ b/samples/bpf/xdp_redirect_map_kern.c
@@ -0,0 +1,92 @@
1/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#define KBUILD_MODNAME "foo"
13#include <uapi/linux/bpf.h>
14#include <linux/in.h>
15#include <linux/if_ether.h>
16#include <linux/if_packet.h>
17#include <linux/if_vlan.h>
18#include <linux/ip.h>
19#include <linux/ipv6.h>
20#include <bpf/bpf_helpers.h>
21
22struct {
23 __uint(type, BPF_MAP_TYPE_DEVMAP);
24 __uint(key_size, sizeof(int));
25 __uint(value_size, sizeof(int));
26 __uint(max_entries, 100);
27} tx_port SEC(".maps");
28
29/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
30 * feedback. Redirect TX errors can be caught via a tracepoint.
31 */
32struct {
33 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
34 __type(key, u32);
35 __type(value, long);
36 __uint(max_entries, 1);
37} rxcnt SEC(".maps");
38
39static void swap_src_dst_mac(void *data)
40{
41 unsigned short *p = data;
42 unsigned short dst[3];
43
44 dst[0] = p[0];
45 dst[1] = p[1];
46 dst[2] = p[2];
47 p[0] = p[3];
48 p[1] = p[4];
49 p[2] = p[5];
50 p[3] = dst[0];
51 p[4] = dst[1];
52 p[5] = dst[2];
53}
54
55SEC("xdp_redirect_map")
56int xdp_redirect_map_prog(struct xdp_md *ctx)
57{
58 void *data_end = (void *)(long)ctx->data_end;
59 void *data = (void *)(long)ctx->data;
60 struct ethhdr *eth = data;
61 int rc = XDP_DROP;
62 int vport, port = 0, m = 0;
63 long *value;
64 u32 key = 0;
65 u64 nh_off;
66
67 nh_off = sizeof(*eth);
68 if (data + nh_off > data_end)
69 return rc;
70
71 /* constant virtual port */
72 vport = 0;
73
74 /* count packet in global counter */
75 value = bpf_map_lookup_elem(&rxcnt, &key);
76 if (value)
77 *value += 1;
78
79 swap_src_dst_mac(data);
80
81 /* send packet out physical port */
82 return bpf_redirect_map(&tx_port, vport, 0);
83}
84
85/* Redirect requires an XDP bpf_prog loaded on the TX device */
86SEC("xdp_redirect_dummy")
87int xdp_redirect_dummy_prog(struct xdp_md *ctx)
88{
89 return XDP_PASS;
90}
91
92char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c
new file mode 100644
index 000000000..35e16dee6
--- /dev/null
+++ b/samples/bpf/xdp_redirect_map_user.c
@@ -0,0 +1,222 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
3 */
4#include <linux/bpf.h>
5#include <linux/if_link.h>
6#include <assert.h>
7#include <errno.h>
8#include <signal.h>
9#include <stdio.h>
10#include <stdlib.h>
11#include <stdbool.h>
12#include <string.h>
13#include <net/if.h>
14#include <unistd.h>
15#include <libgen.h>
16#include <sys/resource.h>
17
18#include "bpf_util.h"
19#include <bpf/bpf.h>
20#include <bpf/libbpf.h>
21
22static int ifindex_in;
23static int ifindex_out;
24static bool ifindex_out_xdp_dummy_attached = true;
25static __u32 prog_id;
26static __u32 dummy_prog_id;
27
28static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
29static int rxcnt_map_fd;
30
31static void int_exit(int sig)
32{
33 __u32 curr_prog_id = 0;
34
35 if (bpf_get_link_xdp_id(ifindex_in, &curr_prog_id, xdp_flags)) {
36 printf("bpf_get_link_xdp_id failed\n");
37 exit(1);
38 }
39 if (prog_id == curr_prog_id)
40 bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags);
41 else if (!curr_prog_id)
42 printf("couldn't find a prog id on iface IN\n");
43 else
44 printf("program on iface IN changed, not removing\n");
45
46 if (ifindex_out_xdp_dummy_attached) {
47 curr_prog_id = 0;
48 if (bpf_get_link_xdp_id(ifindex_out, &curr_prog_id,
49 xdp_flags)) {
50 printf("bpf_get_link_xdp_id failed\n");
51 exit(1);
52 }
53 if (dummy_prog_id == curr_prog_id)
54 bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags);
55 else if (!curr_prog_id)
56 printf("couldn't find a prog id on iface OUT\n");
57 else
58 printf("program on iface OUT changed, not removing\n");
59 }
60 exit(0);
61}
62
63static void poll_stats(int interval, int ifindex)
64{
65 unsigned int nr_cpus = bpf_num_possible_cpus();
66 __u64 values[nr_cpus], prev[nr_cpus];
67
68 memset(prev, 0, sizeof(prev));
69
70 while (1) {
71 __u64 sum = 0;
72 __u32 key = 0;
73 int i;
74
75 sleep(interval);
76 assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0);
77 for (i = 0; i < nr_cpus; i++)
78 sum += (values[i] - prev[i]);
79 if (sum)
80 printf("ifindex %i: %10llu pkt/s\n",
81 ifindex, sum / interval);
82 memcpy(prev, values, sizeof(values));
83 }
84}
85
86static void usage(const char *prog)
87{
88 fprintf(stderr,
89 "usage: %s [OPTS] <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n\n"
90 "OPTS:\n"
91 " -S use skb-mode\n"
92 " -N enforce native mode\n"
93 " -F force loading prog\n",
94 prog);
95}
96
97int main(int argc, char **argv)
98{
99 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
100 struct bpf_prog_load_attr prog_load_attr = {
101 .prog_type = BPF_PROG_TYPE_XDP,
102 };
103 struct bpf_program *prog, *dummy_prog;
104 struct bpf_prog_info info = {};
105 __u32 info_len = sizeof(info);
106 int prog_fd, dummy_prog_fd;
107 const char *optstr = "FSN";
108 struct bpf_object *obj;
109 int ret, opt, key = 0;
110 char filename[256];
111 int tx_port_map_fd;
112
113 while ((opt = getopt(argc, argv, optstr)) != -1) {
114 switch (opt) {
115 case 'S':
116 xdp_flags |= XDP_FLAGS_SKB_MODE;
117 break;
118 case 'N':
119 /* default, set below */
120 break;
121 case 'F':
122 xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
123 break;
124 default:
125 usage(basename(argv[0]));
126 return 1;
127 }
128 }
129
130 if (!(xdp_flags & XDP_FLAGS_SKB_MODE))
131 xdp_flags |= XDP_FLAGS_DRV_MODE;
132
133 if (optind == argc) {
134 printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]);
135 return 1;
136 }
137
138 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
139 perror("setrlimit(RLIMIT_MEMLOCK)");
140 return 1;
141 }
142
143 ifindex_in = if_nametoindex(argv[optind]);
144 if (!ifindex_in)
145 ifindex_in = strtoul(argv[optind], NULL, 0);
146
147 ifindex_out = if_nametoindex(argv[optind + 1]);
148 if (!ifindex_out)
149 ifindex_out = strtoul(argv[optind + 1], NULL, 0);
150
151 printf("input: %d output: %d\n", ifindex_in, ifindex_out);
152
153 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
154 prog_load_attr.file = filename;
155
156 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
157 return 1;
158
159 prog = bpf_program__next(NULL, obj);
160 dummy_prog = bpf_program__next(prog, obj);
161 if (!prog || !dummy_prog) {
162 printf("finding a prog in obj file failed\n");
163 return 1;
164 }
165	/* bpf_prog_load_xattr gives us the first prog's fd,
166	 * so we're missing only the fd for the dummy prog
167 */
168 dummy_prog_fd = bpf_program__fd(dummy_prog);
169 if (prog_fd < 0 || dummy_prog_fd < 0) {
170 printf("bpf_prog_load_xattr: %s\n", strerror(errno));
171 return 1;
172 }
173
174 tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port");
175 rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt");
176 if (tx_port_map_fd < 0 || rxcnt_map_fd < 0) {
177 printf("bpf_object__find_map_fd_by_name failed\n");
178 return 1;
179 }
180
181 if (bpf_set_link_xdp_fd(ifindex_in, prog_fd, xdp_flags) < 0) {
182 printf("ERROR: link set xdp fd failed on %d\n", ifindex_in);
183 return 1;
184 }
185
186 ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
187 if (ret) {
188 printf("can't get prog info - %s\n", strerror(errno));
189 return ret;
190 }
191 prog_id = info.id;
192
193 /* Loading dummy XDP prog on out-device */
194 if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd,
195 (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) {
196 printf("WARN: link set xdp fd failed on %d\n", ifindex_out);
197 ifindex_out_xdp_dummy_attached = false;
198 }
199
200 memset(&info, 0, sizeof(info));
201 ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len);
202 if (ret) {
203 printf("can't get prog info - %s\n", strerror(errno));
204 return ret;
205 }
206 dummy_prog_id = info.id;
207
208 signal(SIGINT, int_exit);
209 signal(SIGTERM, int_exit);
210
211 /* populate virtual to physical port map */
212 ret = bpf_map_update_elem(tx_port_map_fd, &key, &ifindex_out, 0);
213 if (ret) {
214 perror("bpf_update_elem");
215 goto out;
216 }
217
218 poll_stats(2, ifindex_out);
219
220out:
221	return ret;
222}
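/* For context, a minimal sketch of the kernel-side redirect this loader
 * drives: the ifindex written into tx_port above is consumed by
 * bpf_redirect_map(). The program below is illustrative only, not the
 * actual _kern.c shipped with this sample.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_DEVMAP);
 *		__uint(key_size, sizeof(int));
 *		__uint(value_size, sizeof(int));
 *		__uint(max_entries, 100);
 *	} tx_port SEC(".maps");
 *
 *	SEC("xdp_redirect_map")
 *	int xdp_redirect_map_prog(struct xdp_md *ctx)
 *	{
 *		int vport = 0;	// key 0, written by this loader
 *
 *		return bpf_redirect_map(&tx_port, vport, 0);
 *	}
 */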
diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c
new file mode 100644
index 000000000..3c92adc2a
--- /dev/null
+++ b/samples/bpf/xdp_redirect_user.c
@@ -0,0 +1,223 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com>
3 */
4#include <linux/bpf.h>
5#include <linux/if_link.h>
6#include <assert.h>
7#include <errno.h>
8#include <signal.h>
9#include <stdio.h>
10#include <stdlib.h>
11#include <stdbool.h>
12#include <string.h>
13#include <net/if.h>
14#include <unistd.h>
15#include <libgen.h>
16#include <sys/resource.h>
17
18#include "bpf_util.h"
19#include <bpf/bpf.h>
20#include <bpf/libbpf.h>
21
22static int ifindex_in;
23static int ifindex_out;
24static bool ifindex_out_xdp_dummy_attached = true;
25static __u32 prog_id;
26static __u32 dummy_prog_id;
27
28static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
29static int rxcnt_map_fd;
30
31static void int_exit(int sig)
32{
33 __u32 curr_prog_id = 0;
34
35 if (bpf_get_link_xdp_id(ifindex_in, &curr_prog_id, xdp_flags)) {
36 printf("bpf_get_link_xdp_id failed\n");
37 exit(1);
38 }
39 if (prog_id == curr_prog_id)
40 bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags);
41 else if (!curr_prog_id)
42 printf("couldn't find a prog id on iface IN\n");
43 else
44 printf("program on iface IN changed, not removing\n");
45
46 if (ifindex_out_xdp_dummy_attached) {
47 curr_prog_id = 0;
48 if (bpf_get_link_xdp_id(ifindex_out, &curr_prog_id,
49 xdp_flags)) {
50 printf("bpf_get_link_xdp_id failed\n");
51 exit(1);
52 }
53 if (dummy_prog_id == curr_prog_id)
54 bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags);
55 else if (!curr_prog_id)
56 printf("couldn't find a prog id on iface OUT\n");
57 else
58 printf("program on iface OUT changed, not removing\n");
59 }
60 exit(0);
61}
62
63static void poll_stats(int interval, int ifindex)
64{
65 unsigned int nr_cpus = bpf_num_possible_cpus();
66 __u64 values[nr_cpus], prev[nr_cpus];
67
68 memset(prev, 0, sizeof(prev));
69
70 while (1) {
71 __u64 sum = 0;
72 __u32 key = 0;
73 int i;
74
75 sleep(interval);
76 assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0);
77 for (i = 0; i < nr_cpus; i++)
78 sum += (values[i] - prev[i]);
79 if (sum)
80 printf("ifindex %i: %10llu pkt/s\n",
81 ifindex, sum / interval);
82 memcpy(prev, values, sizeof(values));
83 }
84}
85
86static void usage(const char *prog)
87{
88 fprintf(stderr,
89 "usage: %s [OPTS] <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n\n"
90 "OPTS:\n"
91 " -S use skb-mode\n"
92 " -N enforce native mode\n"
93 " -F force loading prog\n",
94 prog);
95}
96
97
98int main(int argc, char **argv)
99{
100 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
101 struct bpf_prog_load_attr prog_load_attr = {
102 .prog_type = BPF_PROG_TYPE_XDP,
103 };
104 struct bpf_program *prog, *dummy_prog;
105 int prog_fd, tx_port_map_fd, opt;
106 struct bpf_prog_info info = {};
107 __u32 info_len = sizeof(info);
108 const char *optstr = "FSN";
109 struct bpf_object *obj;
110 char filename[256];
111 int dummy_prog_fd;
112 int ret, key = 0;
113
114 while ((opt = getopt(argc, argv, optstr)) != -1) {
115 switch (opt) {
116 case 'S':
117 xdp_flags |= XDP_FLAGS_SKB_MODE;
118 break;
119 case 'N':
120 /* default, set below */
121 break;
122 case 'F':
123 xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
124 break;
125 default:
126 usage(basename(argv[0]));
127 return 1;
128 }
129 }
130
131 if (!(xdp_flags & XDP_FLAGS_SKB_MODE))
132 xdp_flags |= XDP_FLAGS_DRV_MODE;
133
134 if (optind + 2 != argc) {
135 printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]);
136 return 1;
137 }
138
139 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
140 perror("setrlimit(RLIMIT_MEMLOCK)");
141 return 1;
142 }
143
144 ifindex_in = if_nametoindex(argv[optind]);
145 if (!ifindex_in)
146 ifindex_in = strtoul(argv[optind], NULL, 0);
147
148 ifindex_out = if_nametoindex(argv[optind + 1]);
149 if (!ifindex_out)
150 ifindex_out = strtoul(argv[optind + 1], NULL, 0);
151
152 printf("input: %d output: %d\n", ifindex_in, ifindex_out);
153
154 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
155 prog_load_attr.file = filename;
156
157 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
158 return 1;
159
160 prog = bpf_program__next(NULL, obj);
161 dummy_prog = bpf_program__next(prog, obj);
162 if (!prog || !dummy_prog) {
163 printf("finding a prog in obj file failed\n");
164 return 1;
165 }
166	/* bpf_prog_load_xattr gives us the pointer to the first prog's fd,
167	 * so we're missing only the fd for the dummy prog
168 */
169 dummy_prog_fd = bpf_program__fd(dummy_prog);
170 if (prog_fd < 0 || dummy_prog_fd < 0) {
171 printf("bpf_prog_load_xattr: %s\n", strerror(errno));
172 return 1;
173 }
174
175 tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port");
176 rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt");
177 if (tx_port_map_fd < 0 || rxcnt_map_fd < 0) {
178 printf("bpf_object__find_map_fd_by_name failed\n");
179 return 1;
180 }
181
182 if (bpf_set_link_xdp_fd(ifindex_in, prog_fd, xdp_flags) < 0) {
183 printf("ERROR: link set xdp fd failed on %d\n", ifindex_in);
184 return 1;
185 }
186
187 ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
188 if (ret) {
189 printf("can't get prog info - %s\n", strerror(errno));
190 return ret;
191 }
192 prog_id = info.id;
193
194 /* Loading dummy XDP prog on out-device */
195 if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd,
196 (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) {
197 printf("WARN: link set xdp fd failed on %d\n", ifindex_out);
198 ifindex_out_xdp_dummy_attached = false;
199 }
200
201 memset(&info, 0, sizeof(info));
202 ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len);
203 if (ret) {
204 printf("can't get prog info - %s\n", strerror(errno));
205 return ret;
206 }
207 dummy_prog_id = info.id;
208
209 signal(SIGINT, int_exit);
210 signal(SIGTERM, int_exit);
211
212	/* populate virtual to physical port map */
213 ret = bpf_map_update_elem(tx_port_map_fd, &key, &ifindex_out, 0);
214 if (ret) {
215 perror("bpf_update_elem");
216 goto out;
217 }
218
219 poll_stats(2, ifindex_out);
220
221out:
222 return ret;
223}
diff --git a/samples/bpf/xdp_router_ipv4_kern.c b/samples/bpf/xdp_router_ipv4_kern.c
new file mode 100644
index 000000000..b37ca2b13
--- /dev/null
+++ b/samples/bpf/xdp_router_ipv4_kern.c
@@ -0,0 +1,186 @@
1/* Copyright (C) 2017 Cavium, Inc.
2 *
3 * This program is free software; you can redistribute it and/or modify it
4 * under the terms of version 2 of the GNU General Public License
5 * as published by the Free Software Foundation.
6 */
7#define KBUILD_MODNAME "foo"
8#include <uapi/linux/bpf.h>
9#include <linux/in.h>
10#include <linux/if_ether.h>
11#include <linux/if_packet.h>
12#include <linux/if_vlan.h>
13#include <linux/ip.h>
14#include <linux/ipv6.h>
15#include <bpf/bpf_helpers.h>
16#include <linux/slab.h>
17#include <net/ip_fib.h>
18
19struct trie_value {
20 __u8 prefix[4];
21 __be64 value;
22 int ifindex;
23 int metric;
24 __be32 gw;
25};
26
27/* Key for lpm_trie */
28union key_4 {
29 u32 b32[2];
30 u8 b8[8];
31};
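/* b32[0] holds the prefix length, matching struct bpf_lpm_trie_key
 * { __u32 prefixlen; __u8 data[]; }; b8[4..7] carry the four octets of
 * the IPv4 destination address (see the key4 setup in
 * xdp_router_ipv4_prog below).
 */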
32
33struct arp_entry {
34 __be64 mac;
35 __be32 dst;
36};
37
38struct direct_map {
39 struct arp_entry arp;
40 int ifindex;
41 __be64 mac;
42};
43
44/* Map for trie implementation */
45struct {
46 __uint(type, BPF_MAP_TYPE_LPM_TRIE);
47 __uint(key_size, 8);
48 __uint(value_size, sizeof(struct trie_value));
49 __uint(max_entries, 50);
50 __uint(map_flags, BPF_F_NO_PREALLOC);
51} lpm_map SEC(".maps");
52
53/* Map for counter */
54struct {
55 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
56 __type(key, u32);
57 __type(value, u64);
58 __uint(max_entries, 256);
59} rxcnt SEC(".maps");
60
61/* Map for ARP table */
62struct {
63 __uint(type, BPF_MAP_TYPE_HASH);
64 __type(key, __be32);
65 __type(value, __be64);
66 __uint(max_entries, 50);
67} arp_table SEC(".maps");
68
69/* Map to keep the exact match entries in the route table */
70struct {
71 __uint(type, BPF_MAP_TYPE_HASH);
72 __type(key, __be32);
73 __type(value, struct direct_map);
74 __uint(max_entries, 50);
75} exact_match SEC(".maps");
76
77struct {
78 __uint(type, BPF_MAP_TYPE_DEVMAP);
79 __uint(key_size, sizeof(int));
80 __uint(value_size, sizeof(int));
81 __uint(max_entries, 100);
82} tx_port SEC(".maps");
83
84/* Function to set source and destination mac of the packet */
85static inline void set_src_dst_mac(void *data, void *src, void *dst)
86{
87 unsigned short *source = src;
88 unsigned short *dest = dst;
89 unsigned short *p = data;
90
91 __builtin_memcpy(p, dest, 6);
92 __builtin_memcpy(p + 3, source, 6);
93}
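/* p, source and dest are u16 pointers, so p + 3 is a 6-byte offset:
 * bytes 0-5 of the Ethernet header receive the new destination MAC and
 * bytes 6-11 the new source MAC.
 */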
94
95/* Parse IPV4 packet to get SRC, DST IP and protocol */
96static inline int parse_ipv4(void *data, u64 nh_off, void *data_end,
97 __be32 *src, __be32 *dest)
98{
99 struct iphdr *iph = data + nh_off;
100
101 if (iph + 1 > data_end)
102 return 0;
103 *src = iph->saddr;
104 *dest = iph->daddr;
105 return iph->protocol;
106}
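/* The "iph + 1 > data_end" test above is the explicit bounds check the
 * BPF verifier requires before any header field may be dereferenced.
 */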
107
108SEC("xdp_router_ipv4")
109int xdp_router_ipv4_prog(struct xdp_md *ctx)
110{
111 void *data_end = (void *)(long)ctx->data_end;
112 __be64 *dest_mac = NULL, *src_mac = NULL;
113 void *data = (void *)(long)ctx->data;
114 struct trie_value *prefix_value;
115 int rc = XDP_DROP, forward_to;
116 struct ethhdr *eth = data;
117 union key_4 key4;
118 long *value;
119 u16 h_proto;
120 u32 ipproto;
121 u64 nh_off;
122
123 nh_off = sizeof(*eth);
124 if (data + nh_off > data_end)
125 return rc;
126
127 h_proto = eth->h_proto;
128
129 if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
130 struct vlan_hdr *vhdr;
131
132 vhdr = data + nh_off;
133 nh_off += sizeof(struct vlan_hdr);
134 if (data + nh_off > data_end)
135 return rc;
136 h_proto = vhdr->h_vlan_encapsulated_proto;
137 }
138 if (h_proto == htons(ETH_P_ARP)) {
139 return XDP_PASS;
140 } else if (h_proto == htons(ETH_P_IP)) {
141 struct direct_map *direct_entry;
142 __be32 src_ip = 0, dest_ip = 0;
143
144 ipproto = parse_ipv4(data, nh_off, data_end, &src_ip, &dest_ip);
145 direct_entry = bpf_map_lookup_elem(&exact_match, &dest_ip);
146		/* Check for an exact match first; this gives a faster lookup */
147 if (direct_entry && direct_entry->mac && direct_entry->arp.mac) {
148 src_mac = &direct_entry->mac;
149 dest_mac = &direct_entry->arp.mac;
150 forward_to = direct_entry->ifindex;
151 } else {
152			/* Fall back to a longest-prefix match in the LPM trie */
153 key4.b32[0] = 32;
154 key4.b8[4] = dest_ip & 0xff;
155 key4.b8[5] = (dest_ip >> 8) & 0xff;
156 key4.b8[6] = (dest_ip >> 16) & 0xff;
157 key4.b8[7] = (dest_ip >> 24) & 0xff;
158 prefix_value = bpf_map_lookup_elem(&lpm_map, &key4);
159 if (!prefix_value)
160 return XDP_DROP;
161 src_mac = &prefix_value->value;
162 if (!src_mac)
163 return XDP_DROP;
164 dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip);
165 if (!dest_mac) {
166 if (!prefix_value->gw)
167 return XDP_DROP;
168 dest_ip = prefix_value->gw;
169 dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip);
170 }
171 forward_to = prefix_value->ifindex;
172 }
173 } else {
174 ipproto = 0;
175 }
176 if (src_mac && dest_mac) {
177 set_src_dst_mac(data, src_mac, dest_mac);
178 value = bpf_map_lookup_elem(&rxcnt, &ipproto);
179 if (value)
180 *value += 1;
181 return bpf_redirect_map(&tx_port, forward_to, 0);
182 }
183 return rc;
184}
185
186char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c
new file mode 100644
index 000000000..c2da1b51f
--- /dev/null
+++ b/samples/bpf/xdp_router_ipv4_user.c
@@ -0,0 +1,741 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (C) 2017 Cavium, Inc.
3 */
4#include <linux/bpf.h>
5#include <linux/netlink.h>
6#include <linux/rtnetlink.h>
7#include <assert.h>
8#include <errno.h>
9#include <signal.h>
10#include <stdio.h>
11#include <stdlib.h>
12#include <string.h>
13#include <sys/socket.h>
14#include <unistd.h>
15#include <bpf/bpf.h>
16#include <arpa/inet.h>
17#include <fcntl.h>
18#include <poll.h>
19#include <net/if.h>
20#include <netdb.h>
21#include <sys/ioctl.h>
22#include <sys/syscall.h>
23#include "bpf_util.h"
24#include <bpf/libbpf.h>
25#include <sys/resource.h>
26#include <libgen.h>
27
28int sock, sock_arp, flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
29static int total_ifindex;
30static int *ifindex_list;
31static __u32 *prog_id_list;
32char buf[8192];
33static int lpm_map_fd;
34static int rxcnt_map_fd;
35static int arp_table_map_fd;
36static int exact_match_map_fd;
37static int tx_port_map_fd;
38
39static int get_route_table(int rtm_family);
40static void int_exit(int sig)
41{
42 __u32 prog_id = 0;
43 int i = 0;
44
45 for (i = 0; i < total_ifindex; i++) {
46 if (bpf_get_link_xdp_id(ifindex_list[i], &prog_id, flags)) {
47 printf("bpf_get_link_xdp_id on iface %d failed\n",
48 ifindex_list[i]);
49 exit(1);
50 }
51 if (prog_id_list[i] == prog_id)
52 bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
53 else if (!prog_id)
54 printf("couldn't find a prog id on iface %d\n",
55 ifindex_list[i]);
56 else
57 printf("program on iface %d changed, not removing\n",
58 ifindex_list[i]);
59 prog_id = 0;
60 }
61 exit(0);
62}
63
64static void close_and_exit(int sig)
65{
66 close(sock);
67 close(sock_arp);
68
69 int_exit(0);
70}
71
72/* Get the mac address of the interface given interface name */
73static __be64 getmac(char *iface)
74{
75 struct ifreq ifr;
76 __be64 mac = 0;
77 int fd, i;
78
79 fd = socket(AF_INET, SOCK_DGRAM, 0);
80 ifr.ifr_addr.sa_family = AF_INET;
81 strncpy(ifr.ifr_name, iface, IFNAMSIZ - 1);
82 if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) {
83		printf("ioctl failed, leaving...\n");
84 return -1;
85 }
86 for (i = 0; i < 6 ; i++)
87 *((__u8 *)&mac + i) = (__u8)ifr.ifr_hwaddr.sa_data[i];
88 close(fd);
89 return mac;
90}
91
92static int recv_msg(struct sockaddr_nl sock_addr, int sock)
93{
94 struct nlmsghdr *nh;
95 int len, nll = 0;
96 char *buf_ptr;
97
98 buf_ptr = buf;
99 while (1) {
100 len = recv(sock, buf_ptr, sizeof(buf) - nll, 0);
101 if (len < 0)
102 return len;
103
104 nh = (struct nlmsghdr *)buf_ptr;
105
106 if (nh->nlmsg_type == NLMSG_DONE)
107 break;
108 buf_ptr += len;
109 nll += len;
110 if ((sock_addr.nl_groups & RTMGRP_NEIGH) == RTMGRP_NEIGH)
111 break;
112
113 if ((sock_addr.nl_groups & RTMGRP_IPV4_ROUTE) == RTMGRP_IPV4_ROUTE)
114 break;
115 }
116 return nll;
117}
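/* recv_msg() accumulates a (possibly multi-part) netlink reply into the
 * global buf until NLMSG_DONE, returning the total byte count; for the
 * NEIGH and IPV4_ROUTE notification groups a single recv() is treated
 * as a complete message.
 */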
118
119/* Function to parse the route entry returned by netlink
120 * Updates the route entry related map entries
121 */
122static void read_route(struct nlmsghdr *nh, int nll)
123{
124 char dsts[24], gws[24], ifs[16], dsts_len[24], metrics[24];
125 struct bpf_lpm_trie_key *prefix_key;
126 struct rtattr *rt_attr;
127 struct rtmsg *rt_msg;
128 int rtm_family;
129 int rtl;
130 int i;
131 struct route_table {
132 int dst_len, iface, metric;
133 char *iface_name;
134 __be32 dst, gw;
135 __be64 mac;
136 } route;
137 struct arp_table {
138 __be64 mac;
139 __be32 dst;
140 };
141
142 struct direct_map {
143 struct arp_table arp;
144 int ifindex;
145 __be64 mac;
146 } direct_entry;
147
148 if (nh->nlmsg_type == RTM_DELROUTE)
149 printf("DELETING Route entry\n");
150 else if (nh->nlmsg_type == RTM_GETROUTE)
151 printf("READING Route entry\n");
152 else if (nh->nlmsg_type == RTM_NEWROUTE)
153 printf("NEW Route entry\n");
154 else
155 printf("%d\n", nh->nlmsg_type);
156
157 memset(&route, 0, sizeof(route));
158 printf("Destination\t\tGateway\t\tGenmask\t\tMetric\t\tIface\n");
159 for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) {
160 rt_msg = (struct rtmsg *)NLMSG_DATA(nh);
161 rtm_family = rt_msg->rtm_family;
162 if (rtm_family == AF_INET)
163 if (rt_msg->rtm_table != RT_TABLE_MAIN)
164 continue;
165 rt_attr = (struct rtattr *)RTM_RTA(rt_msg);
166 rtl = RTM_PAYLOAD(nh);
167
168 for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) {
169 switch (rt_attr->rta_type) {
170 case NDA_DST:
171 sprintf(dsts, "%u",
172 (*((__be32 *)RTA_DATA(rt_attr))));
173 break;
174 case RTA_GATEWAY:
175 sprintf(gws, "%u",
176 *((__be32 *)RTA_DATA(rt_attr)));
177 break;
178 case RTA_OIF:
179 sprintf(ifs, "%u",
180 *((int *)RTA_DATA(rt_attr)));
181 break;
182 case RTA_METRICS:
183 sprintf(metrics, "%u",
184 *((int *)RTA_DATA(rt_attr)));
185 default:
186 break;
187 }
188 }
189 sprintf(dsts_len, "%d", rt_msg->rtm_dst_len);
190 route.dst = atoi(dsts);
191 route.dst_len = atoi(dsts_len);
192 route.gw = atoi(gws);
193 route.iface = atoi(ifs);
194 route.metric = atoi(metrics);
195		route.iface_name = alloca(IFNAMSIZ);
196 route.iface_name = if_indextoname(route.iface, route.iface_name);
197 route.mac = getmac(route.iface_name);
198 if (route.mac == -1)
199 int_exit(0);
200 assert(bpf_map_update_elem(tx_port_map_fd,
201 &route.iface, &route.iface, 0) == 0);
202 if (rtm_family == AF_INET) {
203 struct trie_value {
204 __u8 prefix[4];
205 __be64 value;
206 int ifindex;
207 int metric;
208 __be32 gw;
209 } *prefix_value;
210
211 prefix_key = alloca(sizeof(*prefix_key) + 3);
212 prefix_value = alloca(sizeof(*prefix_value));
213
214 prefix_key->prefixlen = 32;
215 prefix_key->prefixlen = route.dst_len;
216 direct_entry.mac = route.mac & 0xffffffffffff;
217 direct_entry.ifindex = route.iface;
218 direct_entry.arp.mac = 0;
219 direct_entry.arp.dst = 0;
220 if (route.dst_len == 32) {
221 if (nh->nlmsg_type == RTM_DELROUTE) {
222 assert(bpf_map_delete_elem(exact_match_map_fd,
223 &route.dst) == 0);
224 } else {
225 if (bpf_map_lookup_elem(arp_table_map_fd,
226 &route.dst,
227 &direct_entry.arp.mac) == 0)
228 direct_entry.arp.dst = route.dst;
229 assert(bpf_map_update_elem(exact_match_map_fd,
230 &route.dst,
231 &direct_entry, 0) == 0);
232 }
233 }
234 for (i = 0; i < 4; i++)
235 prefix_key->data[i] = (route.dst >> i * 8) & 0xff;
236
237 printf("%3d.%d.%d.%d\t\t%3x\t\t%d\t\t%d\t\t%s\n",
238 (int)prefix_key->data[0],
239 (int)prefix_key->data[1],
240 (int)prefix_key->data[2],
241 (int)prefix_key->data[3],
242 route.gw, route.dst_len,
243 route.metric,
244 route.iface_name);
245 if (bpf_map_lookup_elem(lpm_map_fd, prefix_key,
246 prefix_value) < 0) {
247 for (i = 0; i < 4; i++)
248 prefix_value->prefix[i] = prefix_key->data[i];
249 prefix_value->value = route.mac & 0xffffffffffff;
250 prefix_value->ifindex = route.iface;
251 prefix_value->gw = route.gw;
252 prefix_value->metric = route.metric;
253
254 assert(bpf_map_update_elem(lpm_map_fd,
255 prefix_key,
256 prefix_value, 0
257 ) == 0);
258 } else {
259 if (nh->nlmsg_type == RTM_DELROUTE) {
260 printf("deleting entry\n");
261 printf("prefix key=%d.%d.%d.%d/%d",
262 prefix_key->data[0],
263 prefix_key->data[1],
264 prefix_key->data[2],
265 prefix_key->data[3],
266 prefix_key->prefixlen);
267 assert(bpf_map_delete_elem(lpm_map_fd,
268 prefix_key
269 ) == 0);
270 /* Rereading the route table to check if
271 * there is an entry with the same
272					 * prefix but a different metric than the
273					 * deleted entry.
274 */
275 get_route_table(AF_INET);
276 } else if (prefix_key->data[0] ==
277 prefix_value->prefix[0] &&
278 prefix_key->data[1] ==
279 prefix_value->prefix[1] &&
280 prefix_key->data[2] ==
281 prefix_value->prefix[2] &&
282 prefix_key->data[3] ==
283 prefix_value->prefix[3] &&
284 route.metric >= prefix_value->metric) {
285 continue;
286 } else {
287 for (i = 0; i < 4; i++)
288 prefix_value->prefix[i] =
289 prefix_key->data[i];
290 prefix_value->value =
291 route.mac & 0xffffffffffff;
292 prefix_value->ifindex = route.iface;
293 prefix_value->gw = route.gw;
294 prefix_value->metric = route.metric;
295 assert(bpf_map_update_elem(lpm_map_fd,
296 prefix_key,
297 prefix_value,
298 0) == 0);
299 }
300 }
301 }
302 memset(&route, 0, sizeof(route));
303 memset(dsts, 0, sizeof(dsts));
304 memset(dsts_len, 0, sizeof(dsts_len));
305 memset(gws, 0, sizeof(gws));
306 memset(ifs, 0, sizeof(ifs));
307 memset(&route, 0, sizeof(route));
308 }
309}
310
311/* Function to read the existing route table when the process is launched */
312static int get_route_table(int rtm_family)
313{
314 struct sockaddr_nl sa;
315 struct nlmsghdr *nh;
316 int sock, seq = 0;
317 struct msghdr msg;
318 struct iovec iov;
319 int ret = 0;
320 int nll;
321
322 struct {
323 struct nlmsghdr nl;
324 struct rtmsg rt;
325 char buf[8192];
326 } req;
327
328 sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
329 if (sock < 0) {
330 printf("open netlink socket: %s\n", strerror(errno));
331 return -1;
332 }
333 memset(&sa, 0, sizeof(sa));
334 sa.nl_family = AF_NETLINK;
335 if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
336 printf("bind to netlink: %s\n", strerror(errno));
337 ret = -1;
338 goto cleanup;
339 }
340 memset(&req, 0, sizeof(req));
341 req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
342 req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
343 req.nl.nlmsg_type = RTM_GETROUTE;
344
345 req.rt.rtm_family = rtm_family;
346 req.rt.rtm_table = RT_TABLE_MAIN;
347 req.nl.nlmsg_pid = 0;
348 req.nl.nlmsg_seq = ++seq;
349 memset(&msg, 0, sizeof(msg));
350 iov.iov_base = (void *)&req.nl;
351 iov.iov_len = req.nl.nlmsg_len;
352 msg.msg_iov = &iov;
353 msg.msg_iovlen = 1;
354 ret = sendmsg(sock, &msg, 0);
355 if (ret < 0) {
356 printf("send to netlink: %s\n", strerror(errno));
357 ret = -1;
358 goto cleanup;
359 }
360 memset(buf, 0, sizeof(buf));
361 nll = recv_msg(sa, sock);
362 if (nll < 0) {
363 printf("recv from netlink: %s\n", strerror(nll));
364 ret = -1;
365 goto cleanup;
366 }
367 nh = (struct nlmsghdr *)buf;
368 read_route(nh, nll);
369cleanup:
370 close(sock);
371 return ret;
372}
373
374/* Function to parse the arp entry returned by netlink
375 * Updates the arp entry related map entries
376 */
377static void read_arp(struct nlmsghdr *nh, int nll)
378{
379 struct rtattr *rt_attr;
380 char dsts[24], mac[24];
381 struct ndmsg *rt_msg;
382 int rtl, ndm_family;
383
384 struct arp_table {
385 __be64 mac;
386 __be32 dst;
387 } arp_entry;
388 struct direct_map {
389 struct arp_table arp;
390 int ifindex;
391 __be64 mac;
392 } direct_entry;
393
394 if (nh->nlmsg_type == RTM_GETNEIGH)
395 printf("READING arp entry\n");
396 printf("Address\tHwAddress\n");
397 for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) {
398 rt_msg = (struct ndmsg *)NLMSG_DATA(nh);
399 rt_attr = (struct rtattr *)RTM_RTA(rt_msg);
400 ndm_family = rt_msg->ndm_family;
401 rtl = RTM_PAYLOAD(nh);
402 for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) {
403 switch (rt_attr->rta_type) {
404 case NDA_DST:
405 sprintf(dsts, "%u",
406 *((__be32 *)RTA_DATA(rt_attr)));
407 break;
408 case NDA_LLADDR:
409 sprintf(mac, "%lld",
410 *((__be64 *)RTA_DATA(rt_attr)));
411 break;
412 default:
413 break;
414 }
415 }
416 arp_entry.dst = atoi(dsts);
417 arp_entry.mac = atol(mac);
418 printf("%x\t\t%llx\n", arp_entry.dst, arp_entry.mac);
419 if (ndm_family == AF_INET) {
420 if (bpf_map_lookup_elem(exact_match_map_fd,
421 &arp_entry.dst,
422 &direct_entry) == 0) {
423 if (nh->nlmsg_type == RTM_DELNEIGH) {
424 direct_entry.arp.dst = 0;
425 direct_entry.arp.mac = 0;
426 } else if (nh->nlmsg_type == RTM_NEWNEIGH) {
427 direct_entry.arp.dst = arp_entry.dst;
428 direct_entry.arp.mac = arp_entry.mac;
429 }
430 assert(bpf_map_update_elem(exact_match_map_fd,
431 &arp_entry.dst,
432 &direct_entry, 0
433 ) == 0);
434 memset(&direct_entry, 0, sizeof(direct_entry));
435 }
436 if (nh->nlmsg_type == RTM_DELNEIGH) {
437 assert(bpf_map_delete_elem(arp_table_map_fd,
438 &arp_entry.dst) == 0);
439 } else if (nh->nlmsg_type == RTM_NEWNEIGH) {
440 assert(bpf_map_update_elem(arp_table_map_fd,
441 &arp_entry.dst,
442 &arp_entry.mac, 0
443 ) == 0);
444 }
445 }
446 memset(&arp_entry, 0, sizeof(arp_entry));
447 memset(dsts, 0, sizeof(dsts));
448 }
449}
450
451/* Function to read the existing arp table when the process is launched */
452static int get_arp_table(int rtm_family)
453{
454 struct sockaddr_nl sa;
455 struct nlmsghdr *nh;
456 int sock, seq = 0;
457 struct msghdr msg;
458 struct iovec iov;
459 int ret = 0;
460 int nll;
461 struct {
462 struct nlmsghdr nl;
463 struct ndmsg rt;
464 char buf[8192];
465 } req;
466
467 sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
468 if (sock < 0) {
469 printf("open netlink socket: %s\n", strerror(errno));
470 return -1;
471 }
472 memset(&sa, 0, sizeof(sa));
473 sa.nl_family = AF_NETLINK;
474 if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
475 printf("bind to netlink: %s\n", strerror(errno));
476 ret = -1;
477 goto cleanup;
478 }
479 memset(&req, 0, sizeof(req));
480	req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg));
481 req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
482 req.nl.nlmsg_type = RTM_GETNEIGH;
483 req.rt.ndm_state = NUD_REACHABLE;
484 req.rt.ndm_family = rtm_family;
485 req.nl.nlmsg_pid = 0;
486 req.nl.nlmsg_seq = ++seq;
487 memset(&msg, 0, sizeof(msg));
488 iov.iov_base = (void *)&req.nl;
489 iov.iov_len = req.nl.nlmsg_len;
490 msg.msg_iov = &iov;
491 msg.msg_iovlen = 1;
492 ret = sendmsg(sock, &msg, 0);
493 if (ret < 0) {
494 printf("send to netlink: %s\n", strerror(errno));
495 ret = -1;
496 goto cleanup;
497 }
498 memset(buf, 0, sizeof(buf));
499 nll = recv_msg(sa, sock);
500 if (nll < 0) {
501 printf("recv from netlink: %s\n", strerror(nll));
502 ret = -1;
503 goto cleanup;
504 }
505 nh = (struct nlmsghdr *)buf;
506 read_arp(nh, nll);
507cleanup:
508 close(sock);
509 return ret;
510}
511
512/* Function to keep track and update changes in route and arp table
513 * Give regular statistics of packets forwarded
514 */
515static int monitor_route(void)
516{
517 unsigned int nr_cpus = bpf_num_possible_cpus();
518 const unsigned int nr_keys = 256;
519 struct pollfd fds_route, fds_arp;
520 __u64 prev[nr_keys][nr_cpus];
521 struct sockaddr_nl la, lr;
522 __u64 values[nr_cpus];
523 struct nlmsghdr *nh;
524 int nll, ret = 0;
525 int interval = 5;
526 __u32 key;
527 int i;
528
529 sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
530 if (sock < 0) {
531 printf("open netlink socket: %s\n", strerror(errno));
532 return -1;
533 }
534
535 fcntl(sock, F_SETFL, O_NONBLOCK);
536 memset(&lr, 0, sizeof(lr));
537 lr.nl_family = AF_NETLINK;
538 lr.nl_groups = RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_ROUTE | RTMGRP_NOTIFY;
539 if (bind(sock, (struct sockaddr *)&lr, sizeof(lr)) < 0) {
540 printf("bind to netlink: %s\n", strerror(errno));
541 ret = -1;
542 goto cleanup;
543 }
544 fds_route.fd = sock;
545 fds_route.events = POLL_IN;
546
547 sock_arp = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
548 if (sock_arp < 0) {
549 printf("open netlink socket: %s\n", strerror(errno));
550 return -1;
551 }
552
553 fcntl(sock_arp, F_SETFL, O_NONBLOCK);
554 memset(&la, 0, sizeof(la));
555 la.nl_family = AF_NETLINK;
556 la.nl_groups = RTMGRP_NEIGH | RTMGRP_NOTIFY;
557 if (bind(sock_arp, (struct sockaddr *)&la, sizeof(la)) < 0) {
558 printf("bind to netlink: %s\n", strerror(errno));
559 ret = -1;
560 goto cleanup;
561 }
562 fds_arp.fd = sock_arp;
563 fds_arp.events = POLL_IN;
564
565 memset(prev, 0, sizeof(prev));
566 do {
567 signal(SIGINT, close_and_exit);
568 signal(SIGTERM, close_and_exit);
569
570 sleep(interval);
571 for (key = 0; key < nr_keys; key++) {
572 __u64 sum = 0;
573
574 assert(bpf_map_lookup_elem(rxcnt_map_fd,
575 &key, values) == 0);
576 for (i = 0; i < nr_cpus; i++)
577 sum += (values[i] - prev[key][i]);
578 if (sum)
579 printf("proto %u: %10llu pkt/s\n",
580 key, sum / interval);
581 memcpy(prev[key], values, sizeof(values));
582 }
583
584 memset(buf, 0, sizeof(buf));
585 if (poll(&fds_route, 1, 3) == POLL_IN) {
586 nll = recv_msg(lr, sock);
587 if (nll < 0) {
588 printf("recv from netlink: %s\n", strerror(nll));
589 ret = -1;
590 goto cleanup;
591 }
592
593 nh = (struct nlmsghdr *)buf;
594 printf("Routing table updated.\n");
595 read_route(nh, nll);
596 }
597 memset(buf, 0, sizeof(buf));
598 if (poll(&fds_arp, 1, 3) == POLL_IN) {
599 nll = recv_msg(la, sock_arp);
600 if (nll < 0) {
601 printf("recv from netlink: %s\n", strerror(nll));
602 ret = -1;
603 goto cleanup;
604 }
605
606 nh = (struct nlmsghdr *)buf;
607 read_arp(nh, nll);
608 }
609
610 } while (1);
611cleanup:
612 close(sock);
613 return ret;
614}
615
616static void usage(const char *prog)
617{
618 fprintf(stderr,
619		"%s: %s [OPTS] <IFNAME> [IFNAME ...]\n\n"
620 "OPTS:\n"
621 " -S use skb-mode\n"
622 " -F force loading prog\n",
623 __func__, prog);
624}
625
626int main(int ac, char **argv)
627{
628 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
629 struct bpf_prog_load_attr prog_load_attr = {
630 .prog_type = BPF_PROG_TYPE_XDP,
631 };
632 struct bpf_prog_info info = {};
633 __u32 info_len = sizeof(info);
634 const char *optstr = "SF";
635 struct bpf_object *obj;
636 char filename[256];
637 char **ifname_list;
638 int prog_fd, opt;
639 int err, i = 1;
640
641 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
642 prog_load_attr.file = filename;
643
644 total_ifindex = ac - 1;
645 ifname_list = (argv + 1);
646
647 while ((opt = getopt(ac, argv, optstr)) != -1) {
648 switch (opt) {
649 case 'S':
650 flags |= XDP_FLAGS_SKB_MODE;
651 total_ifindex--;
652 ifname_list++;
653 break;
654 case 'F':
655 flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
656 total_ifindex--;
657 ifname_list++;
658 break;
659 default:
660 usage(basename(argv[0]));
661 return 1;
662 }
663 }
664
665 if (!(flags & XDP_FLAGS_SKB_MODE))
666 flags |= XDP_FLAGS_DRV_MODE;
667
668 if (optind == ac) {
669 usage(basename(argv[0]));
670 return 1;
671 }
672
673 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
674 perror("setrlimit(RLIMIT_MEMLOCK)");
675 return 1;
676 }
677
678 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
679 return 1;
680
681 printf("\n**************loading bpf file*********************\n\n\n");
682 if (!prog_fd) {
683 printf("bpf_prog_load_xattr: %s\n", strerror(errno));
684 return 1;
685 }
686
687 lpm_map_fd = bpf_object__find_map_fd_by_name(obj, "lpm_map");
688 rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt");
689 arp_table_map_fd = bpf_object__find_map_fd_by_name(obj, "arp_table");
690 exact_match_map_fd = bpf_object__find_map_fd_by_name(obj,
691 "exact_match");
692 tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port");
693 if (lpm_map_fd < 0 || rxcnt_map_fd < 0 || arp_table_map_fd < 0 ||
694 exact_match_map_fd < 0 || tx_port_map_fd < 0) {
695 printf("bpf_object__find_map_fd_by_name failed\n");
696 return 1;
697 }
698
699	ifindex_list = (int *)calloc(total_ifindex, sizeof(int));
700 for (i = 0; i < total_ifindex; i++) {
701 ifindex_list[i] = if_nametoindex(ifname_list[i]);
702 if (!ifindex_list[i]) {
703			printf("Couldn't translate interface name: %s\n",
704			       strerror(errno));
705 return 1;
706 }
707 }
708	prog_id_list = (__u32 *)calloc(total_ifindex, sizeof(__u32));
709 for (i = 0; i < total_ifindex; i++) {
710 if (bpf_set_link_xdp_fd(ifindex_list[i], prog_fd, flags) < 0) {
711 printf("link set xdp fd failed\n");
712 int recovery_index = i;
713
714 for (i = 0; i < recovery_index; i++)
715 bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
716
717 return 1;
718 }
719 err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
720 if (err) {
721 printf("can't get prog info - %s\n", strerror(errno));
722 return err;
723 }
724 prog_id_list[i] = info.id;
725 memset(&info, 0, sizeof(info));
726 printf("Attached to %d\n", ifindex_list[i]);
727 }
728 signal(SIGINT, int_exit);
729 signal(SIGTERM, int_exit);
730
731 printf("*******************ROUTE TABLE*************************\n\n\n");
732 get_route_table(AF_INET);
733 printf("*******************ARP TABLE***************************\n\n\n");
734 get_arp_table(AF_INET);
735 if (monitor_route() < 0) {
736		printf("Error in receiving route update\n");
737 return 1;
738 }
739
740 return 0;
741}
diff --git a/samples/bpf/xdp_rxq_info_kern.c b/samples/bpf/xdp_rxq_info_kern.c
new file mode 100644
index 000000000..5e7459f9b
--- /dev/null
+++ b/samples/bpf/xdp_rxq_info_kern.c
@@ -0,0 +1,140 @@
1/* SPDX-License-Identifier: GPL-2.0
2 * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
3 *
4 * Example of how to extract XDP RX-queue info
5 */
6#include <uapi/linux/bpf.h>
7#include <uapi/linux/if_ether.h>
8#include <uapi/linux/in.h>
9#include <bpf/bpf_helpers.h>
10
11/* Config setup from userspace
12 *
13 * Userspace stores the ifindex in config_map so the program can verify
14 * that ctx->ingress_ifindex matches the configured ifindex
15 */
16struct config {
17 __u32 action;
18 int ifindex;
19 __u32 options;
20};
21enum cfg_options_flags {
22 NO_TOUCH = 0x0U,
23 READ_MEM = 0x1U,
24 SWAP_MAC = 0x2U,
25};
26
27struct {
28 __uint(type, BPF_MAP_TYPE_ARRAY);
29 __type(key, int);
30 __type(value, struct config);
31 __uint(max_entries, 1);
32} config_map SEC(".maps");
33
34/* Common stats data record (shared with userspace) */
35struct datarec {
36 __u64 processed;
37 __u64 issue;
38};
39
40struct {
41 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
42 __type(key, u32);
43 __type(value, struct datarec);
44 __uint(max_entries, 1);
45} stats_global_map SEC(".maps");
46
47#define MAX_RXQs 64
48
49/* Stats per rx_queue_index (per CPU) */
50struct {
51 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
52 __type(key, u32);
53 __type(value, struct datarec);
54 __uint(max_entries, MAX_RXQs + 1);
55} rx_queue_index_map SEC(".maps");
56
57static __always_inline
58void swap_src_dst_mac(void *data)
59{
60 unsigned short *p = data;
61 unsigned short dst[3];
62
63 dst[0] = p[0];
64 dst[1] = p[1];
65 dst[2] = p[2];
66 p[0] = p[3];
67 p[1] = p[4];
68 p[2] = p[5];
69 p[3] = dst[0];
70 p[4] = dst[1];
71 p[5] = dst[2];
72}
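/* swap_src_dst_mac() exchanges the two MAC addresses in place, treating
 * the Ethernet header as six 16-bit words; this matters for XDP_TX,
 * where the frame must be readdressed to its sender or hardware may
 * drop it.
 */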
73
74SEC("xdp_prog0")
75int xdp_prognum0(struct xdp_md *ctx)
76{
77 void *data_end = (void *)(long)ctx->data_end;
78 void *data = (void *)(long)ctx->data;
79 struct datarec *rec, *rxq_rec;
80 int ingress_ifindex;
81 struct config *config;
82 u32 key = 0;
83
84 /* Global stats record */
85 rec = bpf_map_lookup_elem(&stats_global_map, &key);
86 if (!rec)
87 return XDP_ABORTED;
88 rec->processed++;
89
90	/* Accessing ctx->ingress_ifindex causes the kernel to rewrite the
91	 * BPF instructions to access xdp_rxq->dev->ifindex instead
92	 */
93 ingress_ifindex = ctx->ingress_ifindex;
94
95 config = bpf_map_lookup_elem(&config_map, &key);
96 if (!config)
97 return XDP_ABORTED;
98
99 /* Simple test: check ctx provided ifindex is as expected */
100 if (ingress_ifindex != config->ifindex) {
101 /* count this error case */
102 rec->issue++;
103 return XDP_ABORTED;
104 }
105
106	/* Update stats per rx_queue_index. Handle the case where
107	 * rx_queue_index is larger than the stats map can hold.
108	 */
109 key = ctx->rx_queue_index;
110 if (key >= MAX_RXQs)
111 key = MAX_RXQs;
112 rxq_rec = bpf_map_lookup_elem(&rx_queue_index_map, &key);
113 if (!rxq_rec)
114 return XDP_ABORTED;
115 rxq_rec->processed++;
116 if (key == MAX_RXQs)
117 rxq_rec->issue++;
118
119 /* Default: Don't touch packet data, only count packets */
120 if (unlikely(config->options & (READ_MEM|SWAP_MAC))) {
121 struct ethhdr *eth = data;
122
123 if (eth + 1 > data_end)
124 return XDP_ABORTED;
125
126 /* Avoid compiler removing this: Drop non 802.3 Ethertypes */
127 if (ntohs(eth->h_proto) < ETH_P_802_3_MIN)
128 return XDP_ABORTED;
129
130 /* XDP_TX requires changing MAC-addrs, else HW may drop.
131 * Can also be enabled with --swapmac (for test purposes)
132 */
133 if (unlikely(config->options & SWAP_MAC))
134 swap_src_dst_mac(data);
135 }
136
137 return config->action;
138}
139
140char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c
new file mode 100644
index 000000000..93fa1bc54
--- /dev/null
+++ b/samples/bpf/xdp_rxq_info_user.c
@@ -0,0 +1,605 @@
1/* SPDX-License-Identifier: GPL-2.0
2 * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
3 */
4static const char *__doc__ = " XDP RX-queue info extract example\n\n"
5 "Monitor how many packets per sec (pps) are received\n"
6 "per NIC RX queue index and which CPU processed the packet\n"
7 ;
8
9#include <errno.h>
10#include <signal.h>
11#include <stdio.h>
12#include <stdlib.h>
13#include <stdbool.h>
14#include <string.h>
15#include <unistd.h>
16#include <locale.h>
17#include <sys/resource.h>
18#include <getopt.h>
19#include <net/if.h>
20#include <time.h>
21
22#include <arpa/inet.h>
23#include <linux/if_link.h>
24
25#include <bpf/bpf.h>
26#include <bpf/libbpf.h>
27#include "bpf_util.h"
28
29static int ifindex = -1;
30static char ifname_buf[IF_NAMESIZE];
31static char *ifname;
32static __u32 prog_id;
33
34static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
35
36static struct bpf_map *stats_global_map;
37static struct bpf_map *rx_queue_index_map;
38
39/* Exit return codes */
40#define EXIT_OK 0
41#define EXIT_FAIL 1
42#define EXIT_FAIL_OPTION 2
43#define EXIT_FAIL_XDP 3
44#define EXIT_FAIL_BPF 4
45#define EXIT_FAIL_MEM 5
46
47static const struct option long_options[] = {
48 {"help", no_argument, NULL, 'h' },
49 {"dev", required_argument, NULL, 'd' },
50 {"skb-mode", no_argument, NULL, 'S' },
51 {"sec", required_argument, NULL, 's' },
52 {"no-separators", no_argument, NULL, 'z' },
53 {"action", required_argument, NULL, 'a' },
54 {"readmem", no_argument, NULL, 'r' },
55 {"swapmac", no_argument, NULL, 'm' },
56 {"force", no_argument, NULL, 'F' },
57 {0, 0, NULL, 0 }
58};
59
60static void int_exit(int sig)
61{
62 __u32 curr_prog_id = 0;
63
64 if (ifindex > -1) {
65 if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) {
66 printf("bpf_get_link_xdp_id failed\n");
67 exit(EXIT_FAIL);
68 }
69 if (prog_id == curr_prog_id) {
70 fprintf(stderr,
71 "Interrupted: Removing XDP program on ifindex:%d device:%s\n",
72 ifindex, ifname);
73 bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
74 } else if (!curr_prog_id) {
75			printf("couldn't find a prog id on the given iface\n");
76 } else {
77 printf("program on interface changed, not removing\n");
78 }
79 }
80 exit(EXIT_OK);
81}
82
83struct config {
84 __u32 action;
85 int ifindex;
86 __u32 options;
87};
88enum cfg_options_flags {
89 NO_TOUCH = 0x0U,
90 READ_MEM = 0x1U,
91 SWAP_MAC = 0x2U,
92};
93#define XDP_ACTION_MAX (XDP_TX + 1)
94#define XDP_ACTION_MAX_STRLEN 11
95static const char *xdp_action_names[XDP_ACTION_MAX] = {
96 [XDP_ABORTED] = "XDP_ABORTED",
97 [XDP_DROP] = "XDP_DROP",
98 [XDP_PASS] = "XDP_PASS",
99 [XDP_TX] = "XDP_TX",
100};
101
102static const char *action2str(int action)
103{
104 if (action < XDP_ACTION_MAX)
105 return xdp_action_names[action];
106 return NULL;
107}
108
109static int parse_xdp_action(char *action_str)
110{
111 size_t maxlen;
112 __u64 action = -1;
113 int i;
114
115 for (i = 0; i < XDP_ACTION_MAX; i++) {
116 maxlen = XDP_ACTION_MAX_STRLEN;
117 if (strncmp(xdp_action_names[i], action_str, maxlen) == 0) {
118 action = i;
119 break;
120 }
121 }
122 return action;
123}
124
125static void list_xdp_actions(void)
126{
127 int i;
128
129 printf("Available XDP --action <options>\n");
130 for (i = 0; i < XDP_ACTION_MAX; i++)
131 printf("\t%s\n", xdp_action_names[i]);
132 printf("\n");
133}
134
135static char *options2str(enum cfg_options_flags flag)
136{
137 if (flag == NO_TOUCH)
138 return "no_touch";
139 if (flag & SWAP_MAC)
140 return "swapmac";
141 if (flag & READ_MEM)
142 return "read";
143	fprintf(stderr, "ERR: Unknown config option flags\n");
144 exit(EXIT_FAIL);
145}
146
147static void usage(char *argv[])
148{
149 int i;
150
151 printf("\nDOCUMENTATION:\n%s\n", __doc__);
152 printf(" Usage: %s (options-see-below)\n", argv[0]);
153 printf(" Listing options:\n");
154 for (i = 0; long_options[i].name != 0; i++) {
155 printf(" --%-12s", long_options[i].name);
156 if (long_options[i].flag != NULL)
157 printf(" flag (internal value:%d)",
158 *long_options[i].flag);
159 else
160 printf(" short-option: -%c",
161 long_options[i].val);
162 printf("\n");
163 }
164 printf("\n");
165 list_xdp_actions();
166}
167
168#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
169static __u64 gettime(void)
170{
171 struct timespec t;
172 int res;
173
174 res = clock_gettime(CLOCK_MONOTONIC, &t);
175 if (res < 0) {
176		fprintf(stderr, "Error with clock_gettime! (%i)\n", res);
177 exit(EXIT_FAIL);
178 }
179 return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
180}
181
182/* Common stats data record shared with _kern.c */
183struct datarec {
184 __u64 processed;
185 __u64 issue;
186};
187struct record {
188 __u64 timestamp;
189 struct datarec total;
190 struct datarec *cpu;
191};
192struct stats_record {
193 struct record stats;
194 struct record *rxq;
195};
196
197static struct datarec *alloc_record_per_cpu(void)
198{
199 unsigned int nr_cpus = bpf_num_possible_cpus();
200 struct datarec *array;
201
202 array = calloc(nr_cpus, sizeof(struct datarec));
203 if (!array) {
204 fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
205 exit(EXIT_FAIL_MEM);
206 }
207 return array;
208}
209
210static struct record *alloc_record_per_rxq(void)
211{
212 unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
213 struct record *array;
214
215 array = calloc(nr_rxqs, sizeof(struct record));
216 if (!array) {
217 fprintf(stderr, "Mem alloc error (nr_rxqs:%u)\n", nr_rxqs);
218 exit(EXIT_FAIL_MEM);
219 }
220 return array;
221}
222
223static struct stats_record *alloc_stats_record(void)
224{
225 unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
226 struct stats_record *rec;
227 int i;
228
229 rec = calloc(1, sizeof(struct stats_record));
230 if (!rec) {
231 fprintf(stderr, "Mem alloc error\n");
232 exit(EXIT_FAIL_MEM);
233 }
234 rec->rxq = alloc_record_per_rxq();
235 for (i = 0; i < nr_rxqs; i++)
236 rec->rxq[i].cpu = alloc_record_per_cpu();
237
238 rec->stats.cpu = alloc_record_per_cpu();
239 return rec;
240}
241
242static void free_stats_record(struct stats_record *r)
243{
244 unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
245 int i;
246
247 for (i = 0; i < nr_rxqs; i++)
248 free(r->rxq[i].cpu);
249
250 free(r->rxq);
251 free(r->stats.cpu);
252 free(r);
253}
254
255static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
256{
257 /* For percpu maps, userspace gets a value per possible CPU */
258 unsigned int nr_cpus = bpf_num_possible_cpus();
259 struct datarec values[nr_cpus];
260 __u64 sum_processed = 0;
261 __u64 sum_issue = 0;
262 int i;
263
264 if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
265 fprintf(stderr,
266 "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
267 return false;
268 }
269 /* Get time as close as possible to reading map contents */
270 rec->timestamp = gettime();
271
272 /* Record and sum values from each CPU */
273 for (i = 0; i < nr_cpus; i++) {
274 rec->cpu[i].processed = values[i].processed;
275 sum_processed += values[i].processed;
276 rec->cpu[i].issue = values[i].issue;
277 sum_issue += values[i].issue;
278 }
279 rec->total.processed = sum_processed;
280 rec->total.issue = sum_issue;
281 return true;
282}
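/* The same per-CPU read pattern in isolation, as a minimal sketch
 * (map_fd and the __u64 value type are assumptions for illustration):
 *
 *	unsigned int nr_cpus = bpf_num_possible_cpus();
 *	__u64 vals[nr_cpus], total = 0;
 *	__u32 key = 0;
 *	int i;
 *
 *	if (bpf_map_lookup_elem(map_fd, &key, vals) == 0)
 *		for (i = 0; i < nr_cpus; i++)
 *			total += vals[i];
 */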
283
284static void stats_collect(struct stats_record *rec)
285{
286 int fd, i, max_rxqs;
287
288 fd = bpf_map__fd(stats_global_map);
289 map_collect_percpu(fd, 0, &rec->stats);
290
291 fd = bpf_map__fd(rx_queue_index_map);
292 max_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
293 for (i = 0; i < max_rxqs; i++)
294 map_collect_percpu(fd, i, &rec->rxq[i]);
295}
296
297static double calc_period(struct record *r, struct record *p)
298{
299 double period_ = 0;
300 __u64 period = 0;
301
302 period = r->timestamp - p->timestamp;
303 if (period > 0)
304 period_ = ((double) period / NANOSEC_PER_SEC);
305
306 return period_;
307}
308
309static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
310{
311 __u64 packets = 0;
312 __u64 pps = 0;
313
314 if (period_ > 0) {
315 packets = r->processed - p->processed;
316 pps = packets / period_;
317 }
318 return pps;
319}
320
321static __u64 calc_errs_pps(struct datarec *r,
322 struct datarec *p, double period_)
323{
324 __u64 packets = 0;
325 __u64 pps = 0;
326
327 if (period_ > 0) {
328 packets = r->issue - p->issue;
329 pps = packets / period_;
330 }
331 return pps;
332}
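/* Worked example: with snapshots 2.0 s apart (period_ == 2.0) and a
 * processed delta of 2,000,000 packets, calc_pps() reports
 * 1,000,000 pkt/s; calc_errs_pps() applies the same math to the issue
 * counter.
 */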
333
334static void stats_print(struct stats_record *stats_rec,
335 struct stats_record *stats_prev,
336 int action, __u32 cfg_opt)
337{
338 unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
339 unsigned int nr_cpus = bpf_num_possible_cpus();
340 double pps = 0, err = 0;
341 struct record *rec, *prev;
342 double t;
343 int rxq;
344 int i;
345
346 /* Header */
347 printf("\nRunning XDP on dev:%s (ifindex:%d) action:%s options:%s\n",
348 ifname, ifindex, action2str(action), options2str(cfg_opt));
349
350 /* stats_global_map */
351 {
352 char *fmt_rx = "%-15s %-7d %'-11.0f %'-10.0f %s\n";
353 char *fm2_rx = "%-15s %-7s %'-11.0f\n";
354 char *errstr = "";
355
356 printf("%-15s %-7s %-11s %-11s\n",
357 "XDP stats", "CPU", "pps", "issue-pps");
358
359 rec = &stats_rec->stats;
360 prev = &stats_prev->stats;
361 t = calc_period(rec, prev);
362 for (i = 0; i < nr_cpus; i++) {
363 struct datarec *r = &rec->cpu[i];
364 struct datarec *p = &prev->cpu[i];
365
366			pps = calc_pps(r, p, t);
367 err = calc_errs_pps(r, p, t);
368 if (err > 0)
369 errstr = "invalid-ifindex";
370 if (pps > 0)
371 printf(fmt_rx, "XDP-RX CPU",
372 i, pps, err, errstr);
373 }
374		pps = calc_pps(&rec->total, &prev->total, t);
375 err = calc_errs_pps(&rec->total, &prev->total, t);
376 printf(fm2_rx, "XDP-RX CPU", "total", pps, err);
377 }
378
379 /* rx_queue_index_map */
380 printf("\n%-15s %-7s %-11s %-11s\n",
381 "RXQ stats", "RXQ:CPU", "pps", "issue-pps");
382
383 for (rxq = 0; rxq < nr_rxqs; rxq++) {
384 char *fmt_rx = "%-15s %3d:%-3d %'-11.0f %'-10.0f %s\n";
385 char *fm2_rx = "%-15s %3d:%-3s %'-11.0f\n";
386 char *errstr = "";
387 int rxq_ = rxq;
388
389		/* Last RXQ in map catches overflows */
390 if (rxq_ == nr_rxqs - 1)
391 rxq_ = -1;
392
393 rec = &stats_rec->rxq[rxq];
394 prev = &stats_prev->rxq[rxq];
395 t = calc_period(rec, prev);
396 for (i = 0; i < nr_cpus; i++) {
397 struct datarec *r = &rec->cpu[i];
398 struct datarec *p = &prev->cpu[i];
399
400			pps = calc_pps(r, p, t);
401 err = calc_errs_pps(r, p, t);
402 if (err > 0) {
403 if (rxq_ == -1)
404 errstr = "map-overflow-RXQ";
405 else
406 errstr = "err";
407 }
408 if (pps > 0)
409 printf(fmt_rx, "rx_queue_index",
410 rxq_, i, pps, err, errstr);
411 }
412		pps = calc_pps(&rec->total, &prev->total, t);
413 err = calc_errs_pps(&rec->total, &prev->total, t);
414 if (pps || err)
415 printf(fm2_rx, "rx_queue_index", rxq_, "sum", pps, err);
416 }
417}
418
419
420/* Pointer swap trick */
421static inline void swap(struct stats_record **a, struct stats_record **b)
422{
423 struct stats_record *tmp;
424
425 tmp = *a;
426 *a = *b;
427 *b = tmp;
428}
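/* Double-buffering: each interval the record/prev pointers are swapped,
 * so the freshly collected snapshot is diffed against the previous one
 * without copying the underlying per-CPU arrays.
 */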
429
430static void stats_poll(int interval, int action, __u32 cfg_opt)
431{
432 struct stats_record *record, *prev;
433
434 record = alloc_stats_record();
435 prev = alloc_stats_record();
436 stats_collect(record);
437
438 while (1) {
439 swap(&prev, &record);
440 stats_collect(record);
441 stats_print(record, prev, action, cfg_opt);
442 sleep(interval);
443 }
444
445 free_stats_record(record);
446 free_stats_record(prev);
447}
448
449
450int main(int argc, char **argv)
451{
452	__u32 cfg_options = NO_TOUCH; /* Default: Don't touch packet memory */
453 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
454 struct bpf_prog_load_attr prog_load_attr = {
455 .prog_type = BPF_PROG_TYPE_XDP,
456 };
457 struct bpf_prog_info info = {};
458 __u32 info_len = sizeof(info);
459 int prog_fd, map_fd, opt, err;
460 bool use_separators = true;
461 struct config cfg = { 0 };
462 struct bpf_object *obj;
463 struct bpf_map *map;
464 char filename[256];
465 int longindex = 0;
466 int interval = 2;
467 __u32 key = 0;
468
469
470 char action_str_buf[XDP_ACTION_MAX_STRLEN + 1 /* for \0 */] = { 0 };
471 int action = XDP_PASS; /* Default action */
472 char *action_str = NULL;
473
474 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
475 prog_load_attr.file = filename;
476
477 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
478 perror("setrlimit(RLIMIT_MEMLOCK)");
479 return 1;
480 }
481
482 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
483 return EXIT_FAIL;
484
485 map = bpf_object__find_map_by_name(obj, "config_map");
486 stats_global_map = bpf_object__find_map_by_name(obj, "stats_global_map");
487 rx_queue_index_map = bpf_object__find_map_by_name(obj, "rx_queue_index_map");
488 if (!map || !stats_global_map || !rx_queue_index_map) {
489 printf("finding a map in obj file failed\n");
490 return EXIT_FAIL;
491 }
492 map_fd = bpf_map__fd(map);
493
494 if (!prog_fd) {
495 fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n", strerror(errno));
496 return EXIT_FAIL;
497 }
498
499	/* Parse command line args */
500 while ((opt = getopt_long(argc, argv, "FhSrmzd:s:a:",
501 long_options, &longindex)) != -1) {
502 switch (opt) {
503 case 'd':
504 if (strlen(optarg) >= IF_NAMESIZE) {
505 fprintf(stderr, "ERR: --dev name too long\n");
506 goto error;
507 }
508 ifname = (char *)&ifname_buf;
509 strncpy(ifname, optarg, IF_NAMESIZE);
510 ifindex = if_nametoindex(ifname);
511 if (ifindex == 0) {
512 fprintf(stderr,
513 "ERR: --dev name unknown err(%d):%s\n",
514 errno, strerror(errno));
515 goto error;
516 }
517 break;
518 case 's':
519 interval = atoi(optarg);
520 break;
521 case 'S':
522 xdp_flags |= XDP_FLAGS_SKB_MODE;
523 break;
524 case 'z':
525 use_separators = false;
526 break;
527 case 'a':
528 action_str = (char *)&action_str_buf;
529 strncpy(action_str, optarg, XDP_ACTION_MAX_STRLEN);
530 break;
531 case 'r':
532 cfg_options |= READ_MEM;
533 break;
534 case 'm':
535 cfg_options |= SWAP_MAC;
536 break;
537 case 'F':
538 xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
539 break;
540 case 'h':
541 error:
542 default:
543 usage(argv);
544 return EXIT_FAIL_OPTION;
545 }
546 }
547
548 if (!(xdp_flags & XDP_FLAGS_SKB_MODE))
549 xdp_flags |= XDP_FLAGS_DRV_MODE;
550
551 /* Required option */
552 if (ifindex == -1) {
553 fprintf(stderr, "ERR: required option --dev missing\n");
554 usage(argv);
555 return EXIT_FAIL_OPTION;
556 }
557 cfg.ifindex = ifindex;
558
559 /* Parse action string */
560 if (action_str) {
561 action = parse_xdp_action(action_str);
562 if (action < 0) {
563 fprintf(stderr, "ERR: Invalid XDP --action: %s\n",
564 action_str);
565 list_xdp_actions();
566 return EXIT_FAIL_OPTION;
567 }
568 }
569 cfg.action = action;
570
571 /* XDP_TX requires changing MAC-addrs, else HW may drop */
572 if (action == XDP_TX)
573 cfg_options |= SWAP_MAC;
574 cfg.options = cfg_options;
575
576	/* Trick to pretty-print with thousands separators: use %' */
577 if (use_separators)
578 setlocale(LC_NUMERIC, "en_US");
579
580 /* User-side setup ifindex in config_map */
581 err = bpf_map_update_elem(map_fd, &key, &cfg, 0);
582 if (err) {
583 fprintf(stderr, "Store config failed (err:%d)\n", err);
584 exit(EXIT_FAIL_BPF);
585 }
586
587 /* Remove XDP program when program is interrupted or killed */
588 signal(SIGINT, int_exit);
589 signal(SIGTERM, int_exit);
590
591 if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
592 fprintf(stderr, "link set xdp fd failed\n");
593 return EXIT_FAIL_XDP;
594 }
595
596 err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
597 if (err) {
598 printf("can't get prog info - %s\n", strerror(errno));
599 return err;
600 }
601 prog_id = info.id;
602
603 stats_poll(interval, action, cfg_options);
604 return EXIT_OK;
605}
diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c
new file mode 100644
index 000000000..9cf76b340
--- /dev/null
+++ b/samples/bpf/xdp_sample_pkts_kern.c
@@ -0,0 +1,57 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <linux/ptrace.h>
3#include <linux/version.h>
4#include <uapi/linux/bpf.h>
5#include <bpf/bpf_helpers.h>
6
7#define SAMPLE_SIZE 64ul
8
9struct {
10 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
11 __uint(key_size, sizeof(int));
12 __uint(value_size, sizeof(u32));
13} my_map SEC(".maps");
14
15SEC("xdp_sample")
16int xdp_sample_prog(struct xdp_md *ctx)
17{
18 void *data_end = (void *)(long)ctx->data_end;
19 void *data = (void *)(long)ctx->data;
20
21 /* Metadata will be in the perf event before the packet data. */
22 struct S {
23 u16 cookie;
24 u16 pkt_len;
25 } __packed metadata;
26
27 if (data < data_end) {
28 /* The XDP perf_event_output handler will use the upper 32 bits
29 * of the flags argument as a number of bytes to include of the
30 * packet payload in the event data. If the size is too big, the
31 * call to bpf_perf_event_output will fail and return -EFAULT.
32 *
33 * See bpf_xdp_event_output in net/core/filter.c.
34 *
35 * The BPF_F_CURRENT_CPU flag means that the event output fd
36 * will be indexed by the CPU number in the event map.
37 */
38 u64 flags = BPF_F_CURRENT_CPU;
39 u16 sample_size;
40 int ret;
41
42 metadata.cookie = 0xdead;
43 metadata.pkt_len = (u16)(data_end - data);
44 sample_size = min(metadata.pkt_len, SAMPLE_SIZE);
45 flags |= (u64)sample_size << 32;
46
47 ret = bpf_perf_event_output(ctx, &my_map, flags,
48 &metadata, sizeof(metadata));
49 if (ret)
50 bpf_printk("perf_event_output failed: %d\n", ret);
51 }
52
53 return XDP_PASS;
54}
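/* The perf event payload seen by userspace is struct S followed by
 * sample_size bytes of packet data; the __packed reader struct in
 * xdp_sample_pkts_user.c (cookie, pkt_len, pkt_data[]) mirrors this
 * layout.
 */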
55
56char _license[] SEC("license") = "GPL";
57u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c
new file mode 100644
index 000000000..4b2a300c7
--- /dev/null
+++ b/samples/bpf/xdp_sample_pkts_user.c
@@ -0,0 +1,202 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <stdio.h>
3#include <stdlib.h>
4#include <string.h>
5#include <linux/perf_event.h>
6#include <linux/bpf.h>
7#include <net/if.h>
8#include <errno.h>
9#include <assert.h>
10#include <sys/sysinfo.h>
11#include <sys/ioctl.h>
12#include <signal.h>
13#include <bpf/libbpf.h>
14#include <bpf/bpf.h>
15#include <sys/resource.h>
16#include <libgen.h>
17#include <linux/if_link.h>
18
19#include "perf-sys.h"
20
21static int if_idx;
22static char *if_name;
23static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
24static __u32 prog_id;
25static struct perf_buffer *pb = NULL;
26
27static int do_attach(int idx, int fd, const char *name)
28{
29 struct bpf_prog_info info = {};
30 __u32 info_len = sizeof(info);
31 int err;
32
33 err = bpf_set_link_xdp_fd(idx, fd, xdp_flags);
34 if (err < 0) {
35 printf("ERROR: failed to attach program to %s\n", name);
36 return err;
37 }
38
39 err = bpf_obj_get_info_by_fd(fd, &info, &info_len);
40 if (err) {
41 printf("can't get prog info - %s\n", strerror(errno));
42 return err;
43 }
44 prog_id = info.id;
45
46 return err;
47}
48
49static int do_detach(int idx, const char *name)
50{
51 __u32 curr_prog_id = 0;
52 int err = 0;
53
54 err = bpf_get_link_xdp_id(idx, &curr_prog_id, xdp_flags);
55 if (err) {
56 printf("bpf_get_link_xdp_id failed\n");
57 return err;
58 }
59 if (prog_id == curr_prog_id) {
60 err = bpf_set_link_xdp_fd(idx, -1, xdp_flags);
61 if (err < 0)
62 printf("ERROR: failed to detach prog from %s\n", name);
63 } else if (!curr_prog_id) {
64		printf("couldn't find a prog id on %s\n", name);
65 } else {
66 printf("program on interface changed, not removing\n");
67 }
68
69 return err;
70}
71
72#define SAMPLE_SIZE 64
73
74static void print_bpf_output(void *ctx, int cpu, void *data, __u32 size)
75{
76 struct {
77 __u16 cookie;
78 __u16 pkt_len;
79 __u8 pkt_data[SAMPLE_SIZE];
80 } __packed *e = data;
81 int i;
82
83 if (e->cookie != 0xdead) {
84 printf("BUG cookie %x sized %d\n", e->cookie, size);
85 return;
86 }
87
88 printf("Pkt len: %-5d bytes. Ethernet hdr: ", e->pkt_len);
89 for (i = 0; i < 14 && i < e->pkt_len; i++)
90 printf("%02x ", e->pkt_data[i]);
91 printf("\n");
92}
93
94static void sig_handler(int signo)
95{
96 do_detach(if_idx, if_name);
97 perf_buffer__free(pb);
98 exit(0);
99}
100
101static void usage(const char *prog)
102{
103 fprintf(stderr,
104 "%s: %s [OPTS] <ifname|ifindex>\n\n"
105 "OPTS:\n"
106 " -F force loading prog\n",
107 __func__, prog);
108}
109
110int main(int argc, char **argv)
111{
112 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
113 struct bpf_prog_load_attr prog_load_attr = {
114 .prog_type = BPF_PROG_TYPE_XDP,
115 };
116 struct perf_buffer_opts pb_opts = {};
117 const char *optstr = "FS";
118 int prog_fd, map_fd, opt;
119 struct bpf_object *obj;
120 struct bpf_map *map;
121 char filename[256];
122 int ret, err;
123
124 while ((opt = getopt(argc, argv, optstr)) != -1) {
125 switch (opt) {
126 case 'F':
127 xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
128 break;
129 case 'S':
130 xdp_flags |= XDP_FLAGS_SKB_MODE;
131 break;
132 default:
133 usage(basename(argv[0]));
134 return 1;
135 }
136 }
137
138 if (!(xdp_flags & XDP_FLAGS_SKB_MODE))
139 xdp_flags |= XDP_FLAGS_DRV_MODE;
140
141 if (optind == argc) {
142 usage(basename(argv[0]));
143 return 1;
144 }
145
146 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
147 perror("setrlimit(RLIMIT_MEMLOCK)");
148 return 1;
149 }
150
151 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
152 prog_load_attr.file = filename;
153
154 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
155 return 1;
156
157 if (!prog_fd) {
158 printf("bpf_prog_load_xattr: %s\n", strerror(errno));
159 return 1;
160 }
161
162 map = bpf_map__next(NULL, obj);
163 if (!map) {
164 printf("finding a map in obj file failed\n");
165 return 1;
166 }
167 map_fd = bpf_map__fd(map);
168
169 if_idx = if_nametoindex(argv[optind]);
170 if (!if_idx)
171 if_idx = strtoul(argv[optind], NULL, 0);
172
173 if (!if_idx) {
174 fprintf(stderr, "Invalid ifname\n");
175 return 1;
176 }
177 if_name = argv[optind];
178 err = do_attach(if_idx, prog_fd, if_name);
179 if (err)
180 return err;
181
182 if (signal(SIGINT, sig_handler) ||
183 signal(SIGHUP, sig_handler) ||
184 signal(SIGTERM, sig_handler)) {
185 perror("signal");
186 return 1;
187 }
188
189 pb_opts.sample_cb = print_bpf_output;
190 pb = perf_buffer__new(map_fd, 8, &pb_opts);
191 err = libbpf_get_error(pb);
192 if (err) {
193 fprintf(stderr, "perf_buffer setup failed: %s\n", strerror(-err));
194 return 1;
195 }
196
197 while ((ret = perf_buffer__poll(pb, 1000)) >= 0) {
198 }
199
200 kill(0, SIGINT);
201 return ret;
202}
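
The loader above only installs a sample_cb; when the perf ring overflows, records are dropped silently. The perf_buffer_opts struct in this libbpf version also carries a lost_cb that reports the per-CPU drop count. A minimal sketch (the handler name is illustrative, not part of the sample):

	static void print_bpf_lost(void *ctx, int cpu, __u64 cnt)
	{
		fprintf(stderr, "lost %llu samples on CPU %d\n", cnt, cpu);
	}

	/* in main(), next to the existing sample_cb assignment: */
	pb_opts.sample_cb = print_bpf_output;
	pb_opts.lost_cb = print_bpf_lost;
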
diff --git a/samples/bpf/xdp_tx_iptunnel_common.h b/samples/bpf/xdp_tx_iptunnel_common.h
new file mode 100644
index 000000000..be839892c
--- /dev/null
+++ b/samples/bpf/xdp_tx_iptunnel_common.h
@@ -0,0 +1,34 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/* Copyright (c) 2016 Facebook
3 */
4#ifndef _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H
5#define _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H
6
7#include <linux/types.h>
8
9#define MAX_IPTNL_ENTRIES 256U
10
11struct vip {
12 union {
13 __u32 v6[4];
14 __u32 v4;
15 } daddr;
16 __u16 dport;
17 __u16 family;
18 __u8 protocol;
19};
20
21struct iptnl_info {
22 union {
23 __u32 v6[4];
24 __u32 v4;
25 } saddr;
26 union {
27 __u32 v6[4];
28 __u32 v4;
29 } daddr;
30 __u16 family;
31 __u8 dmac[6];
32};
33
34#endif
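
Both sides of the sample hash struct vip, so field byte order must match: the user-space loader stores the address from inet_pton() and the port via htons(), and the XDP program copies iph->daddr and the transport dest port, all already in network order. A sketch of a v4 key as either side would build it (address and port are arbitrary examples):

	struct vip key = {
		.daddr.v4 = htonl(0x0a0a0a20),	/* 10.10.10.32 */
		.dport	  = htons(80),
		.family	  = AF_INET,
		.protocol = IPPROTO_TCP,
	};
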
diff --git a/samples/bpf/xdp_tx_iptunnel_kern.c b/samples/bpf/xdp_tx_iptunnel_kern.c
new file mode 100644
index 000000000..575d57e4b
--- /dev/null
+++ b/samples/bpf/xdp_tx_iptunnel_kern.c
@@ -0,0 +1,237 @@
1/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program shows how to use bpf_xdp_adjust_head() by
8 * encapsulating the incoming packet in an IPv4/v6 header
9 * and then XDP_TX it out.
10 */
11#define KBUILD_MODNAME "foo"
12#include <uapi/linux/bpf.h>
13#include <linux/in.h>
14#include <linux/if_ether.h>
15#include <linux/if_packet.h>
16#include <linux/if_vlan.h>
17#include <linux/ip.h>
18#include <linux/ipv6.h>
19#include <bpf/bpf_helpers.h>
20#include "xdp_tx_iptunnel_common.h"
21
22struct {
23 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
24 __type(key, __u32);
25 __type(value, __u64);
26 __uint(max_entries, 256);
27} rxcnt SEC(".maps");
28
29struct {
30 __uint(type, BPF_MAP_TYPE_HASH);
31 __type(key, struct vip);
32 __type(value, struct iptnl_info);
33 __uint(max_entries, MAX_IPTNL_ENTRIES);
34} vip2tnl SEC(".maps");
35
36static __always_inline void count_tx(u32 protocol)
37{
38 u64 *rxcnt_count;
39
40 rxcnt_count = bpf_map_lookup_elem(&rxcnt, &protocol);
41 if (rxcnt_count)
42 *rxcnt_count += 1;
43}
44
45static __always_inline int get_dport(void *trans_data, void *data_end,
46 u8 protocol)
47{
48 struct tcphdr *th;
49 struct udphdr *uh;
50
51 switch (protocol) {
52 case IPPROTO_TCP:
53 th = (struct tcphdr *)trans_data;
54 if (th + 1 > data_end)
55 return -1;
56 return th->dest;
57 case IPPROTO_UDP:
58 uh = (struct udphdr *)trans_data;
59 if (uh + 1 > data_end)
60 return -1;
61 return uh->dest;
62 default:
63 return 0;
64 }
65}
66
67static __always_inline void set_ethhdr(struct ethhdr *new_eth,
68 const struct ethhdr *old_eth,
69 const struct iptnl_info *tnl,
70 __be16 h_proto)
71{
72 memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source));
73 memcpy(new_eth->h_dest, tnl->dmac, sizeof(new_eth->h_dest));
74 new_eth->h_proto = h_proto;
75}
76
77static __always_inline int handle_ipv4(struct xdp_md *xdp)
78{
79 void *data_end = (void *)(long)xdp->data_end;
80 void *data = (void *)(long)xdp->data;
81 struct iptnl_info *tnl;
82 struct ethhdr *new_eth;
83 struct ethhdr *old_eth;
84 struct iphdr *iph = data + sizeof(struct ethhdr);
85 u16 *next_iph_u16;
86 u16 payload_len;
87 struct vip vip = {};
88 int dport;
89 u32 csum = 0;
90 int i;
91
92 if (iph + 1 > data_end)
93 return XDP_DROP;
94
95 dport = get_dport(iph + 1, data_end, iph->protocol);
96 if (dport == -1)
97 return XDP_DROP;
98
99 vip.protocol = iph->protocol;
100 vip.family = AF_INET;
101 vip.daddr.v4 = iph->daddr;
102 vip.dport = dport;
103 payload_len = ntohs(iph->tot_len);
104
105 tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
106 /* It only does v4-in-v4 */
107 if (!tnl || tnl->family != AF_INET)
108 return XDP_PASS;
109
110 /* The vip key is found. Add an IP header and send it out */
111
112 if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr)))
113 return XDP_DROP;
114
115 data = (void *)(long)xdp->data;
116 data_end = (void *)(long)xdp->data_end;
117
118 new_eth = data;
119 iph = data + sizeof(*new_eth);
120 old_eth = data + sizeof(*iph);
121
122 if (new_eth + 1 > data_end ||
123 old_eth + 1 > data_end ||
124 iph + 1 > data_end)
125 return XDP_DROP;
126
127 set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IP));
128
129 iph->version = 4;
130 iph->ihl = sizeof(*iph) >> 2;
131 iph->frag_off = 0;
132 iph->protocol = IPPROTO_IPIP;
133 iph->check = 0;
134 iph->tos = 0;
135 iph->tot_len = htons(payload_len + sizeof(*iph));
136 iph->daddr = tnl->daddr.v4;
137 iph->saddr = tnl->saddr.v4;
138 iph->ttl = 8;
139
140 next_iph_u16 = (u16 *)iph;
141#pragma clang loop unroll(full)
142 for (i = 0; i < sizeof(*iph) >> 1; i++)
143 csum += *next_iph_u16++;
144
145 iph->check = ~((csum & 0xffff) + (csum >> 16));
146
147 count_tx(vip.protocol);
148
149 return XDP_TX;
150}
151
152static __always_inline int handle_ipv6(struct xdp_md *xdp)
153{
154 void *data_end = (void *)(long)xdp->data_end;
155 void *data = (void *)(long)xdp->data;
156 struct iptnl_info *tnl;
157 struct ethhdr *new_eth;
158 struct ethhdr *old_eth;
159 struct ipv6hdr *ip6h = data + sizeof(struct ethhdr);
160 __u16 payload_len;
161 struct vip vip = {};
162 int dport;
163
164 if (ip6h + 1 > data_end)
165 return XDP_DROP;
166
167 dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr);
168 if (dport == -1)
169 return XDP_DROP;
170
171 vip.protocol = ip6h->nexthdr;
172 vip.family = AF_INET6;
173 memcpy(vip.daddr.v6, ip6h->daddr.s6_addr32, sizeof(vip.daddr));
174 vip.dport = dport;
175 payload_len = ip6h->payload_len;
176
177 tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
178 /* It only does v6-in-v6 */
179 if (!tnl || tnl->family != AF_INET6)
180 return XDP_PASS;
181
182 /* The vip key is found. Add an IP header and send it out */
183
184 if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr)))
185 return XDP_DROP;
186
187 data = (void *)(long)xdp->data;
188 data_end = (void *)(long)xdp->data_end;
189
190 new_eth = data;
191 ip6h = data + sizeof(*new_eth);
192 old_eth = data + sizeof(*ip6h);
193
194 if (new_eth + 1 > data_end ||
195 old_eth + 1 > data_end ||
196 ip6h + 1 > data_end)
197 return XDP_DROP;
198
199 set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IPV6));
200
201 ip6h->version = 6;
202 ip6h->priority = 0;
203 memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
204 ip6h->payload_len = htons(ntohs(payload_len) + sizeof(*ip6h));
205 ip6h->nexthdr = IPPROTO_IPV6;
206 ip6h->hop_limit = 8;
207 memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6));
208 memcpy(ip6h->daddr.s6_addr32, tnl->daddr.v6, sizeof(tnl->daddr.v6));
209
210 count_tx(vip.protocol);
211
212 return XDP_TX;
213}
214
215SEC("xdp_tx_iptunnel")
216int _xdp_tx_iptunnel(struct xdp_md *xdp)
217{
218 void *data_end = (void *)(long)xdp->data_end;
219 void *data = (void *)(long)xdp->data;
220 struct ethhdr *eth = data;
221 __u16 h_proto;
222
223 if (eth + 1 > data_end)
224 return XDP_DROP;
225
226 h_proto = eth->h_proto;
227
228 if (h_proto == htons(ETH_P_IP))
229 return handle_ipv4(xdp);
230
231 if (h_proto == htons(ETH_P_IPV6))
232 return handle_ipv6(xdp);
233
234 return XDP_PASS;
235}
236
237char _license[] SEC("license") = "GPL";
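
The checksum loop in handle_ipv4() is the standard 16-bit one's-complement sum over the new 20-byte header, with iph->check pre-zeroed so it does not contribute. A user-space sketch of the same computation, with the carry fold written as a loop so a rare double carry is also absorbed:

	static __u16 iph_csum(const struct iphdr *iph)
	{
		const __u16 *p = (const __u16 *)iph;
		__u32 csum = 0;
		unsigned int i;

		for (i = 0; i < sizeof(*iph) / 2; i++)	/* ten 16-bit words */
			csum += p[i];
		while (csum >> 16)			/* fold carries back in */
			csum = (csum & 0xffff) + (csum >> 16);
		return ~csum;
	}
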
diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c
new file mode 100644
index 000000000..a419bee15
--- /dev/null
+++ b/samples/bpf/xdp_tx_iptunnel_user.c
@@ -0,0 +1,314 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/* Copyright (c) 2016 Facebook
3 */
4#include <linux/bpf.h>
5#include <linux/if_link.h>
6#include <assert.h>
7#include <errno.h>
8#include <signal.h>
9#include <stdio.h>
10#include <stdlib.h>
11#include <string.h>
12#include <net/if.h>
13#include <sys/resource.h>
14#include <arpa/inet.h>
15#include <netinet/ether.h>
16#include <unistd.h>
17#include <time.h>
18#include <bpf/libbpf.h>
19#include <bpf/bpf.h>
20#include "bpf_util.h"
21#include "xdp_tx_iptunnel_common.h"
22
23#define STATS_INTERVAL_S 2U
24
25static int ifindex = -1;
26static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
27static int rxcnt_map_fd;
28static __u32 prog_id;
29
30static void int_exit(int sig)
31{
32 __u32 curr_prog_id = 0;
33
34 if (ifindex > -1) {
35 if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) {
36 printf("bpf_get_link_xdp_id failed\n");
37 exit(1);
38 }
39 if (prog_id == curr_prog_id)
40 bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
41 else if (!curr_prog_id)
42 printf("couldn't find a prog id on the given iface\n");
43 else
44 printf("program on interface changed, not removing\n");
45 }
46 exit(0);
47}
48
49/* simple per-protocol drop counter
50 */
51static void poll_stats(unsigned int kill_after_s)
52{
53 const unsigned int nr_protos = 256;
54 unsigned int nr_cpus = bpf_num_possible_cpus();
55 time_t started_at = time(NULL);
56 __u64 values[nr_cpus], prev[nr_protos][nr_cpus];
57 __u32 proto;
58 int i;
59
60 memset(prev, 0, sizeof(prev));
61
62 while (!kill_after_s || time(NULL) - started_at <= kill_after_s) {
63 sleep(STATS_INTERVAL_S);
64
65 for (proto = 0; proto < nr_protos; proto++) {
66 __u64 sum = 0;
67
68 assert(bpf_map_lookup_elem(rxcnt_map_fd, &proto,
69 values) == 0);
70 for (i = 0; i < nr_cpus; i++)
71 sum += (values[i] - prev[proto][i]);
72
73 if (sum)
74 printf("proto %u: sum:%10llu pkts, rate:%10llu pkts/s\n",
75 proto, sum, sum / STATS_INTERVAL_S);
76 memcpy(prev[proto], values, sizeof(values));
77 }
78 }
79}
80
81static void usage(const char *cmd)
82{
83 printf("Start a XDP prog which encapsulates incoming packets\n"
84 "in an IPv4/v6 header and XDP_TX it out. The dst <VIP:PORT>\n"
85 "is used to select packets to encapsulate\n\n");
86 printf("Usage: %s [...]\n", cmd);
87 printf(" -i <ifname|ifindex> Interface\n");
88 printf(" -a <vip-service-address> IPv4 or IPv6\n");
89 printf(" -p <vip-service-port> A port range (e.g. 433-444) is also allowed\n");
90 printf(" -s <source-ip> Used in the IPTunnel header\n");
91 printf(" -d <dest-ip> Used in the IPTunnel header\n");
92 printf(" -m <dest-MAC> Used in sending the IP Tunneled pkt\n");
93 printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n");
94 printf(" -P <IP-Protocol> Default is TCP\n");
95 printf(" -S use skb-mode\n");
96 printf(" -N enforce native mode\n");
97 printf(" -F Force loading the XDP prog\n");
98 printf(" -h Display this help\n");
99}
100
101static int parse_ipstr(const char *ipstr, unsigned int *addr)
102{
103 if (inet_pton(AF_INET6, ipstr, addr) == 1) {
104 return AF_INET6;
105 } else if (inet_pton(AF_INET, ipstr, addr) == 1) {
106 addr[1] = addr[2] = addr[3] = 0;
107 return AF_INET;
108 }
109
110 fprintf(stderr, "%s is an invalid IP\n", ipstr);
111 return AF_UNSPEC;
112}
113
114static int parse_ports(const char *port_str, int *min_port, int *max_port)
115{
116 char *end;
117 long tmp_min_port;
118 long tmp_max_port;
119
120 tmp_min_port = strtol(port_str, &end, 10);
121 if (tmp_min_port < 1 || tmp_min_port > 65535) {
122 fprintf(stderr, "Invalid port(s): %s\n", port_str);
123 return 1;
124 }
125
126 if (*end == '-') {
127 end++;
128 tmp_max_port = strtol(end, NULL, 10);
129 if (tmp_max_port < 1 || tmp_max_port > 65535) {
130 fprintf(stderr, "Invalid port(s): %s\n", port_str);
131 return 1;
132 }
133 } else {
134 tmp_max_port = tmp_min_port;
135 }
136
137 if (tmp_min_port > tmp_max_port) {
138 fprintf(stderr, "Invalid port(s): %s\n", port_str);
139 return 1;
140 }
141
142 if (tmp_max_port - tmp_min_port + 1 > MAX_IPTNL_ENTRIES) {
143 fprintf(stderr, "Port range (%s) is larger than %u\n",
144 port_str, MAX_IPTNL_ENTRIES);
145 return 1;
146 }
147 *min_port = tmp_min_port;
148 *max_port = tmp_max_port;
149
150 return 0;
151}
152
153int main(int argc, char **argv)
154{
155 struct bpf_prog_load_attr prog_load_attr = {
156 .prog_type = BPF_PROG_TYPE_XDP,
157 };
158 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
159 int min_port = 0, max_port = 0, vip2tnl_map_fd;
160 const char *optstr = "i:a:p:s:d:m:T:P:FSNh";
161 unsigned char opt_flags[256] = {};
162 struct bpf_prog_info info = {};
163 __u32 info_len = sizeof(info);
164 unsigned int kill_after_s = 0;
165 struct iptnl_info tnl = {};
166 struct bpf_object *obj;
167 struct vip vip = {};
168 char filename[256];
169 int opt, prog_fd;
170 int i, err;
171
172 tnl.family = AF_UNSPEC;
173 vip.protocol = IPPROTO_TCP;
174
175 for (i = 0; i < strlen(optstr); i++)
176 if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z')
177 opt_flags[(unsigned char)optstr[i]] = 1;
178
179 while ((opt = getopt(argc, argv, optstr)) != -1) {
180 unsigned short family;
181 unsigned int *v6;
182
183 switch (opt) {
184 case 'i':
185 ifindex = if_nametoindex(optarg);
186 if (!ifindex)
187 ifindex = atoi(optarg);
188 break;
189 case 'a':
190 vip.family = parse_ipstr(optarg, vip.daddr.v6);
191 if (vip.family == AF_UNSPEC)
192 return 1;
193 break;
194 case 'p':
195 if (parse_ports(optarg, &min_port, &max_port))
196 return 1;
197 break;
198 case 'P':
199 vip.protocol = atoi(optarg);
200 break;
201 case 's':
202 case 'd':
203 if (opt == 's')
204 v6 = tnl.saddr.v6;
205 else
206 v6 = tnl.daddr.v6;
207
208 family = parse_ipstr(optarg, v6);
209 if (family == AF_UNSPEC)
210 return 1;
211 if (tnl.family == AF_UNSPEC) {
212 tnl.family = family;
213 } else if (tnl.family != family) {
214 fprintf(stderr,
215 "The IP version of the src and dst addresses used in the IP encapsulation does not match\n");
216 return 1;
217 }
218 break;
219 case 'm':
220 if (!ether_aton_r(optarg,
221 (struct ether_addr *)tnl.dmac)) {
222 fprintf(stderr, "Invalid mac address:%s\n",
223 optarg);
224 return 1;
225 }
226 break;
227 case 'T':
228 kill_after_s = atoi(optarg);
229 break;
230 case 'S':
231 xdp_flags |= XDP_FLAGS_SKB_MODE;
232 break;
233 case 'N':
234 /* default, set below */
235 break;
236 case 'F':
237 xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
238 break;
239 default:
240 usage(argv[0]);
241 return 1;
242 }
243 opt_flags[opt] = 0;
244 }
245
246 if (!(xdp_flags & XDP_FLAGS_SKB_MODE))
247 xdp_flags |= XDP_FLAGS_DRV_MODE;
248
249 for (i = 0; i < strlen(optstr); i++) {
250 if (opt_flags[(unsigned int)optstr[i]]) {
251 fprintf(stderr, "Missing argument -%c\n", optstr[i]);
252 usage(argv[0]);
253 return 1;
254 }
255 }
256
257 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
258 perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
259 return 1;
260 }
261
262 if (!ifindex) {
263 fprintf(stderr, "Invalid ifname\n");
264 return 1;
265 }
266
267 snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
268 prog_load_attr.file = filename;
269
270 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
271 return 1;
272
273 if (!prog_fd) {
274 printf("bpf_prog_load_xattr: %s\n", strerror(errno));
275 return 1;
276 }
277
278 rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt");
279 vip2tnl_map_fd = bpf_object__find_map_fd_by_name(obj, "vip2tnl");
280 if (vip2tnl_map_fd < 0 || rxcnt_map_fd < 0) {
281 printf("bpf_object__find_map_fd_by_name failed\n");
282 return 1;
283 }
284
285 signal(SIGINT, int_exit);
286 signal(SIGTERM, int_exit);
287
288 while (min_port <= max_port) {
289 vip.dport = htons(min_port++);
290 if (bpf_map_update_elem(vip2tnl_map_fd, &vip, &tnl,
291 BPF_NOEXIST)) {
292 perror("bpf_map_update_elem(&vip2tnl)");
293 return 1;
294 }
295 }
296
297 if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
298 printf("link set xdp fd failed\n");
299 return 1;
300 }
301
302 err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
303 if (err) {
304 printf("can't get prog info - %s\n", strerror(errno));
305 return err;
306 }
307 prog_id = info.id;
308
309 poll_stats(kill_after_s);
310
311 bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
312
313 return 0;
314}
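
parse_ports() accepts either a single port or an inclusive range, and each port in the range becomes one vip2tnl map entry, which is why the range is capped at MAX_IPTNL_ENTRIES. A small stand-alone sketch of the expansion:

	int min_port, max_port;

	if (!parse_ports("8000-8003", &min_port, &max_port))
		printf("ports %d..%d -> %d map entries\n",
		       min_port, max_port, max_port - min_port + 1);
	/* prints: ports 8000..8003 -> 4 map entries */
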
diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h
new file mode 100644
index 000000000..b7eca15c7
--- /dev/null
+++ b/samples/bpf/xdpsock.h
@@ -0,0 +1,11 @@
1/* SPDX-License-Identifier: GPL-2.0
2 *
3 * Copyright(c) 2019 Intel Corporation.
4 */
5
6#ifndef XDPSOCK_H_
7#define XDPSOCK_H_
8
9#define MAX_SOCKS 4
10
11#endif /* XDPSOCK_H_ */
diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c
new file mode 100644
index 000000000..054304843
--- /dev/null
+++ b/samples/bpf/xdpsock_kern.c
@@ -0,0 +1,24 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <linux/bpf.h>
3#include <bpf/bpf_helpers.h>
4#include "xdpsock.h"
5
6/* This XDP program is only needed for the XDP_SHARED_UMEM mode.
7 * If you do not use this mode, libbpf can supply an XDP program for you.
8 */
9
10struct {
11 __uint(type, BPF_MAP_TYPE_XSKMAP);
12 __uint(max_entries, MAX_SOCKS);
13 __uint(key_size, sizeof(int));
14 __uint(value_size, sizeof(int));
15} xsks_map SEC(".maps");
16
17static unsigned int rr;
18
19SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
20{
21 rr = (rr + 1) & (MAX_SOCKS - 1);
22
23 return bpf_redirect_map(&xsks_map, rr, XDP_DROP);
24}
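
The `& (MAX_SOCKS - 1)` above is a modulo that only works because MAX_SOCKS (4) is a power of two; if you raise MAX_SOCKS, keep it a power of two or switch to `%`. A sketch of the resulting round-robin sequence:

	/* starting from rr = 0, successive packets are redirected to
	 * sockets 1, 2, 3, 0, 1, 2, ... */
	unsigned int rr = 0, i;

	for (i = 0; i < 8; i++)
		rr = (rr + 1) & (MAX_SOCKS - 1);  /* == (rr + 1) % MAX_SOCKS */
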
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
new file mode 100644
index 000000000..cf5b0a895
--- /dev/null
+++ b/samples/bpf/xdpsock_user.c
@@ -0,0 +1,1550 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright(c) 2017 - 2018 Intel Corporation. */
3
4#include <asm/barrier.h>
5#include <errno.h>
6#include <getopt.h>
7#include <libgen.h>
8#include <linux/bpf.h>
9#include <linux/compiler.h>
10#include <linux/if_link.h>
11#include <linux/if_xdp.h>
12#include <linux/if_ether.h>
13#include <linux/ip.h>
14#include <linux/limits.h>
15#include <linux/udp.h>
16#include <arpa/inet.h>
17#include <locale.h>
18#include <net/ethernet.h>
19#include <net/if.h>
20#include <poll.h>
21#include <pthread.h>
22#include <signal.h>
23#include <stdbool.h>
24#include <stdio.h>
25#include <stdlib.h>
26#include <string.h>
27#include <sys/mman.h>
28#include <sys/resource.h>
29#include <sys/socket.h>
30#include <sys/types.h>
31#include <time.h>
32#include <unistd.h>
33
34#include <bpf/libbpf.h>
35#include <bpf/xsk.h>
36#include <bpf/bpf.h>
37#include "xdpsock.h"
38
39#ifndef SOL_XDP
40#define SOL_XDP 283
41#endif
42
43#ifndef AF_XDP
44#define AF_XDP 44
45#endif
46
47#ifndef PF_XDP
48#define PF_XDP AF_XDP
49#endif
50
51#define NUM_FRAMES (4 * 1024)
52#define MIN_PKT_SIZE 64
53
54#define DEBUG_HEXDUMP 0
55
56typedef __u64 u64;
57typedef __u32 u32;
58typedef __u16 u16;
59typedef __u8 u8;
60
61static unsigned long prev_time;
62
63enum benchmark_type {
64 BENCH_RXDROP = 0,
65 BENCH_TXONLY = 1,
66 BENCH_L2FWD = 2,
67};
68
69static enum benchmark_type opt_bench = BENCH_RXDROP;
70static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
71static const char *opt_if = "";
72static int opt_ifindex;
73static int opt_queue;
74static unsigned long opt_duration;
75static unsigned long start_time;
76static bool benchmark_done;
77static u32 opt_batch_size = 64;
78static int opt_pkt_count;
79static u16 opt_pkt_size = MIN_PKT_SIZE;
80static u32 opt_pkt_fill_pattern = 0x12345678;
81static bool opt_extra_stats;
82static bool opt_quiet;
83static bool opt_app_stats;
84static const char *opt_irq_str = "";
85static u32 irq_no;
86static int irqs_at_init = -1;
87static int opt_poll;
88static int opt_interval = 1;
89static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP;
90static u32 opt_umem_flags;
91static int opt_unaligned_chunks;
92static int opt_mmap_flags;
93static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
94static int opt_timeout = 1000;
95static bool opt_need_wakeup = true;
96static u32 opt_num_xsks = 1;
97static u32 prog_id;
98
99struct xsk_ring_stats {
100 unsigned long rx_npkts;
101 unsigned long tx_npkts;
102 unsigned long rx_dropped_npkts;
103 unsigned long rx_invalid_npkts;
104 unsigned long tx_invalid_npkts;
105 unsigned long rx_full_npkts;
106 unsigned long rx_fill_empty_npkts;
107 unsigned long tx_empty_npkts;
108 unsigned long prev_rx_npkts;
109 unsigned long prev_tx_npkts;
110 unsigned long prev_rx_dropped_npkts;
111 unsigned long prev_rx_invalid_npkts;
112 unsigned long prev_tx_invalid_npkts;
113 unsigned long prev_rx_full_npkts;
114 unsigned long prev_rx_fill_empty_npkts;
115 unsigned long prev_tx_empty_npkts;
116};
117
118struct xsk_driver_stats {
119 unsigned long intrs;
120 unsigned long prev_intrs;
121};
122
123struct xsk_app_stats {
124 unsigned long rx_empty_polls;
125 unsigned long fill_fail_polls;
126 unsigned long copy_tx_sendtos;
127 unsigned long tx_wakeup_sendtos;
128 unsigned long opt_polls;
129 unsigned long prev_rx_empty_polls;
130 unsigned long prev_fill_fail_polls;
131 unsigned long prev_copy_tx_sendtos;
132 unsigned long prev_tx_wakeup_sendtos;
133 unsigned long prev_opt_polls;
134};
135
136struct xsk_umem_info {
137 struct xsk_ring_prod fq;
138 struct xsk_ring_cons cq;
139 struct xsk_umem *umem;
140 void *buffer;
141};
142
143struct xsk_socket_info {
144 struct xsk_ring_cons rx;
145 struct xsk_ring_prod tx;
146 struct xsk_umem_info *umem;
147 struct xsk_socket *xsk;
148 struct xsk_ring_stats ring_stats;
149 struct xsk_app_stats app_stats;
150 struct xsk_driver_stats drv_stats;
151 u32 outstanding_tx;
152};
153
154static int num_socks;
155struct xsk_socket_info *xsks[MAX_SOCKS];
156
157static unsigned long get_nsecs(void)
158{
159 struct timespec ts;
160
161 clock_gettime(CLOCK_MONOTONIC, &ts);
162 return ts.tv_sec * 1000000000UL + ts.tv_nsec;
163}
164
165static void print_benchmark(bool running)
166{
167 const char *bench_str = "INVALID";
168
169 if (opt_bench == BENCH_RXDROP)
170 bench_str = "rxdrop";
171 else if (opt_bench == BENCH_TXONLY)
172 bench_str = "txonly";
173 else if (opt_bench == BENCH_L2FWD)
174 bench_str = "l2fwd";
175
176 printf("%s:%d %s ", opt_if, opt_queue, bench_str);
177 if (opt_xdp_flags & XDP_FLAGS_SKB_MODE)
178 printf("xdp-skb ");
179 else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE)
180 printf("xdp-drv ");
181 else
182 printf(" ");
183
184 if (opt_poll)
185 printf("poll() ");
186
187 if (running) {
188 printf("running...");
189 fflush(stdout);
190 }
191}
192
193static int xsk_get_xdp_stats(int fd, struct xsk_socket_info *xsk)
194{
195 struct xdp_statistics stats;
196 socklen_t optlen;
197 int err;
198
199 optlen = sizeof(stats);
200 err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen);
201 if (err)
202 return err;
203
204 if (optlen == sizeof(struct xdp_statistics)) {
205 xsk->ring_stats.rx_dropped_npkts = stats.rx_dropped;
206 xsk->ring_stats.rx_invalid_npkts = stats.rx_invalid_descs;
207 xsk->ring_stats.tx_invalid_npkts = stats.tx_invalid_descs;
208 xsk->ring_stats.rx_full_npkts = stats.rx_ring_full;
209 xsk->ring_stats.rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs;
210 xsk->ring_stats.tx_empty_npkts = stats.tx_ring_empty_descs;
211 return 0;
212 }
213
214 return -EINVAL;
215}
216
217static void dump_app_stats(long dt)
218{
219 int i;
220
221 for (i = 0; i < num_socks && xsks[i]; i++) {
222 char *fmt = "%-18s %'-14.0f %'-14lu\n";
223 double rx_empty_polls_ps, fill_fail_polls_ps, copy_tx_sendtos_ps,
224 tx_wakeup_sendtos_ps, opt_polls_ps;
225
226 rx_empty_polls_ps = (xsks[i]->app_stats.rx_empty_polls -
227 xsks[i]->app_stats.prev_rx_empty_polls) * 1000000000. / dt;
228 fill_fail_polls_ps = (xsks[i]->app_stats.fill_fail_polls -
229 xsks[i]->app_stats.prev_fill_fail_polls) * 1000000000. / dt;
230 copy_tx_sendtos_ps = (xsks[i]->app_stats.copy_tx_sendtos -
231 xsks[i]->app_stats.prev_copy_tx_sendtos) * 1000000000. / dt;
232 tx_wakeup_sendtos_ps = (xsks[i]->app_stats.tx_wakeup_sendtos -
233 xsks[i]->app_stats.prev_tx_wakeup_sendtos)
234 * 1000000000. / dt;
235 opt_polls_ps = (xsks[i]->app_stats.opt_polls -
236 xsks[i]->app_stats.prev_opt_polls) * 1000000000. / dt;
237
238 printf("\n%-18s %-14s %-14s\n", "", "calls/s", "count");
239 printf(fmt, "rx empty polls", rx_empty_polls_ps, xsks[i]->app_stats.rx_empty_polls);
240 printf(fmt, "fill fail polls", fill_fail_polls_ps,
241 xsks[i]->app_stats.fill_fail_polls);
242 printf(fmt, "copy tx sendtos", copy_tx_sendtos_ps,
243 xsks[i]->app_stats.copy_tx_sendtos);
244 printf(fmt, "tx wakeup sendtos", tx_wakeup_sendtos_ps,
245 xsks[i]->app_stats.tx_wakeup_sendtos);
246 printf(fmt, "opt polls", opt_polls_ps, xsks[i]->app_stats.opt_polls);
247
248 xsks[i]->app_stats.prev_rx_empty_polls = xsks[i]->app_stats.rx_empty_polls;
249 xsks[i]->app_stats.prev_fill_fail_polls = xsks[i]->app_stats.fill_fail_polls;
250 xsks[i]->app_stats.prev_copy_tx_sendtos = xsks[i]->app_stats.copy_tx_sendtos;
251 xsks[i]->app_stats.prev_tx_wakeup_sendtos = xsks[i]->app_stats.tx_wakeup_sendtos;
252 xsks[i]->app_stats.prev_opt_polls = xsks[i]->app_stats.opt_polls;
253 }
254}
255
256static bool get_interrupt_number(void)
257{
258 FILE *f_int_proc;
259 char line[4096];
260 bool found = false;
261
262 f_int_proc = fopen("/proc/interrupts", "r");
263 if (f_int_proc == NULL) {
264 printf("Failed to open /proc/interrupts.\n");
265 return found;
266 }
267
268 while (!feof(f_int_proc) && !found) {
269 /* Make sure to read a full line at a time */
270 if (fgets(line, sizeof(line), f_int_proc) == NULL ||
271 line[strlen(line) - 1] != '\n') {
272 printf("Error reading from interrupts file\n");
273 break;
274 }
275
276 /* Extract interrupt number from line */
277 if (strstr(line, opt_irq_str) != NULL) {
278 irq_no = atoi(line);
279 found = true;
280 break;
281 }
282 }
283
284 fclose(f_int_proc);
285
286 return found;
287}
288
289static int get_irqs(void)
290{
291 char count_path[PATH_MAX];
292 int total_intrs = -1;
293 FILE *f_count_proc;
294 char line[4096];
295
296 snprintf(count_path, sizeof(count_path),
297 "/sys/kernel/irq/%i/per_cpu_count", irq_no);
298 f_count_proc = fopen(count_path, "r");
299 if (f_count_proc == NULL) {
300 printf("Failed to open %s\n", count_path);
301 return total_intrs;
302 }
303
304 if (fgets(line, sizeof(line), f_count_proc) == NULL ||
305 line[strlen(line) - 1] != '\n') {
306 printf("Error reading from %s\n", count_path);
307 } else {
308 static const char com[2] = ",";
309 char *token;
310
311 total_intrs = 0;
312 token = strtok(line, com);
313 while (token != NULL) {
314 /* sum up interrupts across all cores */
315 total_intrs += atoi(token);
316 token = strtok(NULL, com);
317 }
318 }
319
320 fclose(f_count_proc);
321
322 return total_intrs;
323}
324
325static void dump_driver_stats(long dt)
326{
327 int i;
328
329 for (i = 0; i < num_socks && xsks[i]; i++) {
330 char *fmt = "%-18s %'-14.0f %'-14lu\n";
331 double intrs_ps;
332 int n_ints = get_irqs();
333
334 if (n_ints < 0) {
335 printf("error getting intr info for intr %i\n", irq_no);
336 return;
337 }
338 xsks[i]->drv_stats.intrs = n_ints - irqs_at_init;
339
340 intrs_ps = (xsks[i]->drv_stats.intrs - xsks[i]->drv_stats.prev_intrs) *
341 1000000000. / dt;
342
343 printf("\n%-18s %-14s %-14s\n", "", "intrs/s", "count");
344 printf(fmt, "irqs", intrs_ps, xsks[i]->drv_stats.intrs);
345
346 xsks[i]->drv_stats.prev_intrs = xsks[i]->drv_stats.intrs;
347 }
348}
349
350static void dump_stats(void)
351{
352 unsigned long now = get_nsecs();
353 long dt = now - prev_time;
354 int i;
355
356 prev_time = now;
357
358 for (i = 0; i < num_socks && xsks[i]; i++) {
359 char *fmt = "%-18s %'-14.0f %'-14lu\n";
360 double rx_pps, tx_pps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps,
361 tx_invalid_pps, tx_empty_pps;
362
363 rx_pps = (xsks[i]->ring_stats.rx_npkts - xsks[i]->ring_stats.prev_rx_npkts) *
364 1000000000. / dt;
365 tx_pps = (xsks[i]->ring_stats.tx_npkts - xsks[i]->ring_stats.prev_tx_npkts) *
366 1000000000. / dt;
367
368 printf("\n sock%d@", i);
369 print_benchmark(false);
370 printf("\n");
371
372 printf("%-18s %-14s %-14s %-14.2f\n", "", "pps", "pkts",
373 dt / 1000000000.);
374 printf(fmt, "rx", rx_pps, xsks[i]->ring_stats.rx_npkts);
375 printf(fmt, "tx", tx_pps, xsks[i]->ring_stats.tx_npkts);
376
377 xsks[i]->ring_stats.prev_rx_npkts = xsks[i]->ring_stats.rx_npkts;
378 xsks[i]->ring_stats.prev_tx_npkts = xsks[i]->ring_stats.tx_npkts;
379
380 if (opt_extra_stats) {
381 if (!xsk_get_xdp_stats(xsk_socket__fd(xsks[i]->xsk), xsks[i])) {
382 dropped_pps = (xsks[i]->ring_stats.rx_dropped_npkts -
383 xsks[i]->ring_stats.prev_rx_dropped_npkts) *
384 1000000000. / dt;
385 rx_invalid_pps = (xsks[i]->ring_stats.rx_invalid_npkts -
386 xsks[i]->ring_stats.prev_rx_invalid_npkts) *
387 1000000000. / dt;
388 tx_invalid_pps = (xsks[i]->ring_stats.tx_invalid_npkts -
389 xsks[i]->ring_stats.prev_tx_invalid_npkts) *
390 1000000000. / dt;
391 full_pps = (xsks[i]->ring_stats.rx_full_npkts -
392 xsks[i]->ring_stats.prev_rx_full_npkts) *
393 1000000000. / dt;
394 fill_empty_pps = (xsks[i]->ring_stats.rx_fill_empty_npkts -
395 xsks[i]->ring_stats.prev_rx_fill_empty_npkts) *
396 1000000000. / dt;
397 tx_empty_pps = (xsks[i]->ring_stats.tx_empty_npkts -
398 xsks[i]->ring_stats.prev_tx_empty_npkts) *
399 1000000000. / dt;
400
401 printf(fmt, "rx dropped", dropped_pps,
402 xsks[i]->ring_stats.rx_dropped_npkts);
403 printf(fmt, "rx invalid", rx_invalid_pps,
404 xsks[i]->ring_stats.rx_invalid_npkts);
405 printf(fmt, "tx invalid", tx_invalid_pps,
406 xsks[i]->ring_stats.tx_invalid_npkts);
407 printf(fmt, "rx queue full", full_pps,
408 xsks[i]->ring_stats.rx_full_npkts);
409 printf(fmt, "fill ring empty", fill_empty_pps,
410 xsks[i]->ring_stats.rx_fill_empty_npkts);
411 printf(fmt, "tx ring empty", tx_empty_pps,
412 xsks[i]->ring_stats.tx_empty_npkts);
413
414 xsks[i]->ring_stats.prev_rx_dropped_npkts =
415 xsks[i]->ring_stats.rx_dropped_npkts;
416 xsks[i]->ring_stats.prev_rx_invalid_npkts =
417 xsks[i]->ring_stats.rx_invalid_npkts;
418 xsks[i]->ring_stats.prev_tx_invalid_npkts =
419 xsks[i]->ring_stats.tx_invalid_npkts;
420 xsks[i]->ring_stats.prev_rx_full_npkts =
421 xsks[i]->ring_stats.rx_full_npkts;
422 xsks[i]->ring_stats.prev_rx_fill_empty_npkts =
423 xsks[i]->ring_stats.rx_fill_empty_npkts;
424 xsks[i]->ring_stats.prev_tx_empty_npkts =
425 xsks[i]->ring_stats.tx_empty_npkts;
426 } else {
427 printf("%-15s\n", "Error retrieving extra stats");
428 }
429 }
430 }
431
432 if (opt_app_stats)
433 dump_app_stats(dt);
434 if (irq_no)
435 dump_driver_stats(dt);
436}
437
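
The rates printed above are simple deltas scaled to per-second values: dt is measured in nanoseconds by get_nsecs(), so each counter difference is multiplied by 1e9 and divided by dt. A sketch of the arithmetic pulled out into a helper (the helper itself is not part of the sample):

	static double per_second(unsigned long now, unsigned long prev, long dt_ns)
	{
		return (now - prev) * 1000000000. / dt_ns;
	}
	/* e.g. 2,000,000 new packets over dt_ns = 2e9 -> 1,000,000 pps */
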
438static bool is_benchmark_done(void)
439{
440 if (opt_duration > 0) {
441 unsigned long dt = (get_nsecs() - start_time);
442
443 if (dt >= opt_duration)
444 benchmark_done = true;
445 }
446 return benchmark_done;
447}
448
449static void *poller(void *arg)
450{
451 (void)arg;
452 while (!is_benchmark_done()) {
453 sleep(opt_interval);
454 dump_stats();
455 }
456
457 return NULL;
458}
459
460static void remove_xdp_program(void)
461{
462 u32 curr_prog_id = 0;
463
464 if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) {
465 printf("bpf_get_link_xdp_id failed\n");
466 exit(EXIT_FAILURE);
467 }
468 if (prog_id == curr_prog_id)
469 bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
470 else if (!curr_prog_id)
471 printf("couldn't find a prog id on a given interface\n");
472 else
473 printf("program on interface changed, not removing\n");
474}
475
476static void int_exit(int sig)
477{
478 benchmark_done = true;
479}
480
481static void xdpsock_cleanup(void)
482{
483 struct xsk_umem *umem = xsks[0]->umem->umem;
484 int i;
485
486 dump_stats();
487 for (i = 0; i < num_socks; i++)
488 xsk_socket__delete(xsks[i]->xsk);
489 (void)xsk_umem__delete(umem);
490 remove_xdp_program();
491}
492
493static void __exit_with_error(int error, const char *file, const char *func,
494 int line)
495{
496 fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func,
497 line, error, strerror(error));
498 dump_stats();
499 remove_xdp_program();
500 exit(EXIT_FAILURE);
501}
502
503#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \
504 __LINE__)
505static void swap_mac_addresses(void *data)
506{
507 struct ether_header *eth = (struct ether_header *)data;
508 struct ether_addr *src_addr = (struct ether_addr *)&eth->ether_shost;
509 struct ether_addr *dst_addr = (struct ether_addr *)&eth->ether_dhost;
510 struct ether_addr tmp;
511
512 tmp = *src_addr;
513 *src_addr = *dst_addr;
514 *dst_addr = tmp;
515}
516
517static void hex_dump(void *pkt, size_t length, u64 addr)
518{
519 const unsigned char *address = (unsigned char *)pkt;
520 const unsigned char *line = address;
521 size_t line_size = 32;
522 unsigned char c;
523 char buf[32];
524 int i = 0;
525
526 if (!DEBUG_HEXDUMP)
527 return;
528
529 sprintf(buf, "addr=%llu", addr);
530 printf("length = %zu\n", length);
531 printf("%s | ", buf);
532 while (length-- > 0) {
533 printf("%02X ", *address++);
534 if (!(++i % line_size) || (length == 0 && i % line_size)) {
535 if (length == 0) {
536 while (i++ % line_size)
537 printf("__ ");
538 }
539 printf(" | "); /* right close */
540 while (line < address) {
541 c = *line++;
542 printf("%c", (c < 33 || c == 255) ? 0x2E : c);
543 }
544 printf("\n");
545 if (length > 0)
546 printf("%s | ", buf);
547 }
548 }
549 printf("\n");
550}
551
552static void *memset32_htonl(void *dest, u32 val, u32 size)
553{
554 u32 *ptr = (u32 *)dest;
555 int i;
556
557 val = htonl(val);
558
559 for (i = 0; i < (size & (~0x3)); i += 4)
560 ptr[i >> 2] = val;
561
562 for (; i < size; i++)
563 ((char *)dest)[i] = ((char *)&val)[i & 3];
564
565 return dest;
566}
567
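
memset32_htonl() fills whole 32-bit words with the pattern converted to network byte order, then finishes any unaligned tail byte by byte, so the buffer contents are the same on any host. A sketch of the effect:

	u8 buf[8];

	memset32_htonl(buf, 0x12345678, sizeof(buf));
	/* buf now holds: 12 34 56 78 12 34 56 78, regardless of endianness */
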
568/*
569 * This function code has been taken from
570 * Linux kernel lib/checksum.c
571 */
572static inline unsigned short from32to16(unsigned int x)
573{
574 /* add up 16-bit and 16-bit for 16+c bit */
575 x = (x & 0xffff) + (x >> 16);
576 /* add up carry.. */
577 x = (x & 0xffff) + (x >> 16);
578 return x;
579}
580
581/*
582 * This function code has been taken from
583 * Linux kernel lib/checksum.c
584 */
585static unsigned int do_csum(const unsigned char *buff, int len)
586{
587 unsigned int result = 0;
588 int odd;
589
590 if (len <= 0)
591 goto out;
592 odd = 1 & (unsigned long)buff;
593 if (odd) {
594#ifdef __LITTLE_ENDIAN
595 result += (*buff << 8);
596#else
597 result = *buff;
598#endif
599 len--;
600 buff++;
601 }
602 if (len >= 2) {
603 if (2 & (unsigned long)buff) {
604 result += *(unsigned short *)buff;
605 len -= 2;
606 buff += 2;
607 }
608 if (len >= 4) {
609 const unsigned char *end = buff +
610 ((unsigned int)len & ~3);
611 unsigned int carry = 0;
612
613 do {
614 unsigned int w = *(unsigned int *)buff;
615
616 buff += 4;
617 result += carry;
618 result += w;
619 carry = (w > result);
620 } while (buff < end);
621 result += carry;
622 result = (result & 0xffff) + (result >> 16);
623 }
624 if (len & 2) {
625 result += *(unsigned short *)buff;
626 buff += 2;
627 }
628 }
629 if (len & 1)
630#ifdef __LITTLE_ENDIAN
631 result += *buff;
632#else
633 result += (*buff << 8);
634#endif
635 result = from32to16(result);
636 if (odd)
637 result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
638out:
639 return result;
640}
641
642__sum16 ip_fast_csum(const void *iph, unsigned int ihl);
643
644/*
645 * This is a version of ip_compute_csum() optimized for IP headers,
646 * which always checksum on 4 octet boundaries.
647 * This function code has been taken from
648 * Linux kernel lib/checksum.c
649 */
650__sum16 ip_fast_csum(const void *iph, unsigned int ihl)
651{
652 return (__force __sum16)~do_csum(iph, ihl * 4);
653}
654
655/*
656 * Fold a partial checksum
657 * This function code has been taken from
658 * Linux kernel include/asm-generic/checksum.h
659 */
660static inline __sum16 csum_fold(__wsum csum)
661{
662 u32 sum = (__force u32)csum;
663
664 sum = (sum & 0xffff) + (sum >> 16);
665 sum = (sum & 0xffff) + (sum >> 16);
666 return (__force __sum16)~sum;
667}
668
669/*
670 * This function code has been taken from
671 * Linux kernel lib/checksum.c
672 */
673static inline u32 from64to32(u64 x)
674{
675 /* add up 32-bit and 32-bit for 32+c bit */
676 x = (x & 0xffffffff) + (x >> 32);
677 /* add up carry.. */
678 x = (x & 0xffffffff) + (x >> 32);
679 return (u32)x;
680}
681
682__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
683 __u32 len, __u8 proto, __wsum sum);
684
685/*
686 * This function code has been taken from
687 * Linux kernel lib/checksum.c
688 */
689__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
690 __u32 len, __u8 proto, __wsum sum)
691{
692 unsigned long long s = (__force u32)sum;
693
694 s += (__force u32)saddr;
695 s += (__force u32)daddr;
696#ifdef __BIG_ENDIAN__
697 s += proto + len;
698#else
699 s += (proto + len) << 8;
700#endif
701 return (__force __wsum)from64to32(s);
702}
703
704/*
705 * This function has been taken from
706 * Linux kernel include/asm-generic/checksum.h
707 */
708static inline __sum16
709csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
710 __u8 proto, __wsum sum)
711{
712 return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
713}
714
715static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len,
716 u8 proto, u16 *udp_pkt)
717{
718 u32 csum = 0;
719 u32 cnt = 0;
720
721 /* udp hdr and data */
722 for (; cnt < len; cnt += 2)
723 csum += udp_pkt[cnt >> 1];
724
725 return csum_tcpudp_magic(saddr, daddr, len, proto, csum);
726}
727
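
csum_fold() reduces a 32-bit partial sum to the final 16-bit one's-complement checksum by adding the high half into the low half twice and complementing. A worked example:

	u32 sum = 0x0002B1C9;

	sum = (sum & 0xffff) + (sum >> 16);	/* 0xB1C9 + 0x2 = 0xB1CB */
	sum = (sum & 0xffff) + (sum >> 16);	/* no carry left: 0xB1CB */
	/* final checksum: (u16)~sum == 0x4E34 */
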
728#define ETH_FCS_SIZE 4
729
730#define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \
731 sizeof(struct udphdr))
732
733#define PKT_SIZE (opt_pkt_size - ETH_FCS_SIZE)
734#define IP_PKT_SIZE (PKT_SIZE - sizeof(struct ethhdr))
735#define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr))
736#define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr))
737
738static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE];
739
740static void gen_eth_hdr_data(void)
741{
742 struct udphdr *udp_hdr = (struct udphdr *)(pkt_data +
743 sizeof(struct ethhdr) +
744 sizeof(struct iphdr));
745 struct iphdr *ip_hdr = (struct iphdr *)(pkt_data +
746 sizeof(struct ethhdr));
747 struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data;
748
749 /* ethernet header */
750 memcpy(eth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN);
751 memcpy(eth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN);
752 eth_hdr->h_proto = htons(ETH_P_IP);
753
754 /* IP header */
755 ip_hdr->version = IPVERSION;
756 ip_hdr->ihl = 0x5; /* 20 byte header */
757 ip_hdr->tos = 0x0;
758 ip_hdr->tot_len = htons(IP_PKT_SIZE);
759 ip_hdr->id = 0;
760 ip_hdr->frag_off = 0;
761 ip_hdr->ttl = IPDEFTTL;
762 ip_hdr->protocol = IPPROTO_UDP;
763 ip_hdr->saddr = htonl(0x0a0a0a10);
764 ip_hdr->daddr = htonl(0x0a0a0a20);
765
766 /* IP header checksum */
767 ip_hdr->check = 0;
768 ip_hdr->check = ip_fast_csum((const void *)ip_hdr, ip_hdr->ihl);
769
770 /* UDP header */
771 udp_hdr->source = htons(0x1000);
772 udp_hdr->dest = htons(0x1000);
773 udp_hdr->len = htons(UDP_PKT_SIZE);
774
775 /* UDP data */
776 memset32_htonl(pkt_data + PKT_HDR_SIZE, opt_pkt_fill_pattern,
777 UDP_PKT_DATA_SIZE);
778
779 /* UDP header checksum */
780 udp_hdr->check = 0;
781 udp_hdr->check = udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE,
782 IPPROTO_UDP, (u16 *)udp_hdr);
783}
784
785static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr)
786{
787 memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data,
788 PKT_SIZE);
789}
790
791static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
792{
793 struct xsk_umem_info *umem;
794 struct xsk_umem_config cfg = {
795 /* We recommend that you set the fill ring size >= HW RX ring size +
796 * AF_XDP RX ring size. Make sure you fill up the fill ring
797 * with buffers at regular intervals; with this setting you will
798 * avoid allocation failures in the driver. These are usually quite
799 * expensive since drivers have not been written to assume that
800 * allocation failures are common. For regular sockets, kernel
801 * allocated memory is used that only runs out in OOM situations
802 * that should be rare.
803 */
804 .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2,
805 .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
806 .frame_size = opt_xsk_frame_size,
807 .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
808 .flags = opt_umem_flags
809 };
810 int ret;
811
812 umem = calloc(1, sizeof(*umem));
813 if (!umem)
814 exit_with_error(errno);
815
816 ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
817 &cfg);
818 if (ret)
819 exit_with_error(-ret);
820
821 umem->buffer = buffer;
822 return umem;
823}
824
825static void xsk_populate_fill_ring(struct xsk_umem_info *umem)
826{
827 int ret, i;
828 u32 idx;
829
830 ret = xsk_ring_prod__reserve(&umem->fq,
831 XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, &idx);
832 if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS * 2)
833 exit_with_error(-ret);
834 for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS * 2; i++)
835 *xsk_ring_prod__fill_addr(&umem->fq, idx++) =
836 i * opt_xsk_frame_size;
837 xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS * 2);
838}
839
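
The umem is NUM_FRAMES frames laid out back to back, so the address of frame i is simply i * opt_xsk_frame_size; with the defaults (4096 frames of 4 KiB) that is a 16 MiB area, and the loop above pre-loads the fill ring with all 2 * 2048 = 4096 frame addresses. A sketch of the address arithmetic:

	static u64 frame_addr(unsigned int i)
	{
		return (u64)i * opt_xsk_frame_size;	/* frame i's umem offset */
	}
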
840static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem,
841 bool rx, bool tx)
842{
843 struct xsk_socket_config cfg;
844 struct xsk_socket_info *xsk;
845 struct xsk_ring_cons *rxr;
846 struct xsk_ring_prod *txr;
847 int ret;
848
849 xsk = calloc(1, sizeof(*xsk));
850 if (!xsk)
851 exit_with_error(errno);
852
853 xsk->umem = umem;
854 cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
855 cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
856 if (opt_num_xsks > 1)
857 cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD;
858 else
859 cfg.libbpf_flags = 0;
860 cfg.xdp_flags = opt_xdp_flags;
861 cfg.bind_flags = opt_xdp_bind_flags;
862
863 rxr = rx ? &xsk->rx : NULL;
864 txr = tx ? &xsk->tx : NULL;
865 ret = xsk_socket__create(&xsk->xsk, opt_if, opt_queue, umem->umem,
866 rxr, txr, &cfg);
867 if (ret)
868 exit_with_error(-ret);
869
870 ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags);
871 if (ret)
872 exit_with_error(-ret);
873
874 xsk->app_stats.rx_empty_polls = 0;
875 xsk->app_stats.fill_fail_polls = 0;
876 xsk->app_stats.copy_tx_sendtos = 0;
877 xsk->app_stats.tx_wakeup_sendtos = 0;
878 xsk->app_stats.opt_polls = 0;
879 xsk->app_stats.prev_rx_empty_polls = 0;
880 xsk->app_stats.prev_fill_fail_polls = 0;
881 xsk->app_stats.prev_copy_tx_sendtos = 0;
882 xsk->app_stats.prev_tx_wakeup_sendtos = 0;
883 xsk->app_stats.prev_opt_polls = 0;
884
885 return xsk;
886}
887
888static struct option long_options[] = {
889 {"rxdrop", no_argument, 0, 'r'},
890 {"txonly", no_argument, 0, 't'},
891 {"l2fwd", no_argument, 0, 'l'},
892 {"interface", required_argument, 0, 'i'},
893 {"queue", required_argument, 0, 'q'},
894 {"poll", no_argument, 0, 'p'},
895 {"xdp-skb", no_argument, 0, 'S'},
896 {"xdp-native", no_argument, 0, 'N'},
897 {"interval", required_argument, 0, 'n'},
898 {"zero-copy", no_argument, 0, 'z'},
899 {"copy", no_argument, 0, 'c'},
900 {"frame-size", required_argument, 0, 'f'},
901 {"no-need-wakeup", no_argument, 0, 'm'},
902 {"unaligned", no_argument, 0, 'u'},
903 {"shared-umem", no_argument, 0, 'M'},
904 {"force", no_argument, 0, 'F'},
905 {"duration", required_argument, 0, 'd'},
906 {"batch-size", required_argument, 0, 'b'},
907 {"tx-pkt-count", required_argument, 0, 'C'},
908 {"tx-pkt-size", required_argument, 0, 's'},
909 {"tx-pkt-pattern", required_argument, 0, 'P'},
910 {"extra-stats", no_argument, 0, 'x'},
911 {"quiet", no_argument, 0, 'Q'},
912 {"app-stats", no_argument, 0, 'a'},
913 {"irq-string", no_argument, 0, 'I'},
914 {0, 0, 0, 0}
915};
916
917static void usage(const char *prog)
918{
919 const char *str =
920 " Usage: %s [OPTIONS]\n"
921 " Options:\n"
922 " -r, --rxdrop Discard all incoming packets (default)\n"
923 " -t, --txonly Only send packets\n"
924 " -l, --l2fwd MAC swap L2 forwarding\n"
925 " -i, --interface=n Run on interface n\n"
926 " -q, --queue=n Use queue n (default 0)\n"
927 " -p, --poll Use poll syscall\n"
928 " -S, --xdp-skb=n Use XDP skb-mod\n"
929 " -N, --xdp-native=n Enforce XDP native mode\n"
930 " -n, --interval=n Specify statistics update interval (default 1 sec).\n"
931 " -z, --zero-copy Force zero-copy mode.\n"
932 " -c, --copy Force copy mode.\n"
933 " -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n"
934 " -f, --frame-size=n Set the frame size (must be a power of two in aligned mode, default is %d).\n"
935 " -u, --unaligned Enable unaligned chunk placement\n"
936 " -M, --shared-umem Enable XDP_SHARED_UMEM\n"
937 " -F, --force Force loading the XDP prog\n"
938 " -d, --duration=n Duration in secs to run command.\n"
939 " Default: forever.\n"
940 " -b, --batch-size=n Batch size for sending or receiving\n"
941 " packets. Default: %d\n"
942 " -C, --tx-pkt-count=n Number of packets to send.\n"
943 " Default: Continuous packets.\n"
944 " -s, --tx-pkt-size=n Transmit packet size.\n"
945 " (Default: %d bytes)\n"
946 " Min size: %d, Max size %d.\n"
947 " -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n"
948 " -x, --extra-stats Display extra statistics.\n"
949 " -Q, --quiet Do not display any stats.\n"
950 " -a, --app-stats Display application (syscall) statistics.\n"
951 " -I, --irq-string Display driver interrupt statistics for interface associated with irq-string.\n"
952 "\n";
953 fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE,
954 opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE,
955 XSK_UMEM__DEFAULT_FRAME_SIZE, opt_pkt_fill_pattern);
956
957 exit(EXIT_FAILURE);
958}
959
960static void parse_command_line(int argc, char **argv)
961{
962 int option_index, c;
963
964 opterr = 0;
965
966 for (;;) {
967 c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:",
968 long_options, &option_index);
969 if (c == -1)
970 break;
971
972 switch (c) {
973 case 'r':
974 opt_bench = BENCH_RXDROP;
975 break;
976 case 't':
977 opt_bench = BENCH_TXONLY;
978 break;
979 case 'l':
980 opt_bench = BENCH_L2FWD;
981 break;
982 case 'i':
983 opt_if = optarg;
984 break;
985 case 'q':
986 opt_queue = atoi(optarg);
987 break;
988 case 'p':
989 opt_poll = 1;
990 break;
991 case 'S':
992 opt_xdp_flags |= XDP_FLAGS_SKB_MODE;
993 opt_xdp_bind_flags |= XDP_COPY;
994 break;
995 case 'N':
996 /* default, set below */
997 break;
998 case 'n':
999 opt_interval = atoi(optarg);
1000 break;
1001 case 'z':
1002 opt_xdp_bind_flags |= XDP_ZEROCOPY;
1003 break;
1004 case 'c':
1005 opt_xdp_bind_flags |= XDP_COPY;
1006 break;
1007 case 'u':
1008 opt_umem_flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG;
1009 opt_unaligned_chunks = 1;
1010 opt_mmap_flags = MAP_HUGETLB;
1011 break;
1012 case 'F':
1013 opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
1014 break;
1015 case 'f':
1016 opt_xsk_frame_size = atoi(optarg);
1017 break;
1018 case 'm':
1019 opt_need_wakeup = false;
1020 opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP;
1021 break;
1022 case 'M':
1023 opt_num_xsks = MAX_SOCKS;
1024 break;
1025 case 'd':
1026 opt_duration = atoi(optarg);
1027 opt_duration *= 1000000000;
1028 break;
1029 case 'b':
1030 opt_batch_size = atoi(optarg);
1031 break;
1032 case 'C':
1033 opt_pkt_count = atoi(optarg);
1034 break;
1035 case 's':
1036 opt_pkt_size = atoi(optarg);
1037 if (opt_pkt_size > (XSK_UMEM__DEFAULT_FRAME_SIZE) ||
1038 opt_pkt_size < MIN_PKT_SIZE) {
1039 fprintf(stderr,
1040 "ERROR: Invalid frame size %d\n",
1041 opt_pkt_size);
1042 usage(basename(argv[0]));
1043 }
1044 break;
1045 case 'P':
1046 opt_pkt_fill_pattern = strtol(optarg, NULL, 16);
1047 break;
1048 case 'x':
1049 opt_extra_stats = 1;
1050 break;
1051 case 'Q':
1052 opt_quiet = 1;
1053 break;
1054 case 'a':
1055 opt_app_stats = 1;
1056 break;
1057 case 'I':
1058 opt_irq_str = optarg;
1059 if (get_interrupt_number())
1060 irqs_at_init = get_irqs();
1061 if (irqs_at_init < 0) {
1062 fprintf(stderr, "ERROR: Failed to get irqs for %s\n", opt_irq_str);
1063 usage(basename(argv[0]));
1064 }
1065
1066 break;
1067 default:
1068 usage(basename(argv[0]));
1069 }
1070 }
1071
1072 if (!(opt_xdp_flags & XDP_FLAGS_SKB_MODE))
1073 opt_xdp_flags |= XDP_FLAGS_DRV_MODE;
1074
1075 opt_ifindex = if_nametoindex(opt_if);
1076 if (!opt_ifindex) {
1077 fprintf(stderr, "ERROR: interface \"%s\" does not exist\n",
1078 opt_if);
1079 usage(basename(argv[0]));
1080 }
1081
1082 if ((opt_xsk_frame_size & (opt_xsk_frame_size - 1)) &&
1083 !opt_unaligned_chunks) {
1084 fprintf(stderr, "--frame-size=%d is not a power of two\n",
1085 opt_xsk_frame_size);
1086 usage(basename(argv[0]));
1087 }
1088}
1089
1090static void kick_tx(struct xsk_socket_info *xsk)
1091{
1092 int ret;
1093
1094 ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
1095 if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN ||
1096 errno == EBUSY || errno == ENETDOWN)
1097 return;
1098 exit_with_error(errno);
1099}
1100
1101static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
1102 struct pollfd *fds)
1103{
1104 struct xsk_umem_info *umem = xsk->umem;
1105 u32 idx_cq = 0, idx_fq = 0;
1106 unsigned int rcvd;
1107 size_t ndescs;
1108
1109 if (!xsk->outstanding_tx)
1110 return;
1111
1112 /* In copy mode, Tx is driven by a syscall so we need to use e.g. sendto() to
1113 * really send the packets. In zero-copy mode we do not have to do this, since Tx
1114 * is driven by the NAPI loop. So as an optimization, we do not have to call
1115 * sendto() all the time in zero-copy mode for l2fwd.
1116 */
1117 if (opt_xdp_bind_flags & XDP_COPY) {
1118 xsk->app_stats.copy_tx_sendtos++;
1119 kick_tx(xsk);
1120 }
1121
1122 ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size :
1123 xsk->outstanding_tx;
1124
1125 /* re-add completed Tx buffers */
1126 rcvd = xsk_ring_cons__peek(&umem->cq, ndescs, &idx_cq);
1127 if (rcvd > 0) {
1128 unsigned int i;
1129 int ret;
1130
1131 ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
1132 while (ret != rcvd) {
1133 if (ret < 0)
1134 exit_with_error(-ret);
1135 if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
1136 xsk->app_stats.fill_fail_polls++;
1137 ret = poll(fds, num_socks, opt_timeout);
1138 }
1139 ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
1140 }
1141
1142 for (i = 0; i < rcvd; i++)
1143 *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) =
1144 *xsk_ring_cons__comp_addr(&umem->cq, idx_cq++);
1145
1146 xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
1147 xsk_ring_cons__release(&xsk->umem->cq, rcvd);
1148 xsk->outstanding_tx -= rcvd;
1149 xsk->ring_stats.tx_npkts += rcvd;
1150 }
1151}
1152
1153static inline void complete_tx_only(struct xsk_socket_info *xsk,
1154 int batch_size)
1155{
1156 unsigned int rcvd;
1157 u32 idx;
1158
1159 if (!xsk->outstanding_tx)
1160 return;
1161
1162 if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) {
1163 xsk->app_stats.tx_wakeup_sendtos++;
1164 kick_tx(xsk);
1165 }
1166
1167 rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx);
1168 if (rcvd > 0) {
1169 xsk_ring_cons__release(&xsk->umem->cq, rcvd);
1170 xsk->outstanding_tx -= rcvd;
1171 xsk->ring_stats.tx_npkts += rcvd;
1172 }
1173}
1174
1175static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
1176{
1177 unsigned int rcvd, i;
1178 u32 idx_rx = 0, idx_fq = 0;
1179 int ret;
1180
1181 rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx);
1182 if (!rcvd) {
1183 if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
1184 xsk->app_stats.rx_empty_polls++;
1185 ret = poll(fds, num_socks, opt_timeout);
1186 }
1187 return;
1188 }
1189
1190 ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
1191 while (ret != rcvd) {
1192 if (ret < 0)
1193 exit_with_error(-ret);
1194 if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
1195 xsk->app_stats.fill_fail_polls++;
1196 ret = poll(fds, num_socks, opt_timeout);
1197 }
1198 ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
1199 }
1200
1201 for (i = 0; i < rcvd; i++) {
1202 u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
1203 u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
1204 u64 orig = xsk_umem__extract_addr(addr);
1205
1206 addr = xsk_umem__add_offset_to_addr(addr);
1207 char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
1208
1209 hex_dump(pkt, len, addr);
1210 *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig;
1211 }
1212
1213 xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
1214 xsk_ring_cons__release(&xsk->rx, rcvd);
1215 xsk->ring_stats.rx_npkts += rcvd;
1216}
1217
1218static void rx_drop_all(void)
1219{
1220 struct pollfd fds[MAX_SOCKS] = {};
1221 int i, ret;
1222
1223 for (i = 0; i < num_socks; i++) {
1224 fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
1225 fds[i].events = POLLIN;
1226 }
1227
1228 for (;;) {
1229 if (opt_poll) {
1230 for (i = 0; i < num_socks; i++)
1231 xsks[i]->app_stats.opt_polls++;
1232 ret = poll(fds, num_socks, opt_timeout);
1233 if (ret <= 0)
1234 continue;
1235 }
1236
1237 for (i = 0; i < num_socks; i++)
1238 rx_drop(xsks[i], fds);
1239
1240 if (benchmark_done)
1241 break;
1242 }
1243}
1244
1245static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size)
1246{
1247 u32 idx;
1248 unsigned int i;
1249
1250 while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) <
1251 batch_size) {
1252 complete_tx_only(xsk, batch_size);
1253 if (benchmark_done)
1254 return;
1255 }
1256
1257 for (i = 0; i < batch_size; i++) {
1258 struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx,
1259 idx + i);
1260 tx_desc->addr = (*frame_nb + i) * opt_xsk_frame_size;
1261 tx_desc->len = PKT_SIZE;
1262 }
1263
1264 xsk_ring_prod__submit(&xsk->tx, batch_size);
1265 xsk->outstanding_tx += batch_size;
1266 *frame_nb += batch_size;
1267 *frame_nb %= NUM_FRAMES;
1268 complete_tx_only(xsk, batch_size);
1269}
1270
1271static inline int get_batch_size(int pkt_cnt)
1272{
1273 if (!opt_pkt_count)
1274 return opt_batch_size;
1275
1276 if (pkt_cnt + opt_batch_size <= opt_pkt_count)
1277 return opt_batch_size;
1278
1279 return opt_pkt_count - pkt_cnt;
1280}
1281
1282static void complete_tx_only_all(void)
1283{
1284 bool pending;
1285 int i;
1286
1287 do {
1288 pending = false;
1289 for (i = 0; i < num_socks; i++) {
1290 if (xsks[i]->outstanding_tx) {
1291 complete_tx_only(xsks[i], opt_batch_size);
1292 pending = !!xsks[i]->outstanding_tx;
1293 }
1294 }
1295 } while (pending);
1296}
1297
1298static void tx_only_all(void)
1299{
1300 struct pollfd fds[MAX_SOCKS] = {};
1301 u32 frame_nb[MAX_SOCKS] = {};
1302 int pkt_cnt = 0;
1303 int i, ret;
1304
1305 for (i = 0; i < num_socks; i++) {
1306 fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
1307 fds[i].events = POLLOUT;
1308 }
1309
1310 while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) {
1311 int batch_size = get_batch_size(pkt_cnt);
1312
1313 if (opt_poll) {
1314 for (i = 0; i < num_socks; i++)
1315 xsks[i]->app_stats.opt_polls++;
1316 ret = poll(fds, num_socks, opt_timeout);
1317 if (ret <= 0)
1318 continue;
1319
1320 if (!(fds[0].revents & POLLOUT))
1321 continue;
1322 }
1323
1324 for (i = 0; i < num_socks; i++)
1325 tx_only(xsks[i], &frame_nb[i], batch_size);
1326
1327 pkt_cnt += batch_size;
1328
1329 if (benchmark_done)
1330 break;
1331 }
1332
1333 if (opt_pkt_count)
1334 complete_tx_only_all();
1335}
1336
1337static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
1338{
1339 unsigned int rcvd, i;
1340 u32 idx_rx = 0, idx_tx = 0;
1341 int ret;
1342
1343 complete_tx_l2fwd(xsk, fds);
1344
1345 rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx);
1346 if (!rcvd) {
1347 if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
1348 xsk->app_stats.rx_empty_polls++;
1349 ret = poll(fds, num_socks, opt_timeout);
1350 }
1351 return;
1352 }
1353
1354 ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
1355 while (ret != rcvd) {
1356 if (ret < 0)
1357 exit_with_error(-ret);
1358 complete_tx_l2fwd(xsk, fds);
1359 if (xsk_ring_prod__needs_wakeup(&xsk->tx)) {
1360 xsk->app_stats.tx_wakeup_sendtos++;
1361 kick_tx(xsk);
1362 }
1363 ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
1364 }
1365
1366 for (i = 0; i < rcvd; i++) {
1367 u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
1368 u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
1369 u64 orig = addr;
1370
1371 addr = xsk_umem__add_offset_to_addr(addr);
1372 char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
1373
1374 swap_mac_addresses(pkt);
1375
1376 hex_dump(pkt, len, addr);
1377 xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = orig;
1378 xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
1379 }
1380
1381 xsk_ring_prod__submit(&xsk->tx, rcvd);
1382 xsk_ring_cons__release(&xsk->rx, rcvd);
1383
1384 xsk->ring_stats.rx_npkts += rcvd;
1385 xsk->outstanding_tx += rcvd;
1386}
1387
1388static void l2fwd_all(void)
1389{
1390 struct pollfd fds[MAX_SOCKS] = {};
1391 int i, ret;
1392
1393 for (i = 0; i < num_socks; i++) {
1394 fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
1395 fds[i].events = POLLOUT | POLLIN;
1396 }
1397
1398 for (;;) {
1399 if (opt_poll) {
1400 for (i = 0; i < num_socks; i++)
1401 xsks[i]->app_stats.opt_polls++;
1402 ret = poll(fds, num_socks, opt_timeout);
1403 if (ret <= 0)
1404 continue;
1405 }
1406
1407 for (i = 0; i < num_socks; i++)
1408 l2fwd(xsks[i], fds);
1409
1410 if (benchmark_done)
1411 break;
1412 }
1413}
1414
1415static void load_xdp_program(char **argv, struct bpf_object **obj)
1416{
1417 struct bpf_prog_load_attr prog_load_attr = {
1418 .prog_type = BPF_PROG_TYPE_XDP,
1419 };
1420 char xdp_filename[256];
1421 int prog_fd;
1422
1423 snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]);
1424 prog_load_attr.file = xdp_filename;
1425
1426 if (bpf_prog_load_xattr(&prog_load_attr, obj, &prog_fd))
1427 exit(EXIT_FAILURE);
1428 if (prog_fd < 0) {
1429 fprintf(stderr, "ERROR: no program found: %s\n",
1430 strerror(-prog_fd));
1431 exit(EXIT_FAILURE);
1432 }
1433
1434 if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) {
1435 fprintf(stderr, "ERROR: link set xdp fd failed\n");
1436 exit(EXIT_FAILURE);
1437 }
1438}
1439
1440static void enter_xsks_into_map(struct bpf_object *obj)
1441{
1442 struct bpf_map *map;
1443 int i, xsks_map;
1444
1445 map = bpf_object__find_map_by_name(obj, "xsks_map");
1446 xsks_map = bpf_map__fd(map);
1447 if (xsks_map < 0) {
1448 fprintf(stderr, "ERROR: no xsks map found: %s\n",
1449 strerror(-xsks_map));
1450 exit(EXIT_FAILURE);
1451 }
1452
1453 for (i = 0; i < num_socks; i++) {
1454 int fd = xsk_socket__fd(xsks[i]->xsk);
1455 int key, ret;
1456
1457 key = i;
1458 ret = bpf_map_update_elem(xsks_map, &key, &fd, 0);
1459 if (ret) {
1460 fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i);
1461 exit(EXIT_FAILURE);
1462 }
1463 }
1464}
1465
1466int main(int argc, char **argv)
1467{
1468 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
1469 bool rx = false, tx = false;
1470 struct xsk_umem_info *umem;
1471 struct bpf_object *obj;
1472 pthread_t pt;
1473 int i, ret;
1474 void *bufs;
1475
1476 parse_command_line(argc, argv);
1477
1478 if (setrlimit(RLIMIT_MEMLOCK, &r)) {
1479 fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n",
1480 strerror(errno));
1481 exit(EXIT_FAILURE);
1482 }
1483
1484 if (opt_num_xsks > 1)
1485 load_xdp_program(argv, &obj);
1486
1487 /* Reserve memory for the umem. Use hugepages in unaligned chunk mode. */
1488 bufs = mmap(NULL, NUM_FRAMES * opt_xsk_frame_size,
1489 PROT_READ | PROT_WRITE,
1490 MAP_PRIVATE | MAP_ANONYMOUS | opt_mmap_flags, -1, 0);
1491 if (bufs == MAP_FAILED) {
1492 printf("ERROR: mmap failed\n");
1493 exit(EXIT_FAILURE);
1494 }
1495
1496 /* Create sockets... */
1497 umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size);
1498 if (opt_bench == BENCH_RXDROP || opt_bench == BENCH_L2FWD) {
1499 rx = true;
1500 xsk_populate_fill_ring(umem);
1501 }
1502 if (opt_bench == BENCH_L2FWD || opt_bench == BENCH_TXONLY)
1503 tx = true;
1504 for (i = 0; i < opt_num_xsks; i++)
1505 xsks[num_socks++] = xsk_configure_socket(umem, rx, tx);
1506
1507 if (opt_bench == BENCH_TXONLY) {
1508 gen_eth_hdr_data();
1509
1510 for (i = 0; i < NUM_FRAMES; i++)
1511 gen_eth_frame(umem, i * opt_xsk_frame_size);
1512 }
1513
1514 if (opt_num_xsks > 1 && opt_bench != BENCH_TXONLY)
1515 enter_xsks_into_map(obj);
1516
1517 signal(SIGINT, int_exit);
1518 signal(SIGTERM, int_exit);
1519 signal(SIGABRT, int_exit);
1520
1521 setlocale(LC_ALL, "");
1522
1523 prev_time = get_nsecs();
1524 start_time = prev_time;
1525
1526 if (!opt_quiet) {
1527 ret = pthread_create(&pt, NULL, poller, NULL);
1528 if (ret)
1529 exit_with_error(ret);
1530 }
1531
1532
1533 if (opt_bench == BENCH_RXDROP)
1534 rx_drop_all();
1535 else if (opt_bench == BENCH_TXONLY)
1536 tx_only_all();
1537 else
1538 l2fwd_all();
1539
1540 benchmark_done = true;
1541
1542 if (!opt_quiet)
1543 pthread_join(pt, NULL);
1544
1545 xdpsock_cleanup();
1546
1547 munmap(bufs, NUM_FRAMES * opt_xsk_frame_size);
1548
1549 return 0;
1550}
diff --git a/samples/bpf/xsk_fwd.c b/samples/bpf/xsk_fwd.c
new file mode 100644
index 000000000..1cd97c84c
--- /dev/null
+++ b/samples/bpf/xsk_fwd.c
@@ -0,0 +1,1085 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Copyright(c) 2020 Intel Corporation. */
3
4#define _GNU_SOURCE
5#include <poll.h>
6#include <pthread.h>
7#include <signal.h>
8#include <sched.h>
9#include <stdio.h>
10#include <stdlib.h>
11#include <string.h>
12#include <sys/mman.h>
13#include <sys/resource.h>
14#include <sys/socket.h>
15#include <sys/types.h>
16#include <time.h>
17#include <unistd.h>
18#include <getopt.h>
19#include <netinet/ether.h>
20#include <net/if.h>
21
22#include <linux/bpf.h>
23#include <linux/if_link.h>
24#include <linux/if_xdp.h>
25
26#include <bpf/libbpf.h>
27#include <bpf/xsk.h>
28#include <bpf/bpf.h>
29
30#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
31
32typedef __u64 u64;
33typedef __u32 u32;
34typedef __u16 u16;
35typedef __u8 u8;
36
37/* This program illustrates packet forwarding between multiple AF_XDP
38 * sockets in a multi-threaded environment. All threads share a common
39 * buffer pool, with each socket having its own private buffer cache.
40 *
41 * Example 1: Single thread handling two sockets. The packets received by socket
42 * A (interface IFA, queue QA) are forwarded to socket B (interface IFB, queue
43 * QB), while the packets received by socket B are forwarded to socket A. The
44 * thread is running on CPU core X:
45 *
46 * ./xsk_fwd -i IFA -q QA -i IFB -q QB -c X
47 *
48 * Example 2: Two threads, each handling two sockets. The thread running on CPU
49 * core X forwards all the packets received by socket A to socket B, and all the
50 * packets received by socket B to socket A. The thread running on CPU core Y is
51 * performing the same packet forwarding between sockets C and D:
52 *
53 * ./xsk_fwd -i IFA -q QA -i IFB -q QB -i IFC -q QC -i IFD -q QD
54 * -c CX -c CY
55 */
56
57/*
58 * Buffer pool and buffer cache
59 *
60 * For packet forwarding, the packet buffers are typically allocated from the
61 * pool for packet reception and freed back to the pool for further reuse once
62 * the packet transmission is completed.
63 *
64 * The buffer pool is shared between multiple threads. In order to minimize the
65 * access latency to the shared buffer pool, each thread creates one (or
66 * several) buffer caches, which, unlike the buffer pool, are private to the
67 * thread that creates them and therefore cannot be shared with other threads.
68 * The access to the shared pool is only needed either (A) when the cache gets
69 * empty due to repeated buffer allocations and it needs to be replenished from
70 * the pool, or (B) when the cache gets full due to repeated buffer frees and
71 * it needs to be flushed back to the pool.
72 *
73 * In a packet forwarding system, a packet received on any input port can
74 * potentially be transmitted on any output port, depending on the forwarding
75 * configuration. For AF_XDP sockets, for this to work with zero-copy of the
76 * packet buffers, it is required that the buffer pool memory fits into the
77 * UMEM area shared by all the sockets.
78 */
79
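/* A minimal lifecycle sketch of the pool and cache objects, using the
 * helpers and default parameters defined further down this file. It is
 * illustrative only and, like the example blocks elsewhere in samples,
 * kept out of the build; error handling is omitted.
 */
#if 0
static void bpool_bcache_lifecycle_sketch(void)
{
	struct bpool_params params;
	struct xsk_umem_config cfg;
	struct bpool *bp;
	struct bcache *bc;

	memcpy(&params, &bpool_params_default, sizeof(params));
	memcpy(&cfg, &umem_cfg_default, sizeof(cfg));

	bp = bpool_init(&params, &cfg);	/* shared: mmaps memory, creates the UMEM */
	bc = bcache_init(bp);		/* per thread: grabs two reserved slabs */

	/* ... run the forwarding fast path on this thread ... */

	bcache_free(bc);		/* returns the two reserved slabs */
	bpool_free(bp);			/* deletes the UMEM, unmaps the memory */
}
#endif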
80struct bpool_params {
81 u32 n_buffers;
82 u32 buffer_size;
83 int mmap_flags;
84
85 u32 n_users_max;
86 u32 n_buffers_per_slab;
87};
88
89/* This buffer pool implementation organizes the buffers into equally sized
90 * slabs of *n_buffers_per_slab*. Initially, there are *n_slabs* slabs in the
91 * pool that are completely filled with buffer pointers (full slabs).
92 *
93 * Each buffer cache has a slab for buffer allocation and a slab for buffer
94 * free, with both of these slabs initially empty. When the cache's allocation
95 * slab goes empty, it is swapped with one of the available full slabs from the
96 * pool, if any is available. When the cache's free slab goes full, it is
97 * swapped for one of the empty slabs from the pool, which is guaranteed to
98 * succeed.
99 *
100 * Partially filled slabs never get traded between the cache and the pool
101 * (except when the cache itself is destroyed), which enables fast operation
102 * through pointer swapping.
103 */
104struct bpool {
105 struct bpool_params params;
106 pthread_mutex_t lock;
107 void *addr;
108
109 u64 **slabs;
110 u64 **slabs_reserved;
111 u64 *buffers;
112 u64 *buffers_reserved;
113
114 u64 n_slabs;
115 u64 n_slabs_reserved;
116 u64 n_buffers;
117
118 u64 n_slabs_available;
119 u64 n_slabs_reserved_available;
120
121 struct xsk_umem_config umem_cfg;
122 struct xsk_ring_prod umem_fq;
123 struct xsk_ring_cons umem_cq;
124 struct xsk_umem *umem;
125};
126
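/* For the defaults used later in this file (n_buffers = 64 * 1024,
 * n_buffers_per_slab = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2), and
 * assuming libbpf's default ring size of 2048 descriptors, bpool_init()
 * below dimensions the pool as:
 *
 *	n_buffers_per_slab = 2048 * 2             = 4096
 *	n_slabs            = 64 * 1024 / 4096     = 16 full slabs
 *	n_slabs_reserved   = n_users_max(16) * 2  = 32 (two per cache)
 */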
127static struct bpool *
128bpool_init(struct bpool_params *params,
129 struct xsk_umem_config *umem_cfg)
130{
131 struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
132 u64 n_slabs, n_slabs_reserved, n_buffers, n_buffers_reserved;
133 u64 slabs_size, slabs_reserved_size;
134 u64 buffers_size, buffers_reserved_size;
135 u64 total_size, i;
136 struct bpool *bp;
137 u8 *p;
138 int status;
139
140 /* mmap prep. */
141 if (setrlimit(RLIMIT_MEMLOCK, &r))
142 return NULL;
143
144 /* bpool internals dimensioning. */
145 n_slabs = (params->n_buffers + params->n_buffers_per_slab - 1) /
146 params->n_buffers_per_slab;
147 n_slabs_reserved = params->n_users_max * 2;
148 n_buffers = n_slabs * params->n_buffers_per_slab;
149 n_buffers_reserved = n_slabs_reserved * params->n_buffers_per_slab;
150
151 slabs_size = n_slabs * sizeof(u64 *);
152 slabs_reserved_size = n_slabs_reserved * sizeof(u64 *);
153 buffers_size = n_buffers * sizeof(u64);
154 buffers_reserved_size = n_buffers_reserved * sizeof(u64);
155
156 total_size = sizeof(struct bpool) +
157 slabs_size + slabs_reserved_size +
158 buffers_size + buffers_reserved_size;
159
160 /* bpool memory allocation. */
161 p = calloc(total_size, sizeof(u8));
162 if (!p)
163 return NULL;
164
165 /* bpool memory initialization. */
166 bp = (struct bpool *)p;
167 memcpy(&bp->params, params, sizeof(*params));
168 bp->params.n_buffers = n_buffers;
169
170 bp->slabs = (u64 **)&p[sizeof(struct bpool)];
171 bp->slabs_reserved = (u64 **)&p[sizeof(struct bpool) +
172 slabs_size];
173 bp->buffers = (u64 *)&p[sizeof(struct bpool) +
174 slabs_size + slabs_reserved_size];
175 bp->buffers_reserved = (u64 *)&p[sizeof(struct bpool) +
176 slabs_size + slabs_reserved_size + buffers_size];
177
178 bp->n_slabs = n_slabs;
179 bp->n_slabs_reserved = n_slabs_reserved;
180 bp->n_buffers = n_buffers;
181
182 for (i = 0; i < n_slabs; i++)
183 bp->slabs[i] = &bp->buffers[i * params->n_buffers_per_slab];
184 bp->n_slabs_available = n_slabs;
185
186 for (i = 0; i < n_slabs_reserved; i++)
187 bp->slabs_reserved[i] = &bp->buffers_reserved[i *
188 params->n_buffers_per_slab];
189 bp->n_slabs_reserved_available = n_slabs_reserved;
190
191 for (i = 0; i < n_buffers; i++)
192 bp->buffers[i] = i * params->buffer_size;
193
194 /* lock. */
195 status = pthread_mutex_init(&bp->lock, NULL);
196 if (status) {
197 free(p);
198 return NULL;
199 }
200
201 /* mmap. */
202 bp->addr = mmap(NULL,
203 n_buffers * params->buffer_size,
204 PROT_READ | PROT_WRITE,
205 MAP_PRIVATE | MAP_ANONYMOUS | params->mmap_flags,
206 -1,
207 0);
208 if (bp->addr == MAP_FAILED) {
209 pthread_mutex_destroy(&bp->lock);
210 free(p);
211 return NULL;
212 }
213
214 /* umem. */
215 status = xsk_umem__create(&bp->umem,
216 bp->addr,
217 bp->params.n_buffers * bp->params.buffer_size,
218 &bp->umem_fq,
219 &bp->umem_cq,
220 umem_cfg);
221 if (status) {
222 munmap(bp->addr, bp->params.n_buffers * bp->params.buffer_size);
223 pthread_mutex_destroy(&bp->lock);
224 free(p);
225 return NULL;
226 }
227 memcpy(&bp->umem_cfg, umem_cfg, sizeof(*umem_cfg));
228
229 return bp;
230}
231
232static void
233bpool_free(struct bpool *bp)
234{
235 if (!bp)
236 return;
237
238 xsk_umem__delete(bp->umem);
239 munmap(bp->addr, bp->params.n_buffers * bp->params.buffer_size);
240 pthread_mutex_destroy(&bp->lock);
241 free(bp);
242}
243
244struct bcache {
245 struct bpool *bp;
246
247 u64 *slab_cons;
248 u64 *slab_prod;
249
250 u64 n_buffers_cons;
251 u64 n_buffers_prod;
252};
253
254static u32
255bcache_slab_size(struct bcache *bc)
256{
257 struct bpool *bp = bc->bp;
258
259 return bp->params.n_buffers_per_slab;
260}
261
262static struct bcache *
263bcache_init(struct bpool *bp)
264{
265 struct bcache *bc;
266
267 bc = calloc(1, sizeof(struct bcache));
268 if (!bc)
269 return NULL;
270
271 bc->bp = bp;
272 bc->n_buffers_cons = 0;
273 bc->n_buffers_prod = 0;
274
275 pthread_mutex_lock(&bp->lock);
276 if (bp->n_slabs_reserved_available == 0) {
277 pthread_mutex_unlock(&bp->lock);
278 free(bc);
279 return NULL;
280 }
281
282 bc->slab_cons = bp->slabs_reserved[bp->n_slabs_reserved_available - 1];
283 bc->slab_prod = bp->slabs_reserved[bp->n_slabs_reserved_available - 2];
284 bp->n_slabs_reserved_available -= 2;
285 pthread_mutex_unlock(&bp->lock);
286
287 return bc;
288}
289
290static void
291bcache_free(struct bcache *bc)
292{
293 struct bpool *bp;
294
295 if (!bc)
296 return;
297
298 /* In order to keep this example simple, the case of freeing any
299 * existing buffers from the cache back to the pool is ignored.
300 */
301
302 bp = bc->bp;
303 pthread_mutex_lock(&bp->lock);
304 bp->slabs_reserved[bp->n_slabs_reserved_available] = bc->slab_prod;
305 bp->slabs_reserved[bp->n_slabs_reserved_available + 1] = bc->slab_cons;
306 bp->n_slabs_reserved_available += 2;
307 pthread_mutex_unlock(&bp->lock);
308
309 free(bc);
310}
311
312/* To work correctly, the implementation requires that the *n_buffers* input
313 * argument is never greater than the buffer pool's *n_buffers_per_slab*. This
314 * is typically the case, with one exception taking place when a large number
315 * of buffers is allocated at init time (e.g. for the UMEM fill queue setup).
316 */
317static inline u32
318bcache_cons_check(struct bcache *bc, u32 n_buffers)
319{
320 struct bpool *bp = bc->bp;
321 u64 n_buffers_per_slab = bp->params.n_buffers_per_slab;
322 u64 n_buffers_cons = bc->n_buffers_cons;
323 u64 n_slabs_available;
324 u64 *slab_full;
325
326 /*
327 * Consumer slab is not empty: Use what's available locally. Do not
328 * look for more buffers from the pool when the ask can only be
329 * partially satisfied.
330 */
331 if (n_buffers_cons)
332 return (n_buffers_cons < n_buffers) ?
333 n_buffers_cons :
334 n_buffers;
335
336 /*
337 * Consumer slab is empty: look to trade the current consumer slab
338 * (full) for a full slab from the pool, if any is available.
339 */
340 pthread_mutex_lock(&bp->lock);
341 n_slabs_available = bp->n_slabs_available;
342 if (!n_slabs_available) {
343 pthread_mutex_unlock(&bp->lock);
344 return 0;
345 }
346
347 n_slabs_available--;
348 slab_full = bp->slabs[n_slabs_available];
349 bp->slabs[n_slabs_available] = bc->slab_cons;
350 bp->n_slabs_available = n_slabs_available;
351 pthread_mutex_unlock(&bp->lock);
352
353 bc->slab_cons = slab_full;
354 bc->n_buffers_cons = n_buffers_per_slab;
355 return n_buffers;
356}
357
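/* The two calls are used together, as port_rx_burst() does below: one
 * (possibly locking) slab check, then lock-free pops. A condensed
 * sketch, illustrative only and kept out of the build:
 */
#if 0
static u32 bcache_cons_burst_sketch(struct bcache *bc, u64 *addr)
{
	u32 i, n;

	/* Never ask for more than one slab's worth of buffers. */
	n = bcache_cons_check(bc, MAX_BURST_RX);
	for (i = 0; i < n; i++)
		addr[i] = bcache_cons(bc);

	return n;
}
#endif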
358static inline u64
359bcache_cons(struct bcache *bc)
360{
361 u64 n_buffers_cons = bc->n_buffers_cons - 1;
362 u64 buffer;
363
364 buffer = bc->slab_cons[n_buffers_cons];
365 bc->n_buffers_cons = n_buffers_cons;
366 return buffer;
367}
368
369static inline void
370bcache_prod(struct bcache *bc, u64 buffer)
371{
372 struct bpool *bp = bc->bp;
373 u64 n_buffers_per_slab = bp->params.n_buffers_per_slab;
374 u64 n_buffers_prod = bc->n_buffers_prod;
375 u64 n_slabs_available;
376 u64 *slab_empty;
377
378 /*
379 * Producer slab is not yet full: store the current buffer to it.
380 */
381 if (n_buffers_prod < n_buffers_per_slab) {
382 bc->slab_prod[n_buffers_prod] = buffer;
383 bc->n_buffers_prod = n_buffers_prod + 1;
384 return;
385 }
386
387 /*
388 * Producer slab is full: trade the cache's current producer slab
389 * (full) for an empty slab from the pool, then store the current
390 * buffer to the new producer slab. As one full slab exists in the
391 * cache, it is guaranteed that there is at least one empty slab
392 * available in the pool.
393 */
394 pthread_mutex_lock(&bp->lock);
395 n_slabs_available = bp->n_slabs_available;
396 slab_empty = bp->slabs[n_slabs_available];
397 bp->slabs[n_slabs_available] = bc->slab_prod;
398 bp->n_slabs_available = n_slabs_available + 1;
399 pthread_mutex_unlock(&bp->lock);
400
401 slab_empty[0] = buffer;
402 bc->slab_prod = slab_empty;
403 bc->n_buffers_prod = 1;
404}
405
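/* The free path mirrors the allocation path: port_tx_burst() below uses
 * repeated pushes to recycle buffers drained from the UMEM completion
 * queue. A condensed sketch, illustrative only and kept out of the build:
 */
#if 0
static void bcache_prod_burst_sketch(struct bcache *bc, const u64 *addr,
				     u32 n)
{
	u32 i;

	/* The pool lock is taken only on iterations that fill a slab. */
	for (i = 0; i < n; i++)
		bcache_prod(bc, addr[i]);
}
#endif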
406/*
407 * Port
408 *
409 * Each of the forwarding ports sits on top of an AF_XDP socket. In order for
410 * packet forwarding to happen with no packet buffer copy, all the sockets need
411 * to share the same UMEM area, which is used as the buffer pool memory.
412 */
413#ifndef MAX_BURST_RX
414#define MAX_BURST_RX 64
415#endif
416
417#ifndef MAX_BURST_TX
418#define MAX_BURST_TX 64
419#endif
420
421struct burst_rx {
422 u64 addr[MAX_BURST_RX];
423 u32 len[MAX_BURST_RX];
424};
425
426struct burst_tx {
427 u64 addr[MAX_BURST_TX];
428 u32 len[MAX_BURST_TX];
429 u32 n_pkts;
430};
431
432struct port_params {
433 struct xsk_socket_config xsk_cfg;
434 struct bpool *bp;
435 const char *iface;
436 u32 iface_queue;
437};
438
439struct port {
440 struct port_params params;
441
442 struct bcache *bc;
443
444 struct xsk_ring_cons rxq;
445 struct xsk_ring_prod txq;
446 struct xsk_ring_prod umem_fq;
447 struct xsk_ring_cons umem_cq;
448 struct xsk_socket *xsk;
449 int umem_fq_initialized;
450
451 u64 n_pkts_rx;
452 u64 n_pkts_tx;
453};
454
455static void
456port_free(struct port *p)
457{
458 if (!p)
459 return;
460
461 /* To keep this example simple, the code to free the buffers from the
462 * socket's receive and transmit queues, as well as from the UMEM fill
463 * and completion queues, is not included.
464 */
465
466 if (p->xsk)
467 xsk_socket__delete(p->xsk);
468
469 bcache_free(p->bc);
470
471 free(p);
472}
473
474static struct port *
475port_init(struct port_params *params)
476{
477 struct port *p;
478 u32 umem_fq_size, pos = 0;
479 int status, i;
480
481 /* Memory allocation and initialization. */
482 p = calloc(1, sizeof(struct port));
483 if (!p)
484 return NULL;
485
486 memcpy(&p->params, params, sizeof(p->params));
487 umem_fq_size = params->bp->umem_cfg.fill_size;
488
489 /* bcache. */
490 p->bc = bcache_init(params->bp);
491 if (!p->bc ||
492 (bcache_slab_size(p->bc) < umem_fq_size) ||
493 (bcache_cons_check(p->bc, umem_fq_size) < umem_fq_size)) {
494 port_free(p);
495 return NULL;
496 }
497
498 /* xsk socket. */
499 status = xsk_socket__create_shared(&p->xsk,
500 params->iface,
501 params->iface_queue,
502 params->bp->umem,
503 &p->rxq,
504 &p->txq,
505 &p->umem_fq,
506 &p->umem_cq,
507 &params->xsk_cfg);
508 if (status) {
509 port_free(p);
510 return NULL;
511 }
512
513 /* umem fq. */
514 xsk_ring_prod__reserve(&p->umem_fq, umem_fq_size, &pos);
515
516 for (i = 0; i < umem_fq_size; i++)
517 *xsk_ring_prod__fill_addr(&p->umem_fq, pos + i) =
518 bcache_cons(p->bc);
519
520 xsk_ring_prod__submit(&p->umem_fq, umem_fq_size);
521 p->umem_fq_initialized = 1;
522
523 return p;
524}
525
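/* Wiring two ports to one shared pool looks roughly as follows; the
 * interface names are placeholders and main() below performs the real
 * setup. Sharing one bpool means sharing one UMEM, which is what makes
 * zero-copy forwarding between the ports possible. Illustrative only,
 * kept out of the build:
 */
#if 0
static void two_port_setup_sketch(struct bpool *bp)
{
	struct port_params pp0, pp1;
	struct port *p0, *p1;

	memcpy(&pp0, &port_params_default, sizeof(pp0));
	pp0.bp = bp;
	pp0.iface = "eth0";		/* placeholder */
	pp0.iface_queue = 0;

	pp1 = pp0;
	pp1.iface = "eth1";		/* placeholder */

	p0 = port_init(&pp0);
	p1 = port_init(&pp1);
}
#endif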
526static inline u32
527port_rx_burst(struct port *p, struct burst_rx *b)
528{
529 u32 n_pkts, pos, i;
530
531 /* Free buffers for FQ replenish. */
532 n_pkts = ARRAY_SIZE(b->addr);
533
534 n_pkts = bcache_cons_check(p->bc, n_pkts);
535 if (!n_pkts)
536 return 0;
537
538 /* RXQ. */
539 n_pkts = xsk_ring_cons__peek(&p->rxq, n_pkts, &pos);
540 if (!n_pkts) {
541 if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) {
542 struct pollfd pollfd = {
543 .fd = xsk_socket__fd(p->xsk),
544 .events = POLLIN,
545 };
546
547 poll(&pollfd, 1, 0);
548 }
549 return 0;
550 }
551
552 for (i = 0; i < n_pkts; i++) {
553 b->addr[i] = xsk_ring_cons__rx_desc(&p->rxq, pos + i)->addr;
554 b->len[i] = xsk_ring_cons__rx_desc(&p->rxq, pos + i)->len;
555 }
556
557 xsk_ring_cons__release(&p->rxq, n_pkts);
558 p->n_pkts_rx += n_pkts;
559
560 /* UMEM FQ. */
561 for ( ; ; ) {
562 int status;
563
564 status = xsk_ring_prod__reserve(&p->umem_fq, n_pkts, &pos);
565 if (status == n_pkts)
566 break;
567
568 if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) {
569 struct pollfd pollfd = {
570 .fd = xsk_socket__fd(p->xsk),
571 .events = POLLIN,
572 };
573
574 poll(&pollfd, 1, 0);
575 }
576 }
577
578 for (i = 0; i < n_pkts; i++)
579 *xsk_ring_prod__fill_addr(&p->umem_fq, pos + i) =
580 bcache_cons(p->bc);
581
582 xsk_ring_prod__submit(&p->umem_fq, n_pkts);
583
584 return n_pkts;
585}
586
587static inline void
588port_tx_burst(struct port *p, struct burst_tx *b)
589{
590 u32 n_pkts, pos, i;
591 int status;
592
593 /* UMEM CQ. */
594 n_pkts = p->params.bp->umem_cfg.comp_size;
595
596 n_pkts = xsk_ring_cons__peek(&p->umem_cq, n_pkts, &pos);
597
598 for (i = 0; i < n_pkts; i++) {
599 u64 addr = *xsk_ring_cons__comp_addr(&p->umem_cq, pos + i);
600
601 bcache_prod(p->bc, addr);
602 }
603
604 xsk_ring_cons__release(&p->umem_cq, n_pkts);
605
606 /* TXQ. */
607 n_pkts = b->n_pkts;
608
609 for ( ; ; ) {
610 status = xsk_ring_prod__reserve(&p->txq, n_pkts, &pos);
611 if (status == n_pkts)
612 break;
613
614 if (xsk_ring_prod__needs_wakeup(&p->txq))
615 sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT,
616 NULL, 0);
617 }
618
619 for (i = 0; i < n_pkts; i++) {
620 xsk_ring_prod__tx_desc(&p->txq, pos + i)->addr = b->addr[i];
621 xsk_ring_prod__tx_desc(&p->txq, pos + i)->len = b->len[i];
622 }
623
624 xsk_ring_prod__submit(&p->txq, n_pkts);
625 if (xsk_ring_prod__needs_wakeup(&p->txq))
626 sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
627 p->n_pkts_tx += n_pkts;
628}
629
630/*
631 * Thread
632 *
633 * Packet forwarding threads.
634 */
635#ifndef MAX_PORTS_PER_THREAD
636#define MAX_PORTS_PER_THREAD 16
637#endif
638
639struct thread_data {
640 struct port *ports_rx[MAX_PORTS_PER_THREAD];
641 struct port *ports_tx[MAX_PORTS_PER_THREAD];
642 u32 n_ports_rx;
643 struct burst_rx burst_rx;
644 struct burst_tx burst_tx[MAX_PORTS_PER_THREAD];
645 u32 cpu_core_id;
646 int quit;
647};
648
649static void swap_mac_addresses(void *data)
650{
651 struct ether_header *eth = (struct ether_header *)data;
652 struct ether_addr *src_addr = (struct ether_addr *)&eth->ether_shost;
653 struct ether_addr *dst_addr = (struct ether_addr *)&eth->ether_dhost;
654 struct ether_addr tmp;
655
656 tmp = *src_addr;
657 *src_addr = *dst_addr;
658 *dst_addr = tmp;
659}
660
661static void *
662thread_func(void *arg)
663{
664 struct thread_data *t = arg;
665 cpu_set_t cpu_cores;
666 u32 i;
667
668 CPU_ZERO(&cpu_cores);
669 CPU_SET(t->cpu_core_id, &cpu_cores);
670 pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_cores);
671
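	/* Round-robin over this thread's RX ports. Note that the bitmask
	 * wrap below assumes n_ports_rx is a power of two; parse_args()
	 * only checks that the ports divide evenly among the threads.
	 */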
672 for (i = 0; !t->quit; i = (i + 1) & (t->n_ports_rx - 1)) {
673 struct port *port_rx = t->ports_rx[i];
674 struct port *port_tx = t->ports_tx[i];
675 struct burst_rx *brx = &t->burst_rx;
676 struct burst_tx *btx = &t->burst_tx[i];
677 u32 n_pkts, j;
678
679 /* RX. */
680 n_pkts = port_rx_burst(port_rx, brx);
681 if (!n_pkts)
682 continue;
683
684 /* Process & TX. */
685 for (j = 0; j < n_pkts; j++) {
686 u64 addr = xsk_umem__add_offset_to_addr(brx->addr[j]);
687 u8 *pkt = xsk_umem__get_data(port_rx->params.bp->addr,
688 addr);
689
690 swap_mac_addresses(pkt);
691
692 btx->addr[btx->n_pkts] = brx->addr[j];
693 btx->len[btx->n_pkts] = brx->len[j];
694 btx->n_pkts++;
695
696 if (btx->n_pkts == MAX_BURST_TX) {
697 port_tx_burst(port_tx, btx);
698 btx->n_pkts = 0;
699 }
700 }
701 }
702
703 return NULL;
704}
705
706/*
707 * Process
708 */
709static const struct bpool_params bpool_params_default = {
710 .n_buffers = 64 * 1024,
711 .buffer_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
712 .mmap_flags = 0,
713
714 .n_users_max = 16,
715 .n_buffers_per_slab = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2,
716};
717
718static const struct xsk_umem_config umem_cfg_default = {
719 .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2,
720 .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
721 .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
722 .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
723 .flags = 0,
724};
725
726static const struct port_params port_params_default = {
727 .xsk_cfg = {
728 .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
729 .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
730 .libbpf_flags = 0,
731 .xdp_flags = XDP_FLAGS_DRV_MODE,
732 .bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY,
733 },
734
735 .bp = NULL,
736 .iface = NULL,
737 .iface_queue = 0,
738};
739
740#ifndef MAX_PORTS
741#define MAX_PORTS 64
742#endif
743
744#ifndef MAX_THREADS
745#define MAX_THREADS 64
746#endif
747
748static struct bpool_params bpool_params;
749static struct xsk_umem_config umem_cfg;
750static struct bpool *bp;
751
752static struct port_params port_params[MAX_PORTS];
753static struct port *ports[MAX_PORTS];
754static u64 n_pkts_rx[MAX_PORTS];
755static u64 n_pkts_tx[MAX_PORTS];
756static int n_ports;
757
758static pthread_t threads[MAX_THREADS];
759static struct thread_data thread_data[MAX_THREADS];
760static int n_threads;
761
762static void
763print_usage(char *prog_name)
764{
765 const char *usage =
766 "Usage:\n"
767 "\t%s [ -b SIZE ] -c CORE -i INTERFACE [ -q QUEUE ]\n"
768 "\n"
769 "-c CORE CPU core to run a packet forwarding thread\n"
770 " on. May be invoked multiple times.\n"
771 "\n"
772 "-b SIZE Number of buffers in the buffer pool shared\n"
773 " by all the forwarding threads. Default: %u.\n"
774 "\n"
775 "-i INTERFACE Network interface. Each (INTERFACE, QUEUE)\n"
776 " pair specifies one forwarding port. May be\n"
777 " invoked multiple times.\n"
778 "\n"
779 "-q QUEUE Network interface queue for RX and TX. Each\n"
780 " (INTERFACE, QUEUE) pair specified one\n"
781 " forwarding port. Default: %u. May be invoked\n"
782 " multiple times.\n"
783 "\n";
784 printf(usage,
785 prog_name,
786 bpool_params_default.n_buffers,
787 port_params_default.iface_queue);
788}
789
790static int
791parse_args(int argc, char **argv)
792{
793 struct option lgopts[] = {
794 { NULL, 0, 0, 0 }
795 };
796 int opt, option_index;
797
798 /* Parse the input arguments. */
799 for ( ; ; ) {
800 opt = getopt_long(argc, argv, "b:c:i:q:", lgopts, &option_index);
801 if (opt == -1)
802 break;
803
804 switch (opt) {
805 case 'b':
806 bpool_params.n_buffers = atoi(optarg);
807 break;
808
809 case 'c':
810 if (n_threads == MAX_THREADS) {
811 printf("Max number of threads (%d) reached.\n",
812 MAX_THREADS);
813 return -1;
814 }
815
816 thread_data[n_threads].cpu_core_id = atoi(optarg);
817 n_threads++;
818 break;
819
820 case 'i':
821 if (n_ports == MAX_PORTS) {
822 printf("Max number of ports (%d) reached.\n",
823 MAX_PORTS);
824 return -1;
825 }
826
827 port_params[n_ports].iface = optarg;
828 port_params[n_ports].iface_queue = 0;
829 n_ports++;
830 break;
831
832 case 'q':
833 if (n_ports == 0) {
834 printf("No port specified for queue.\n");
835 return -1;
836 }
837 port_params[n_ports - 1].iface_queue = atoi(optarg);
838 break;
839
840 default:
841 printf("Illegal argument.\n");
842 return -1;
843 }
844 }
845
846 optind = 1; /* reset getopt lib */
847
848 /* Check the input arguments. */
849 if (!n_ports) {
850 printf("No ports specified.\n");
851 return -1;
852 }
853
854 if (!n_threads) {
855 printf("No threads specified.\n");
856 return -1;
857 }
858
859 if (n_ports % n_threads) {
860 printf("Ports cannot be evenly distributed to threads.\n");
861 return -1;
862 }
863
864 return 0;
865}
866
867static void
868print_port(u32 port_id)
869{
870 struct port *port = ports[port_id];
871
872 printf("Port %u: interface = %s, queue = %u\n",
873 port_id, port->params.iface, port->params.iface_queue);
874}
875
876static void
877print_thread(u32 thread_id)
878{
879 struct thread_data *t = &thread_data[thread_id];
880 u32 i;
881
882 printf("Thread %u (CPU core %u): ",
883 thread_id, t->cpu_core_id);
884
885 for (i = 0; i < t->n_ports_rx; i++) {
886 struct port *port_rx = t->ports_rx[i];
887 struct port *port_tx = t->ports_tx[i];
888
889 printf("(%s, %u) -> (%s, %u), ",
890 port_rx->params.iface,
891 port_rx->params.iface_queue,
892 port_tx->params.iface,
893 port_tx->params.iface_queue);
894 }
895
896 printf("\n");
897}
898
899static void
900print_port_stats_separator(void)
901{
902 printf("+-%4s-+-%12s-+-%13s-+-%12s-+-%13s-+\n",
903 "----",
904 "------------",
905 "-------------",
906 "------------",
907 "-------------");
908}
909
910static void
911print_port_stats_header(void)
912{
913 print_port_stats_separator();
914 printf("| %4s | %12s | %13s | %12s | %13s |\n",
915 "Port",
916 "RX packets",
917 "RX rate (pps)",
918 "TX packets",
919 "TX_rate (pps)");
920 print_port_stats_separator();
921}
922
923static void
924print_port_stats_trailer(void)
925{
926 print_port_stats_separator();
927 printf("\n");
928}
929
930static void
931print_port_stats(int port_id, u64 ns_diff)
932{
933 struct port *p = ports[port_id];
934 double rx_pps, tx_pps;
935
936 rx_pps = (p->n_pkts_rx - n_pkts_rx[port_id]) * 1000000000. / ns_diff;
937 tx_pps = (p->n_pkts_tx - n_pkts_tx[port_id]) * 1000000000. / ns_diff;
938
939 printf("| %4d | %12llu | %13.0f | %12llu | %13.0f |\n",
940 port_id,
941 p->n_pkts_rx,
942 rx_pps,
943 p->n_pkts_tx,
944 tx_pps);
945
946 n_pkts_rx[port_id] = p->n_pkts_rx;
947 n_pkts_tx[port_id] = p->n_pkts_tx;
948}
949
950static void
951print_port_stats_all(u64 ns_diff)
952{
953 int i;
954
955 print_port_stats_header();
956 for (i = 0; i < n_ports; i++)
957 print_port_stats(i, ns_diff);
958 print_port_stats_trailer();
959}
960
961static int quit;
962
963static void
964signal_handler(int sig)
965{
966 quit = 1;
967}
968
969static void remove_xdp_program(void)
970{
971 int i;
972
973 for (i = 0 ; i < n_ports; i++)
974 bpf_set_link_xdp_fd(if_nametoindex(port_params[i].iface), -1,
975 port_params[i].xsk_cfg.xdp_flags);
976}
977
978int main(int argc, char **argv)
979{
980 struct timespec time;
981 u64 ns0;
982 int i;
983
984 /* Parse args. */
985 memcpy(&bpool_params, &bpool_params_default,
986 sizeof(struct bpool_params));
987 memcpy(&umem_cfg, &umem_cfg_default,
988 sizeof(struct xsk_umem_config));
989 for (i = 0; i < MAX_PORTS; i++)
990 memcpy(&port_params[i], &port_params_default,
991 sizeof(struct port_params));
992
993 if (parse_args(argc, argv)) {
994 print_usage(argv[0]);
995 return -1;
996 }
997
998 /* Buffer pool initialization. */
999 bp = bpool_init(&bpool_params, &umem_cfg);
1000 if (!bp) {
1001 printf("Buffer pool initialization failed.\n");
1002 return -1;
1003 }
1004 printf("Buffer pool created successfully.\n");
1005
1006 /* Ports initialization. */
1007 for (i = 0; i < MAX_PORTS; i++)
1008 port_params[i].bp = bp;
1009
1010 for (i = 0; i < n_ports; i++) {
1011 ports[i] = port_init(&port_params[i]);
1012 if (!ports[i]) {
1013 printf("Port %d initialization failed.\n", i);
1014 return -1;
1015 }
1016 print_port(i);
1017 }
1018 printf("All ports created successfully.\n");
1019
1020 /* Threads. */
1021 for (i = 0; i < n_threads; i++) {
1022 struct thread_data *t = &thread_data[i];
1023 u32 n_ports_per_thread = n_ports / n_threads, j;
1024
1025 for (j = 0; j < n_ports_per_thread; j++) {
1026 t->ports_rx[j] = ports[i * n_ports_per_thread + j];
1027 t->ports_tx[j] = ports[i * n_ports_per_thread +
1028 (j + 1) % n_ports_per_thread];
1029 }
1030
1031 t->n_ports_rx = n_ports_per_thread;
1032
1033 print_thread(i);
1034 }
1035
1036 for (i = 0; i < n_threads; i++) {
1037 int status;
1038
1039 status = pthread_create(&threads[i],
1040 NULL,
1041 thread_func,
1042 &thread_data[i]);
1043 if (status) {
1044 printf("Thread %d creation failed.\n", i);
1045 return -1;
1046 }
1047 }
1048 printf("All threads created successfully.\n");
1049
1050 /* Print statistics. */
1051 signal(SIGINT, signal_handler);
1052 signal(SIGTERM, signal_handler);
1053 signal(SIGABRT, signal_handler);
1054
1055 clock_gettime(CLOCK_MONOTONIC, &time);
1056 ns0 = time.tv_sec * 1000000000UL + time.tv_nsec;
1057 for ( ; !quit; ) {
1058 u64 ns1, ns_diff;
1059
1060 sleep(1);
1061 clock_gettime(CLOCK_MONOTONIC, &time);
1062 ns1 = time.tv_sec * 1000000000UL + time.tv_nsec;
1063 ns_diff = ns1 - ns0;
1064 ns0 = ns1;
1065
1066 print_port_stats_all(ns_diff);
1067 }
1068
1069 /* Threads completion. */
1070 printf("Quit.\n");
1071 for (i = 0; i < n_threads; i++)
1072 thread_data[i].quit = 1;
1073
1074 for (i = 0; i < n_threads; i++)
1075 pthread_join(threads[i], NULL);
1076
1077 for (i = 0; i < n_ports; i++)
1078 port_free(ports[i]);
1079
1080 bpool_free(bp);
1081
1082 remove_xdp_program();
1083
1084 return 0;
1085}
diff --git a/samples/configfs/Makefile b/samples/configfs/Makefile
new file mode 100644
index 000000000..92d661fcb
--- /dev/null
+++ b/samples/configfs/Makefile
@@ -0,0 +1,3 @@
1# SPDX-License-Identifier: GPL-2.0-only
2
3obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs_sample.o
diff --git a/samples/configfs/configfs_sample.c b/samples/configfs/configfs_sample.c
new file mode 100644
index 000000000..f9008be7a
--- /dev/null
+++ b/samples/configfs/configfs_sample.c
@@ -0,0 +1,369 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * vim: noexpandtab ts=8 sts=0 sw=8:
4 *
5 * configfs_sample.c - This file is a demonstration module
6 * containing a number of configfs subsystems. It uses the helper
7 * macros defined by configfs.h
8 *
9 * Based on sysfs:
10 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
11 *
12 * configfs Copyright (C) 2005 Oracle. All rights reserved.
13 */
14
15#include <linux/init.h>
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/slab.h>
19#include <linux/configfs.h>
20
21/*
22 * 01-childless
23 *
24 * This first example is a childless subsystem. It cannot create
25 * any config_items. It just has attributes.
26 *
27 * Note that we are enclosing the configfs_subsystem inside a container.
28 * This is not necessary if a subsystem has no attributes directly
29 * on the subsystem. See the next example, 02-simple-children, for
30 * such a subsystem.
31 */
32
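/* Seen from user space, the subsystem is just a directory of attribute
 * files under the configfs mount point. A hypothetical reader, kept out
 * of the build like the example blocks elsewhere in samples (the path
 * assumes configfs is mounted at /sys/kernel/config):
 */
#if 0
#include <stdio.h>

int main(void)
{
	char buf[16];
	FILE *f = fopen("/sys/kernel/config/01-childless/showme", "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("showme = %s", buf);	/* increments on each read */
	if (f)
		fclose(f);
	return 0;
}
#endif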
33struct childless {
34 struct configfs_subsystem subsys;
35 int showme;
36 int storeme;
37};
38
39static inline struct childless *to_childless(struct config_item *item)
40{
41 return container_of(to_configfs_subsystem(to_config_group(item)),
42 struct childless, subsys);
43}
44
45static ssize_t childless_showme_show(struct config_item *item, char *page)
46{
47 struct childless *childless = to_childless(item);
48 ssize_t pos;
49
50 pos = sprintf(page, "%d\n", childless->showme);
51 childless->showme++;
52
53 return pos;
54}
55
56static ssize_t childless_storeme_show(struct config_item *item, char *page)
57{
58 return sprintf(page, "%d\n", to_childless(item)->storeme);
59}
60
61static ssize_t childless_storeme_store(struct config_item *item,
62 const char *page, size_t count)
63{
64 struct childless *childless = to_childless(item);
65 int ret;
66
67 ret = kstrtoint(page, 10, &childless->storeme);
68 if (ret)
69 return ret;
70
71 return count;
72}
73
74static ssize_t childless_description_show(struct config_item *item, char *page)
75{
76 return sprintf(page,
77"[01-childless]\n"
78"\n"
79"The childless subsystem is the simplest possible subsystem in\n"
80"configfs. It does not support the creation of child config_items.\n"
81"It only has a few attributes. In fact, it isn't much different\n"
82"than a directory in /proc.\n");
83}
84
85CONFIGFS_ATTR_RO(childless_, showme);
86CONFIGFS_ATTR(childless_, storeme);
87CONFIGFS_ATTR_RO(childless_, description);
88
89static struct configfs_attribute *childless_attrs[] = {
90 &childless_attr_showme,
91 &childless_attr_storeme,
92 &childless_attr_description,
93 NULL,
94};
95
96static const struct config_item_type childless_type = {
97 .ct_attrs = childless_attrs,
98 .ct_owner = THIS_MODULE,
99};
100
101static struct childless childless_subsys = {
102 .subsys = {
103 .su_group = {
104 .cg_item = {
105 .ci_namebuf = "01-childless",
106 .ci_type = &childless_type,
107 },
108 },
109 },
110};
111
112/* ----------------------------------------------------------------- */
113
114/*
115 * 02-simple-children
116 *
117 * This example merely has a simple one-attribute child. Note that
118 * there is no extra attribute structure, as the child's attribute is
119 * known from the get-go. Also, there is no container for the
120 * subsystem, as it has no attributes of its own.
121 */
122
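/* Children of this subsystem are created from user space with mkdir(2),
 * which ends up in simple_children_make_item() below; rmdir(2) releases
 * the item. A hypothetical sketch, kept out of the build (the path
 * assumes the default configfs mount point):
 */
#if 0
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	if (mkdir("/sys/kernel/config/02-simple-children/child0", 0755))
		perror("mkdir");
	/* The child's attribute is then writable, e.g.:
	 * echo 42 > /sys/kernel/config/02-simple-children/child0/storeme
	 */
	return 0;
}
#endif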
123struct simple_child {
124 struct config_item item;
125 int storeme;
126};
127
128static inline struct simple_child *to_simple_child(struct config_item *item)
129{
130 return container_of(item, struct simple_child, item);
131}
132
133static ssize_t simple_child_storeme_show(struct config_item *item, char *page)
134{
135 return sprintf(page, "%d\n", to_simple_child(item)->storeme);
136}
137
138static ssize_t simple_child_storeme_store(struct config_item *item,
139 const char *page, size_t count)
140{
141 struct simple_child *simple_child = to_simple_child(item);
142 int ret;
143
144 ret = kstrtoint(page, 10, &simple_child->storeme);
145 if (ret)
146 return ret;
147
148 return count;
149}
150
151CONFIGFS_ATTR(simple_child_, storeme);
152
153static struct configfs_attribute *simple_child_attrs[] = {
154 &simple_child_attr_storeme,
155 NULL,
156};
157
158static void simple_child_release(struct config_item *item)
159{
160 kfree(to_simple_child(item));
161}
162
163static struct configfs_item_operations simple_child_item_ops = {
164 .release = simple_child_release,
165};
166
167static const struct config_item_type simple_child_type = {
168 .ct_item_ops = &simple_child_item_ops,
169 .ct_attrs = simple_child_attrs,
170 .ct_owner = THIS_MODULE,
171};
172
173struct simple_children {
174 struct config_group group;
175};
176
177static inline struct simple_children *to_simple_children(struct config_item *item)
178{
179 return container_of(to_config_group(item),
180 struct simple_children, group);
181}
182
183static struct config_item *simple_children_make_item(struct config_group *group,
184 const char *name)
185{
186 struct simple_child *simple_child;
187
188 simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL);
189 if (!simple_child)
190 return ERR_PTR(-ENOMEM);
191
192 config_item_init_type_name(&simple_child->item, name,
193 &simple_child_type);
194
195 return &simple_child->item;
196}
197
198static ssize_t simple_children_description_show(struct config_item *item,
199 char *page)
200{
201 return sprintf(page,
202"[02-simple-children]\n"
203"\n"
204"This subsystem allows the creation of child config_items. These\n"
205"items have only one attribute that is readable and writeable.\n");
206}
207
208CONFIGFS_ATTR_RO(simple_children_, description);
209
210static struct configfs_attribute *simple_children_attrs[] = {
211 &simple_children_attr_description,
212 NULL,
213};
214
215static void simple_children_release(struct config_item *item)
216{
217 kfree(to_simple_children(item));
218}
219
220static struct configfs_item_operations simple_children_item_ops = {
221 .release = simple_children_release,
222};
223
224/*
225 * Note that, since no extra work is required on ->drop_item(),
226 * no ->drop_item() is provided.
227 */
228static struct configfs_group_operations simple_children_group_ops = {
229 .make_item = simple_children_make_item,
230};
231
232static const struct config_item_type simple_children_type = {
233 .ct_item_ops = &simple_children_item_ops,
234 .ct_group_ops = &simple_children_group_ops,
235 .ct_attrs = simple_children_attrs,
236 .ct_owner = THIS_MODULE,
237};
238
239static struct configfs_subsystem simple_children_subsys = {
240 .su_group = {
241 .cg_item = {
242 .ci_namebuf = "02-simple-children",
243 .ci_type = &simple_children_type,
244 },
245 },
246};
247
248/* ----------------------------------------------------------------- */
249
250/*
251 * 03-group-children
252 *
253 * This example reuses the simple_children group from above. However,
254 * the simple_children group is not the subsystem itself, it is a
255 * child of the subsystem. Creation of a group in the subsystem creates
256 * a new simple_children group. That group can then have simple_child
257 * children of its own.
258 */
259
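/* Here the first mkdir(2) on the subsystem calls
 * group_children_make_group() to create a new simple_children group,
 * and a second mkdir(2) inside it creates a simple_child. A
 * hypothetical sketch, kept out of the build (the path assumes the
 * default configfs mount point):
 */
#if 0
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	if (mkdir("/sys/kernel/config/03-group-children/grp0", 0755) ||
	    mkdir("/sys/kernel/config/03-group-children/grp0/child0", 0755))
		perror("mkdir");
	return 0;
}
#endif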
260static struct config_group *group_children_make_group(
261 struct config_group *group, const char *name)
262{
263 struct simple_children *simple_children;
264
265 simple_children = kzalloc(sizeof(struct simple_children),
266 GFP_KERNEL);
267 if (!simple_children)
268 return ERR_PTR(-ENOMEM);
269
270 config_group_init_type_name(&simple_children->group, name,
271 &simple_children_type);
272
273 return &simple_children->group;
274}
275
276static ssize_t group_children_description_show(struct config_item *item,
277 char *page)
278{
279 return sprintf(page,
280"[03-group-children]\n"
281"\n"
282"This subsystem allows the creation of child config_groups. These\n"
283"groups are like the subsystem simple-children.\n");
284}
285
286CONFIGFS_ATTR_RO(group_children_, description);
287
288static struct configfs_attribute *group_children_attrs[] = {
289 &group_children_attr_description,
290 NULL,
291};
292
293/*
294 * Note that, since no extra work is required on ->drop_item(),
295 * no ->drop_item() is provided.
296 */
297static struct configfs_group_operations group_children_group_ops = {
298 .make_group = group_children_make_group,
299};
300
301static const struct config_item_type group_children_type = {
302 .ct_group_ops = &group_children_group_ops,
303 .ct_attrs = group_children_attrs,
304 .ct_owner = THIS_MODULE,
305};
306
307static struct configfs_subsystem group_children_subsys = {
308 .su_group = {
309 .cg_item = {
310 .ci_namebuf = "03-group-children",
311 .ci_type = &group_children_type,
312 },
313 },
314};
315
316/* ----------------------------------------------------------------- */
317
318/*
319 * We're now done with our subsystem definitions.
320 * For convenience in this module, here's a list of them all. It
321 * allows the init function to easily register them. Most modules
322 * will only have one subsystem, and will simply call
323 * configfs_register_subsystem() on it directly.
324 */
325static struct configfs_subsystem *example_subsys[] = {
326 &childless_subsys.subsys,
327 &simple_children_subsys,
328 &group_children_subsys,
329 NULL,
330};
331
332static int __init configfs_example_init(void)
333{
334 struct configfs_subsystem *subsys;
335 int ret, i;
336
337 for (i = 0; example_subsys[i]; i++) {
338 subsys = example_subsys[i];
339
340 config_group_init(&subsys->su_group);
341 mutex_init(&subsys->su_mutex);
342 ret = configfs_register_subsystem(subsys);
343 if (ret) {
344 pr_err("Error %d while registering subsystem %s\n",
345 ret, subsys->su_group.cg_item.ci_namebuf);
346 goto out_unregister;
347 }
348 }
349
350 return 0;
351
352out_unregister:
353 for (i--; i >= 0; i--)
354 configfs_unregister_subsystem(example_subsys[i]);
355
356 return ret;
357}
358
359static void __exit configfs_example_exit(void)
360{
361 int i;
362
363 for (i = 0; example_subsys[i]; i++)
364 configfs_unregister_subsystem(example_subsys[i]);
365}
366
367module_init(configfs_example_init);
368module_exit(configfs_example_exit);
369MODULE_LICENSE("GPL");
diff --git a/samples/connector/.gitignore b/samples/connector/.gitignore
new file mode 100644
index 000000000..d86f2ff9c
--- /dev/null
+++ b/samples/connector/.gitignore
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0-only
2ucon
diff --git a/samples/connector/Makefile b/samples/connector/Makefile
new file mode 100644
index 000000000..d98a9e047
--- /dev/null
+++ b/samples/connector/Makefile
@@ -0,0 +1,6 @@
1# SPDX-License-Identifier: GPL-2.0
2obj-$(CONFIG_SAMPLE_CONNECTOR) += cn_test.o
3
4userprogs-always-$(CONFIG_CC_CAN_LINK) += ucon
5
6userccflags += -I usr/include
diff --git a/samples/connector/cn_test.c b/samples/connector/cn_test.c
new file mode 100644
index 000000000..0958a171d
--- /dev/null
+++ b/samples/connector/cn_test.c
@@ -0,0 +1,188 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * cn_test.c
4 *
5 * 2004+ Copyright (c) Evgeniy Polyakov <zbr@ioremap.net>
6 * All rights reserved.
7 */
8
9#define pr_fmt(fmt) "cn_test: " fmt
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/moduleparam.h>
14#include <linux/skbuff.h>
15#include <linux/slab.h>
16#include <linux/timer.h>
17
18#include <linux/connector.h>
19
20static struct cb_id cn_test_id = { CN_NETLINK_USERS + 3, 0x456 };
21static char cn_test_name[] = "cn_test";
22static struct sock *nls;
23static struct timer_list cn_test_timer;
24
25static void cn_test_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
26{
27 pr_info("%s: %lu: idx=%x, val=%x, seq=%u, ack=%u, len=%d: %s.\n",
28 __func__, jiffies, msg->id.idx, msg->id.val,
29 msg->seq, msg->ack, msg->len,
30 msg->len ? (char *)msg->data : "");
31}
32
33/*
34 * Do not remove this function even if no one is using it, as
35 * this is an example of how to get notifications about new
36 * connector user registration.
37 */
38#if 0
39static int cn_test_want_notify(void)
40{
41 struct cn_ctl_msg *ctl;
42 struct cn_notify_req *req;
43 struct cn_msg *msg = NULL;
44 int size, size0;
45 struct sk_buff *skb;
46 struct nlmsghdr *nlh;
47 u32 group = 1;
48
49 size0 = sizeof(*msg) + sizeof(*ctl) + 3 * sizeof(*req);
50
51 size = NLMSG_SPACE(size0);
52
53 skb = alloc_skb(size, GFP_ATOMIC);
54 if (!skb) {
55 pr_err("failed to allocate new skb with size=%u\n", size);
56 return -ENOMEM;
57 }
58
59 nlh = nlmsg_put(skb, 0, 0x123, NLMSG_DONE, size - sizeof(*nlh), 0);
60 if (!nlh) {
61 kfree_skb(skb);
62 return -EMSGSIZE;
63 }
64
65 msg = nlmsg_data(nlh);
66
67 memset(msg, 0, size0);
68
69 msg->id.idx = -1;
70 msg->id.val = -1;
71 msg->seq = 0x123;
72 msg->ack = 0x345;
73 msg->len = size0 - sizeof(*msg);
74
75 ctl = (struct cn_ctl_msg *)(msg + 1);
76
77 ctl->idx_notify_num = 1;
78 ctl->val_notify_num = 2;
79 ctl->group = group;
80 ctl->len = msg->len - sizeof(*ctl);
81
82 req = (struct cn_notify_req *)(ctl + 1);
83
84 /*
85 * Idx.
86 */
87 req->first = cn_test_id.idx;
88 req->range = 10;
89
90 /*
91 * Val 0.
92 */
93 req++;
94 req->first = cn_test_id.val;
95 req->range = 10;
96
97 /*
98 * Val 1.
99 */
100 req++;
101 req->first = cn_test_id.val + 20;
102 req->range = 10;
103
104 NETLINK_CB(skb).dst_group = ctl->group;
105 //netlink_broadcast(nls, skb, 0, ctl->group, GFP_ATOMIC);
106 netlink_unicast(nls, skb, 0, 0);
107
108 pr_info("request was sent: group=0x%x\n", ctl->group);
109
110 return 0;
111}
112#endif
113
114static u32 cn_test_timer_counter;
115static void cn_test_timer_func(struct timer_list *unused)
116{
117 struct cn_msg *m;
118 char data[32];
119
120 pr_debug("%s: timer fired\n", __func__);
121
122 m = kzalloc(sizeof(*m) + sizeof(data), GFP_ATOMIC);
123 if (m) {
124
125 memcpy(&m->id, &cn_test_id, sizeof(m->id));
126 m->seq = cn_test_timer_counter;
127 m->len = sizeof(data);
128
129 m->len =
130 scnprintf(data, sizeof(data), "counter = %u",
131 cn_test_timer_counter) + 1;
132
133 memcpy(m + 1, data, m->len);
134
135 cn_netlink_send(m, 0, 0, GFP_ATOMIC);
136 kfree(m);
137 }
138
139 cn_test_timer_counter++;
140
141 mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000));
142}
143
144static int cn_test_init(void)
145{
146 int err;
147
148 err = cn_add_callback(&cn_test_id, cn_test_name, cn_test_callback);
149 if (err)
150 goto err_out;
151 cn_test_id.val++;
152 err = cn_add_callback(&cn_test_id, cn_test_name, cn_test_callback);
153 if (err) {
154 cn_del_callback(&cn_test_id);
155 goto err_out;
156 }
157
158 timer_setup(&cn_test_timer, cn_test_timer_func, 0);
159 mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000));
160
161 pr_info("initialized with id={%u.%u}\n",
162 cn_test_id.idx, cn_test_id.val);
163
164 return 0;
165
166 err_out:
167 if (nls && nls->sk_socket)
168 sock_release(nls->sk_socket);
169
170 return err;
171}
172
173static void cn_test_fini(void)
174{
175 del_timer_sync(&cn_test_timer);
176 cn_del_callback(&cn_test_id);
177 cn_test_id.val--;
178 cn_del_callback(&cn_test_id);
179 if (nls && nls->sk_socket)
180 sock_release(nls->sk_socket);
181}
182
183module_init(cn_test_init);
184module_exit(cn_test_fini);
185
186MODULE_LICENSE("GPL");
187MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
188MODULE_DESCRIPTION("Connector's test module");
diff --git a/samples/connector/ucon.c b/samples/connector/ucon.c
new file mode 100644
index 000000000..fa17f8642
--- /dev/null
+++ b/samples/connector/ucon.c
@@ -0,0 +1,236 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * ucon.c
4 *
5 * Copyright (c) 2004+ Evgeniy Polyakov <zbr@ioremap.net>
6 */
7
8#include <asm/types.h>
9
10#include <sys/types.h>
11#include <sys/socket.h>
12#include <sys/poll.h>
13
14#include <linux/netlink.h>
15#include <linux/rtnetlink.h>
16
17#include <arpa/inet.h>
18
19#include <stdbool.h>
20#include <stdio.h>
21#include <stdlib.h>
22#include <unistd.h>
23#include <string.h>
24#include <errno.h>
25#include <time.h>
26#include <getopt.h>
27
28#include <linux/connector.h>
29
30#define DEBUG
31#define NETLINK_CONNECTOR 11
32
33/* Hopefully your userspace connector.h matches this kernel */
34#define CN_TEST_IDX (CN_NETLINK_USERS + 3)
35#define CN_TEST_VAL 0x456
36
37#ifdef DEBUG
38#define ulog(f, a...) fprintf(stdout, f, ##a)
39#else
40#define ulog(f, a...) do {} while (0)
41#endif
42
43static int need_exit;
44static __u32 seq;
45
46static int netlink_send(int s, struct cn_msg *msg)
47{
48 struct nlmsghdr *nlh;
49 unsigned int size;
50 int err;
51 char buf[128];
52 struct cn_msg *m;
53
54 size = NLMSG_SPACE(sizeof(struct cn_msg) + msg->len);
55
56 nlh = (struct nlmsghdr *)buf;
57 nlh->nlmsg_seq = seq++;
58 nlh->nlmsg_pid = getpid();
59 nlh->nlmsg_type = NLMSG_DONE;
60 nlh->nlmsg_len = size;
61 nlh->nlmsg_flags = 0;
62
63 m = NLMSG_DATA(nlh);
64#if 0
65 ulog("%s: [%08x.%08x] len=%u, seq=%u, ack=%u.\n",
66 __func__, msg->id.idx, msg->id.val, msg->len, msg->seq, msg->ack);
67#endif
68 memcpy(m, msg, sizeof(*m) + msg->len);
69
70 err = send(s, nlh, size, 0);
71 if (err == -1)
72 ulog("Failed to send: %s [%d].\n",
73 strerror(errno), errno);
74
75 return err;
76}
77
78static void usage(void)
79{
80 printf(
81 "Usage: ucon [options] [output file]\n"
82 "\n"
83 "\t-h\tthis help screen\n"
84 "\t-s\tsend buffers to the test module\n"
85 "\n"
86 "The default behavior of ucon is to subscribe to the test module\n"
87 "and wait for state messages. Any ones received are dumped to the\n"
88 "specified output file (or stdout). The test module is assumed to\n"
89 "have an id of {%u.%u}\n"
90 "\n"
91 "If you get no output, then verify the cn_test module id matches\n"
92 "the expected id above.\n"
93 , CN_TEST_IDX, CN_TEST_VAL
94 );
95}
96
97int main(int argc, char *argv[])
98{
99 int s;
100 char buf[1024];
101 int len;
102 struct nlmsghdr *reply;
103 struct sockaddr_nl l_local;
104 struct cn_msg *data;
105 FILE *out;
106 time_t tm;
107 struct pollfd pfd;
108 bool send_msgs = false;
109
110 while ((s = getopt(argc, argv, "hs")) != -1) {
111 switch (s) {
112 case 's':
113 send_msgs = true;
114 break;
115
116 case 'h':
117 usage();
118 return 0;
119
120 default:
121 /* getopt() outputs an error for us */
122 usage();
123 return 1;
124 }
125 }
126
127 if (argc != optind) {
128 out = fopen(argv[optind], "a+");
129 if (!out) {
130 ulog("Unable to open %s for writing: %s\n",
131 argv[optind], strerror(errno));
132 out = stdout;
133 }
134 } else
135 out = stdout;
136
137 memset(buf, 0, sizeof(buf));
138
139 s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
140 if (s == -1) {
141 perror("socket");
142 return -1;
143 }
144
145 l_local.nl_family = AF_NETLINK;
146 l_local.nl_groups = -1; /* bitmask of requested groups */
147 l_local.nl_pid = 0;
148
149 ulog("subscribing to %u.%u\n", CN_TEST_IDX, CN_TEST_VAL);
150
151 if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) {
152 perror("bind");
153 close(s);
154 return -1;
155 }
156
157#if 0
158 {
159 int on = 0x57; /* Additional group number */
160 setsockopt(s, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &on, sizeof(on));
161 }
162#endif
163 if (send_msgs) {
164 int i, j;
165
166 memset(buf, 0, sizeof(buf));
167
168 data = (struct cn_msg *)buf;
169
170 data->id.idx = CN_TEST_IDX;
171 data->id.val = CN_TEST_VAL;
172 data->seq = seq++;
173 data->ack = 0;
174 data->len = 0;
175
176 for (j = 0; j < 10; ++j) {
177 for (i = 0; i < 1000; ++i) {
178 len = netlink_send(s, data);
179 }
180
181 ulog("%d messages have been sent to %08x.%08x.\n", i, data->id.idx, data->id.val);
182 }
183
184 return 0;
185 }
186
187
188 pfd.fd = s;
189
190 while (!need_exit) {
191 pfd.events = POLLIN;
192 pfd.revents = 0;
193 switch (poll(&pfd, 1, -1)) {
194 case 0:
195 need_exit = 1;
196 break;
197 case -1:
198 if (errno != EINTR) {
199 need_exit = 1;
200 break;
201 }
202 continue;
203 }
204 if (need_exit)
205 break;
206
207 memset(buf, 0, sizeof(buf));
208 len = recv(s, buf, sizeof(buf), 0);
209 if (len == -1) {
210 perror("recv buf");
211 close(s);
212 return -1;
213 }
214 reply = (struct nlmsghdr *)buf;
215
216 switch (reply->nlmsg_type) {
217 case NLMSG_ERROR:
218 fprintf(out, "Error message received.\n");
219 fflush(out);
220 break;
221 case NLMSG_DONE:
222 data = (struct cn_msg *)NLMSG_DATA(reply);
223
224 time(&tm);
225 fprintf(out, "%.24s : [%x.%x] [%08u.%08u].\n",
226 ctime(&tm), data->id.idx, data->id.val, data->seq, data->ack);
227 fflush(out);
228 break;
229 default:
230 break;
231 }
232 }
233
234 close(s);
235 return 0;
236}
diff --git a/samples/ftrace/Makefile b/samples/ftrace/Makefile
new file mode 100644
index 000000000..4ce896e10
--- /dev/null
+++ b/samples/ftrace/Makefile
@@ -0,0 +1,8 @@
1# SPDX-License-Identifier: GPL-2.0-only
2
3obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct.o
4obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-too.o
5obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-modify.o
6
7CFLAGS_sample-trace-array.o := -I$(src)
8obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += sample-trace-array.o
diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c
new file mode 100644
index 000000000..d620f3da0
--- /dev/null
+++ b/samples/ftrace/ftrace-direct-modify.c
@@ -0,0 +1,97 @@
1// SPDX-License-Identifier: GPL-2.0-only
2#include <linux/module.h>
3#include <linux/kthread.h>
4#include <linux/ftrace.h>
5
6extern void my_direct_func1(void);
7extern void my_direct_func2(void);
8
9void my_direct_func1(void)
10{
11 trace_printk("my direct func1\n");
12}
13
14void my_direct_func2(void)
15{
16 trace_printk("my direct func2\n");
17}
18
19extern void my_tramp1(void *);
20extern void my_tramp2(void *);
21
22static unsigned long my_ip = (unsigned long)schedule;
23
24asm (
25" .pushsection .text, \"ax\", @progbits\n"
26" .type my_tramp1, @function\n"
27" .globl my_tramp1\n"
28" my_tramp1:"
29" pushq %rbp\n"
30" movq %rsp, %rbp\n"
31" call my_direct_func1\n"
32" leave\n"
33" .size my_tramp1, .-my_tramp1\n"
34 ASM_RET
35" .type my_tramp2, @function\n"
36" .globl my_tramp2\n"
37" my_tramp2:"
38" pushq %rbp\n"
39" movq %rsp, %rbp\n"
40" call my_direct_func2\n"
41" leave\n"
42 ASM_RET
43" .size my_tramp2, .-my_tramp2\n"
44" .popsection\n"
45);
46
47static unsigned long my_tramp = (unsigned long)my_tramp1;
48static unsigned long tramps[2] = {
49 (unsigned long)my_tramp1,
50 (unsigned long)my_tramp2,
51};
52
53static int simple_thread(void *arg)
54{
55 static int t;
56 int ret = 0;
57
58 while (!kthread_should_stop()) {
59 set_current_state(TASK_INTERRUPTIBLE);
60 schedule_timeout(2 * HZ);
61
62 if (ret)
63 continue;
64 t ^= 1;
65 ret = modify_ftrace_direct(my_ip, my_tramp, tramps[t]);
66 if (!ret)
67 my_tramp = tramps[t];
68 WARN_ON_ONCE(ret);
69 }
70
71 return 0;
72}
73
74static struct task_struct *simple_tsk;
75
76static int __init ftrace_direct_init(void)
77{
78 int ret;
79
80 ret = register_ftrace_direct(my_ip, my_tramp);
81 if (!ret)
82 simple_tsk = kthread_run(simple_thread, NULL, "event-sample-fn");
83 return ret;
84}
85
86static void __exit ftrace_direct_exit(void)
87{
88 kthread_stop(simple_tsk);
89 unregister_ftrace_direct(my_ip, my_tramp);
90}
91
92module_init(ftrace_direct_init);
93module_exit(ftrace_direct_exit);
94
95MODULE_AUTHOR("Steven Rostedt");
96MODULE_DESCRIPTION("Example use case of using modify_ftrace_direct()");
97MODULE_LICENSE("GPL");
diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c
new file mode 100644
index 000000000..3927cb880
--- /dev/null
+++ b/samples/ftrace/ftrace-direct-too.c
@@ -0,0 +1,57 @@
1// SPDX-License-Identifier: GPL-2.0-only
2#include <linux/module.h>
3
4#include <linux/mm.h> /* for handle_mm_fault() */
5#include <linux/ftrace.h>
6
7extern void my_direct_func(struct vm_area_struct *vma,
8 unsigned long address, unsigned int flags);
9
10void my_direct_func(struct vm_area_struct *vma,
11 unsigned long address, unsigned int flags)
12{
13 trace_printk("handle mm fault vma=%p address=%lx flags=%x\n",
14 vma, address, flags);
15}
16
17extern void my_tramp(void *);
18
19asm (
20" .pushsection .text, \"ax\", @progbits\n"
21" .type my_tramp, @function\n"
22" .globl my_tramp\n"
23" my_tramp:"
24" pushq %rbp\n"
25" movq %rsp, %rbp\n"
26" pushq %rdi\n"
27" pushq %rsi\n"
28" pushq %rdx\n"
29" call my_direct_func\n"
30" popq %rdx\n"
31" popq %rsi\n"
32" popq %rdi\n"
33" leave\n"
34 ASM_RET
35" .size my_tramp, .-my_tramp\n"
36" .popsection\n"
37);
38
39
40static int __init ftrace_direct_init(void)
41{
42 return register_ftrace_direct((unsigned long)handle_mm_fault,
43 (unsigned long)my_tramp);
44}
45
46static void __exit ftrace_direct_exit(void)
47{
48 unregister_ftrace_direct((unsigned long)handle_mm_fault,
49 (unsigned long)my_tramp);
50}
51
52module_init(ftrace_direct_init);
53module_exit(ftrace_direct_exit);
54
55MODULE_AUTHOR("Steven Rostedt");
56MODULE_DESCRIPTION("Another example use case of using register_ftrace_direct()");
57MODULE_LICENSE("GPL");
diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c
new file mode 100644
index 000000000..1e901bb8d
--- /dev/null
+++ b/samples/ftrace/ftrace-direct.c
@@ -0,0 +1,50 @@
1// SPDX-License-Identifier: GPL-2.0-only
2#include <linux/module.h>
3
4#include <linux/sched.h> /* for wake_up_process() */
5#include <linux/ftrace.h>
6
7extern void my_direct_func(struct task_struct *p);
8
9void my_direct_func(struct task_struct *p)
10{
11 trace_printk("waking up %s-%d\n", p->comm, p->pid);
12}
13
14extern void my_tramp(void *);
15
16asm (
17" .pushsection .text, \"ax\", @progbits\n"
18" .type my_tramp, @function\n"
19" .globl my_tramp\n"
20" my_tramp:"
21" pushq %rbp\n"
22" movq %rsp, %rbp\n"
23" pushq %rdi\n"
24" call my_direct_func\n"
25" popq %rdi\n"
26" leave\n"
27 ASM_RET
28" .size my_tramp, .-my_tramp\n"
29" .popsection\n"
30);
31
32
33static int __init ftrace_direct_init(void)
34{
35 return register_ftrace_direct((unsigned long)wake_up_process,
36 (unsigned long)my_tramp);
37}
38
39static void __exit ftrace_direct_exit(void)
40{
41 unregister_ftrace_direct((unsigned long)wake_up_process,
42 (unsigned long)my_tramp);
43}
44
45module_init(ftrace_direct_init);
46module_exit(ftrace_direct_exit);
47
48MODULE_AUTHOR("Steven Rostedt");
49MODULE_DESCRIPTION("Example use case of using register_ftrace_direct()");
50MODULE_LICENSE("GPL");
diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-trace-array.c
new file mode 100644
index 000000000..6aba02a31
--- /dev/null
+++ b/samples/ftrace/sample-trace-array.c
@@ -0,0 +1,143 @@
1// SPDX-License-Identifier: GPL-2.0-only
2#include <linux/module.h>
3#include <linux/kthread.h>
4#include <linux/trace.h>
5#include <linux/trace_events.h>
6#include <linux/timer.h>
7#include <linux/err.h>
8#include <linux/jiffies.h>
9#include <linux/workqueue.h>
10
11/*
12 * Any file that uses trace points, must include the header.
13 * But only one file, must include the header by defining
14 * CREATE_TRACE_POINTS first. This will make the C code that
15 * creates the handles for the trace points.
16 */
17#define CREATE_TRACE_POINTS
18#include "sample-trace-array.h"
19
20static struct trace_array *tr;
21static void mytimer_handler(struct timer_list *unused);
22static struct task_struct *simple_tsk;
23
24static void trace_work_fn(struct work_struct *work)
25{
26 /*
27 * Disable tracing for event "sample_event".
28 */
29 trace_array_set_clr_event(tr, "sample-subsystem", "sample_event",
30 false);
31}
32static DECLARE_WORK(trace_work, trace_work_fn);
33
34/*
35 * mytimer: Timer setup to disable tracing for event "sample_event". This
36 * timer is only for the purposes of the sample module to demonstrate access of
37 * Ftrace instances from within kernel.
38 */
39static DEFINE_TIMER(mytimer, mytimer_handler);
40
41static void mytimer_handler(struct timer_list *unused)
42{
43 schedule_work(&trace_work);
44}
45
46static void simple_thread_func(int count)
47{
48 set_current_state(TASK_INTERRUPTIBLE);
49 schedule_timeout(HZ);
50
51 /*
52 * Printing count value using trace_array_printk() - trace_printk()
53 * equivalent for the instance buffers.
54 */
55 trace_array_printk(tr, _THIS_IP_, "trace_array_printk: count=%d\n",
56 count);
57 /*
58 * Tracepoint for event "sample_event". This will print the
59 * current value of count and current jiffies.
60 */
61 trace_sample_event(count, jiffies);
62}
63
64static int simple_thread(void *arg)
65{
66 int count = 0;
67 unsigned long delay = msecs_to_jiffies(5000);
68
69 /*
70 * Enable tracing for "sample_event".
71 */
72 trace_array_set_clr_event(tr, "sample-subsystem", "sample_event", true);
73
74 /*
75	 * Arm mytimer: it fires after "delay" jiffies (5 seconds here) and
76	 * its handler schedules the work item that disables tracing for
77	 * "sample_event".
78	 */
79	/* mod_timer() also starts the timer if it is not already pending */
80	mod_timer(&mytimer, jiffies + delay);
81
82 while (!kthread_should_stop())
83 simple_thread_func(count++);
84
85 del_timer(&mytimer);
86 cancel_work_sync(&trace_work);
87
88 /*
89 * trace_array_put() decrements the reference counter associated with
90 * the trace array - "tr". We are done using the trace array, hence
91 * decrement the reference counter so that it can be destroyed using
92 * trace_array_destroy().
93 */
94 trace_array_put(tr);
95
96 return 0;
97}
98
99static int __init sample_trace_array_init(void)
100{
101 /*
102 * Return a pointer to the trace array with name "sample-instance" if it
103 * exists, else create a new trace array.
104 *
105 * NOTE: This function increments the reference counter
106 * associated with the trace array - "tr".
107 */
108 tr = trace_array_get_by_name("sample-instance");
109
110 if (!tr)
111 return -1;
112 /*
113	 * Allocate the trace_printk() per-cpu buffers if they haven't already been allocated.
114 */
115 trace_printk_init_buffers();
116
117 simple_tsk = kthread_run(simple_thread, NULL, "sample-instance");
118 if (IS_ERR(simple_tsk)) {
119 trace_array_put(tr);
120 trace_array_destroy(tr);
121 return -1;
122 }
123
124 return 0;
125}
126
127static void __exit sample_trace_array_exit(void)
128{
129 kthread_stop(simple_tsk);
130
131 /*
132 * We are unloading our module and no longer require the trace array.
133 * Remove/destroy "tr" using trace_array_destroy()
134 */
135 trace_array_destroy(tr);
136}
137
138module_init(sample_trace_array_init);
139module_exit(sample_trace_array_exit);
140
141MODULE_AUTHOR("Divya Indi");
142MODULE_DESCRIPTION("Sample module for kernel access to Ftrace instances");
143MODULE_LICENSE("GPL");
diff --git a/samples/ftrace/sample-trace-array.h b/samples/ftrace/sample-trace-array.h
new file mode 100644
index 000000000..6f8962428
--- /dev/null
+++ b/samples/ftrace/sample-trace-array.h
@@ -0,0 +1,84 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2
3/*
4 * If TRACE_SYSTEM is defined, that will be the directory created
5 * in the ftrace directory under /sys/kernel/tracing/events/<system>
6 *
7 * The define_trace.h below will also look for a file name of
8 * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here.
9 * In this case, it would look for sample-subsystem.h
10 *
11 * If the header name will be different than the system name
12 * (as in this case), then you can override the header name that
13 * define_trace.h will look up by defining TRACE_INCLUDE_FILE
14 *
15 * This file is called sample-trace-array.h but we want the system
16 * to be called "sample-subsystem". Therefore we must define the name of this
17 * file:
18 *
19 * #define TRACE_INCLUDE_FILE sample-trace-array
20 *
21 * As we do in the bottom of this file.
22 *
23 * Notice that TRACE_SYSTEM should be defined outside of #if
24 * protection, just like TRACE_INCLUDE_FILE.
25 */
26#undef TRACE_SYSTEM
27#define TRACE_SYSTEM sample-subsystem
28
29/*
30 * TRACE_SYSTEM is expected to be a valid C variable name (alpha-numeric
31 * characters and underscores), except that it may start with a digit. If
32 * it is not (here it contains dashes), you need to add the following lines:
33 */
34#undef TRACE_SYSTEM_VAR
35#define TRACE_SYSTEM_VAR sample_subsystem
36
37/*
38 * But the above is only needed if TRACE_SYSTEM is not alpha-numeric
39 * and underscored. By default, TRACE_SYSTEM_VAR will be equal to
40 * TRACE_SYSTEM. As TRACE_SYSTEM_VAR must be alpha-numeric, if
41 * TRACE_SYSTEM is not, then TRACE_SYSTEM_VAR must be defined with
42 * only alpha-numeric and underscores.
43 *
44 * The TRACE_SYSTEM_VAR is only used internally and not visible to
45 * user space.
46 */
47
48/*
49 * Notice that this file is not protected like a normal header.
50 * We also must allow for rereading of this file. The
51 *
52 * || defined(TRACE_HEADER_MULTI_READ)
53 *
54 * serves this purpose.
55 */
56#if !defined(_SAMPLE_TRACE_ARRAY_H) || defined(TRACE_HEADER_MULTI_READ)
57#define _SAMPLE_TRACE_ARRAY_H
58
59#include <linux/tracepoint.h>
60TRACE_EVENT(sample_event,
61
62 TP_PROTO(int count, unsigned long time),
63
64 TP_ARGS(count, time),
65
66 TP_STRUCT__entry(
67 __field(int, count)
68 __field(unsigned long, time)
69 ),
70
71 TP_fast_assign(
72 __entry->count = count;
73 __entry->time = time;
74 ),
75
76 TP_printk("count value=%d at jiffies=%lu", __entry->count,
77 __entry->time)
78 );
79#endif
80
81#undef TRACE_INCLUDE_PATH
82#define TRACE_INCLUDE_PATH .
83#define TRACE_INCLUDE_FILE sample-trace-array
84#include <trace/define_trace.h>
diff --git a/samples/hck/Makefile b/samples/hck/Makefile
new file mode 100644
index 000000000..1f24a99a4
--- /dev/null
+++ b/samples/hck/Makefile
@@ -0,0 +1,6 @@
1# SPDX-License-Identifier: GPL-2.0-only
2ccflags-y += -I$(src)
3
4obj-$(CONFIG_SAMPLE_HCK_CALL) += call.o
5obj-$(CONFIG_SAMPLE_HCK_REGISTER) += register.o
6obj-$(CONFIG_SAMPLE_HCK_REGISTER_ONE) += register_one.o \ No newline at end of file
diff --git a/samples/hck/call.c b/samples/hck/call.c
new file mode 100644
index 000000000..870d5611c
--- /dev/null
+++ b/samples/hck/call.c
@@ -0,0 +1,24 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Sample Call HCK
4 *
5 */
6#include <linux/module.h>
7#include <linux/init.h>
8#include <linux/hck/lite_hck_sample.h>
9
10static int __init samplecallhck_init(void)
11{
12 int val = 0;
13
14 pr_info("hck sample: call\n");
15
16 CALL_HCK_LITE_HOOK(get_boot_config_lhck, &val);
17 pr_info("hck sample val changed: %d\n", val);
18
19 CALL_HCK_LITE_HOOK(set_boot_stat_lhck, val);
20 pr_info("hck sample val not changed: %d\n", val);
21
22 return 0;
23}
24late_initcall(samplecallhck_init); \ No newline at end of file
diff --git a/samples/hck/register.c b/samples/hck/register.c
new file mode 100644
index 000000000..407d05f74
--- /dev/null
+++ b/samples/hck/register.c
@@ -0,0 +1,48 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Sample HCK
4 *
5 */
6#include <linux/module.h>
7#include <linux/init.h>
8#include <linux/slab.h>
9#include <linux/hck/lite_hck_sample.h>
10
11static struct sample_hck_data data = {
12 .stat = 999,
13 .name = "sample tesst",
14};
15
16void get_boot_config(int *info)
17{
18 pr_info("hck sample: %s\n", __func__);
19 *info = 1;
20}
21
22void set_boot_stat(void *data, int info)
23{
24	struct sample_hck_data *hdata = data;
25
26	pr_info("hck sample: %s\n", __func__);
27	pr_info("hck data: stat = %d, name = %s\n",
28		hdata->stat, hdata->name);
29}
30
31static int __init samplehck_init(void)
32{
33 pr_info("hck sample register\n");
34
35 REGISTER_HCK_LITE_HOOK(get_boot_config_lhck, get_boot_config);
36 REGISTER_HCK_LITE_DATA_HOOK(set_boot_stat_lhck, set_boot_stat, &data);
37
38 return 0;
39}
40
41static void __exit samplehck_exit(void)
42{
43}
44
45module_init(samplehck_init);
46module_exit(samplehck_exit);
47MODULE_LICENSE("GPL v2");
48MODULE_AUTHOR("zhujiaxin <zhujiaxin@huawei.com>");
diff --git a/samples/hck/register_one.c b/samples/hck/register_one.c
new file mode 100644
index 000000000..9ea2c0250
--- /dev/null
+++ b/samples/hck/register_one.c
@@ -0,0 +1,31 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Sample HCK
4 *
5 */
6#include <linux/module.h>
7#include <linux/init.h>
8#include <linux/hck/lite_hck_sample.h>
9
10void get_boot_power_config(int *info)
11{
12 pr_info("hck sample: intf-2 run\n");
13 *info = 2;
14}
15
16static int __init samplehckone_init(void)
17{
18 pr_info("hck sample register_one\n");
19 REGISTER_HCK_LITE_HOOK(get_boot_config_lhck, get_boot_power_config);
20
21 return 0;
22}
23
24static void __exit samplehckone_exit(void)
25{
26}
27
28module_init(samplehckone_init);
29module_exit(samplehckone_exit);
30MODULE_LICENSE("GPL v2");
31MODULE_AUTHOR("zhujiaxin <zhujiaxin@huawei.com>");
diff --git a/samples/hidraw/.gitignore b/samples/hidraw/.gitignore
new file mode 100644
index 000000000..d7a6074eb
--- /dev/null
+++ b/samples/hidraw/.gitignore
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0-only
2hid-example
diff --git a/samples/hidraw/Makefile b/samples/hidraw/Makefile
new file mode 100644
index 000000000..594d989e5
--- /dev/null
+++ b/samples/hidraw/Makefile
@@ -0,0 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0
2userprogs-always-y += hid-example
3
4userccflags += -I usr/include
diff --git a/samples/hidraw/hid-example.c b/samples/hidraw/hid-example.c
new file mode 100644
index 000000000..37a0ffcb4
--- /dev/null
+++ b/samples/hidraw/hid-example.c
@@ -0,0 +1,182 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Hidraw Userspace Example
4 *
5 * Copyright (c) 2010 Alan Ott <alan@signal11.us>
6 * Copyright (c) 2010 Signal 11 Software
7 *
8 * The code may be used by anyone for any purpose,
9 * and can serve as a starting point for developing
10 * applications using hidraw.
11 */
12
13/* Linux */
14#include <linux/types.h>
15#include <linux/input.h>
16#include <linux/hidraw.h>
17
18/*
19 * Ugly hack to work around failing compilation on systems that don't
20 * yet populate new version of hidraw.h to userspace.
21 */
22#ifndef HIDIOCSFEATURE
23#warning Please have your distro update the userspace kernel headers
24#define HIDIOCSFEATURE(len) _IOC(_IOC_WRITE|_IOC_READ, 'H', 0x06, len)
25#define HIDIOCGFEATURE(len) _IOC(_IOC_WRITE|_IOC_READ, 'H', 0x07, len)
26#endif
27
28/* Unix */
29#include <sys/ioctl.h>
30#include <sys/types.h>
31#include <sys/stat.h>
32#include <fcntl.h>
33#include <unistd.h>
34
35/* C */
36#include <stdio.h>
37#include <string.h>
38#include <stdlib.h>
39#include <errno.h>
40
41const char *bus_str(int bus);
42
43int main(int argc, char **argv)
44{
45 int fd;
46 int i, res, desc_size = 0;
47 char buf[256];
48 struct hidraw_report_descriptor rpt_desc;
49 struct hidraw_devinfo info;
50 char *device = "/dev/hidraw0";
51
52 if (argc > 1)
53 device = argv[1];
54
55 /* Open the Device with non-blocking reads. In real life,
56 don't use a hard coded path; use libudev instead. */
57 fd = open(device, O_RDWR|O_NONBLOCK);
58
59 if (fd < 0) {
60 perror("Unable to open device");
61 return 1;
62 }
63
64 memset(&rpt_desc, 0x0, sizeof(rpt_desc));
65 memset(&info, 0x0, sizeof(info));
66 memset(buf, 0x0, sizeof(buf));
67
68 /* Get Report Descriptor Size */
69 res = ioctl(fd, HIDIOCGRDESCSIZE, &desc_size);
70 if (res < 0)
71 perror("HIDIOCGRDESCSIZE");
72 else
73 printf("Report Descriptor Size: %d\n", desc_size);
74
75 /* Get Report Descriptor */
76 rpt_desc.size = desc_size;
77 res = ioctl(fd, HIDIOCGRDESC, &rpt_desc);
78 if (res < 0) {
79 perror("HIDIOCGRDESC");
80 } else {
81 printf("Report Descriptor:\n");
82 for (i = 0; i < rpt_desc.size; i++)
83 printf("%hhx ", rpt_desc.value[i]);
84 puts("\n");
85 }
86
87 /* Get Raw Name */
88 res = ioctl(fd, HIDIOCGRAWNAME(256), buf);
89 if (res < 0)
90 perror("HIDIOCGRAWNAME");
91 else
92 printf("Raw Name: %s\n", buf);
93
94 /* Get Physical Location */
95 res = ioctl(fd, HIDIOCGRAWPHYS(256), buf);
96 if (res < 0)
97 perror("HIDIOCGRAWPHYS");
98 else
99 printf("Raw Phys: %s\n", buf);
100
101 /* Get Raw Info */
102 res = ioctl(fd, HIDIOCGRAWINFO, &info);
103 if (res < 0) {
104 perror("HIDIOCGRAWINFO");
105 } else {
106 printf("Raw Info:\n");
107 printf("\tbustype: %d (%s)\n",
108 info.bustype, bus_str(info.bustype));
109 printf("\tvendor: 0x%04hx\n", info.vendor);
110 printf("\tproduct: 0x%04hx\n", info.product);
111 }
112
113 /* Set Feature */
114 buf[0] = 0x9; /* Report Number */
115 buf[1] = 0xff;
116 buf[2] = 0xff;
117 buf[3] = 0xff;
118 res = ioctl(fd, HIDIOCSFEATURE(4), buf);
119 if (res < 0)
120 perror("HIDIOCSFEATURE");
121 else
122 printf("ioctl HIDIOCSFEATURE returned: %d\n", res);
123
124 /* Get Feature */
125 buf[0] = 0x9; /* Report Number */
126 res = ioctl(fd, HIDIOCGFEATURE(256), buf);
127 if (res < 0) {
128 perror("HIDIOCGFEATURE");
129 } else {
130 printf("ioctl HIDIOCGFEATURE returned: %d\n", res);
131 printf("Report data (not containing the report number):\n\t");
132 for (i = 0; i < res; i++)
133 printf("%hhx ", buf[i]);
134 puts("\n");
135 }
136
137 /* Send a Report to the Device */
138 buf[0] = 0x1; /* Report Number */
139 buf[1] = 0x77;
140 res = write(fd, buf, 2);
141 if (res < 0) {
142 printf("Error: %d\n", errno);
143 perror("write");
144 } else {
145 printf("write() wrote %d bytes\n", res);
146 }
147
148 /* Get a report from the device */
149 res = read(fd, buf, 16);
150 if (res < 0) {
151 perror("read");
152 } else {
153 printf("read() read %d bytes:\n\t", res);
154 for (i = 0; i < res; i++)
155 printf("%hhx ", buf[i]);
156 puts("\n");
157 }
158 close(fd);
159 return 0;
160}
161
162const char *
163bus_str(int bus)
164{
165 switch (bus) {
166 case BUS_USB:
167 return "USB";
168 break;
169 case BUS_HIL:
170 return "HIL";
171 break;
172 case BUS_BLUETOOTH:
173 return "Bluetooth";
174 break;
175 case BUS_VIRTUAL:
176 return "Virtual";
177 break;
178 default:
179 return "Other";
180 break;
181 }
182}
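The comment in main() recommends libudev over the hard-coded /dev/hidraw0. A self-contained enumeration sketch (assumes the libudev development headers; build with -ludev):

#include <stdio.h>
#include <libudev.h>

/* Print the device node of every hidraw device currently present. */
int main(void)
{
	struct udev *udev = udev_new();
	struct udev_enumerate *en;
	struct udev_list_entry *list, *entry;

	if (!udev)
		return 1;
	en = udev_enumerate_new(udev);
	udev_enumerate_add_match_subsystem(en, "hidraw");
	udev_enumerate_scan_devices(en);
	list = udev_enumerate_get_list_entry(en);
	udev_list_entry_foreach(entry, list) {
		const char *syspath = udev_list_entry_get_name(entry);
		struct udev_device *dev =
			udev_device_new_from_syspath(udev, syspath);

		if (dev) {
			const char *node = udev_device_get_devnode(dev);

			if (node)
				printf("%s\n", node);
			udev_device_unref(dev);
		}
	}
	udev_enumerate_unref(en);
	udev_unref(udev);
	return 0;
}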
diff --git a/samples/hw_breakpoint/Makefile b/samples/hw_breakpoint/Makefile
new file mode 100644
index 000000000..ef4b6fdd7
--- /dev/null
+++ b/samples/hw_breakpoint/Makefile
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0-only
2obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
new file mode 100644
index 000000000..418c46fe5
--- /dev/null
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -0,0 +1,82 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address
4 *
5 * usage: insmod data_breakpoint.ko ksym=<ksym_name>
6 *
7 * This file is a kernel module that places a breakpoint over ksym_name kernel
8 * variable using Hardware Breakpoint register. The corresponding handler which
9 * prints a backtrace is invoked every time a write operation is performed on
10 * that variable.
11 *
12 * Copyright (C) IBM Corporation, 2009
13 *
14 * Author: K.Prasad <prasad@linux.vnet.ibm.com>
15 */
16#include <linux/module.h> /* Needed by all modules */
17#include <linux/kernel.h> /* Needed for KERN_INFO */
18#include <linux/init.h> /* Needed for the macros */
19#include <linux/kallsyms.h>
20
21#include <linux/perf_event.h>
22#include <linux/hw_breakpoint.h>
23
24struct perf_event * __percpu *sample_hbp;
25
26static char ksym_name[KSYM_NAME_LEN] = "jiffies";
27module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO);
28MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any"
29 " write operations on the kernel symbol");
30
31static void sample_hbp_handler(struct perf_event *bp,
32 struct perf_sample_data *data,
33 struct pt_regs *regs)
34{
35 printk(KERN_INFO "%s value is changed\n", ksym_name);
36 dump_stack();
37 printk(KERN_INFO "Dump stack from sample_hbp_handler\n");
38}
39
40static int __init hw_break_module_init(void)
41{
42 int ret;
43 struct perf_event_attr attr;
44 void *addr = __symbol_get(ksym_name);
45
46 if (!addr)
47 return -ENXIO;
48
49 hw_breakpoint_init(&attr);
50 attr.bp_addr = (unsigned long)addr;
51 attr.bp_len = HW_BREAKPOINT_LEN_4;
52 attr.bp_type = HW_BREAKPOINT_W;
53
54 sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL);
55 if (IS_ERR((void __force *)sample_hbp)) {
56 ret = PTR_ERR((void __force *)sample_hbp);
57 goto fail;
58 }
59
60 printk(KERN_INFO "HW Breakpoint for %s write installed\n", ksym_name);
61
62 return 0;
63
64fail:
65 printk(KERN_INFO "Breakpoint registration failed\n");
66
67 return ret;
68}
69
70static void __exit hw_break_module_exit(void)
71{
72 unregister_wide_hw_breakpoint(sample_hbp);
73 symbol_put(ksym_name);
74 printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name);
75}
76
77module_init(hw_break_module_init);
78module_exit(hw_break_module_exit);
79
80MODULE_LICENSE("GPL");
81MODULE_AUTHOR("K.Prasad");
82MODULE_DESCRIPTION("ksym breakpoint");
diff --git a/samples/kdb/Makefile b/samples/kdb/Makefile
new file mode 100644
index 000000000..947cb8522
--- /dev/null
+++ b/samples/kdb/Makefile
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0-only
2obj-$(CONFIG_SAMPLE_KDB) += kdb_hello.o
diff --git a/samples/kdb/kdb_hello.c b/samples/kdb/kdb_hello.c
new file mode 100644
index 000000000..c1c2fa0f6
--- /dev/null
+++ b/samples/kdb/kdb_hello.c
@@ -0,0 +1,60 @@
1/*
2 * Created by: Jason Wessel <jason.wessel@windriver.com>
3 *
4 * Copyright (c) 2010 Wind River Systems, Inc. All Rights Reserved.
5 *
6 * This file is licensed under the terms of the GNU General Public
7 * License version 2. This program is licensed "as is" without any
8 * warranty of any kind, whether express or implied.
9 */
10
11#include <linux/module.h>
12#include <linux/kdb.h>
13
14/*
15 * All kdb shell command call backs receive argc and argv, where
16 * argv[0] is the command the end user typed
17 */
18static int kdb_hello_cmd(int argc, const char **argv)
19{
20 if (argc > 1)
21 return KDB_ARGCOUNT;
22
23 if (argc)
24 kdb_printf("Hello %s.\n", argv[1]);
25 else
26 kdb_printf("Hello world!\n");
27
28 return 0;
29}
30
31
32static int __init kdb_hello_cmd_init(void)
33{
34 /*
35 * Registration of a dynamically added kdb command is done with
36 * kdb_register() with the arguments being:
37 * 1: The name of the shell command
38 * 2: The function that processes the command
39 * 3: Description of the usage of any arguments
40 * 4: Descriptive text when you run help
41 * 5: Number of characters to complete the command
42 * 0 == type the whole command
43 * 1 == match both "g" and "go" for example
44 */
45 kdb_register("hello", kdb_hello_cmd, "[string]",
46 "Say Hello World or Hello [string]", 0);
47 return 0;
48}
49
50static void __exit kdb_hello_cmd_exit(void)
51{
52 kdb_unregister("hello");
53}
54
55module_init(kdb_hello_cmd_init);
56module_exit(kdb_hello_cmd_exit);
57
58MODULE_AUTHOR("WindRiver");
59MODULE_DESCRIPTION("KDB example to add a hello command");
60MODULE_LICENSE("GPL");
diff --git a/samples/kfifo/Makefile b/samples/kfifo/Makefile
new file mode 100644
index 000000000..0af5250ad
--- /dev/null
+++ b/samples/kfifo/Makefile
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0-only
2obj-$(CONFIG_SAMPLE_KFIFO) += bytestream-example.o dma-example.o inttype-example.o record-example.o
diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c
new file mode 100644
index 000000000..5a90aa527
--- /dev/null
+++ b/samples/kfifo/bytestream-example.c
@@ -0,0 +1,195 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Sample kfifo byte stream implementation
4 *
5 * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net>
6 */
7
8#include <linux/init.h>
9#include <linux/module.h>
10#include <linux/proc_fs.h>
11#include <linux/mutex.h>
12#include <linux/kfifo.h>
13
14/*
15 * This module shows how to create a byte stream fifo.
16 */
17
18/* fifo size in elements (bytes) */
19#define FIFO_SIZE 32
20
21/* name of the proc entry */
22#define PROC_FIFO "bytestream-fifo"
23
24/* lock for procfs read access */
25static DEFINE_MUTEX(read_lock);
26
27/* lock for procfs write access */
28static DEFINE_MUTEX(write_lock);
29
30/*
31 * define DYNAMIC in this example for a dynamically allocated fifo.
32 *
33 * Otherwise the fifo storage will be a part of the fifo structure.
34 */
35#if 0
36#define DYNAMIC
37#endif
38
39#ifdef DYNAMIC
40static struct kfifo test;
41#else
42static DECLARE_KFIFO(test, unsigned char, FIFO_SIZE);
43#endif
44
45static const unsigned char expected_result[FIFO_SIZE] = {
46 3, 4, 5, 6, 7, 8, 9, 0,
47 1, 20, 21, 22, 23, 24, 25, 26,
48 27, 28, 29, 30, 31, 32, 33, 34,
49 35, 36, 37, 38, 39, 40, 41, 42,
50};
51
52static int __init testfunc(void)
53{
54 unsigned char buf[6];
55 unsigned char i, j;
56 unsigned int ret;
57
58 printk(KERN_INFO "byte stream fifo test start\n");
59
60 /* put string into the fifo */
61 kfifo_in(&test, "hello", 5);
62
63 /* put values into the fifo */
64 for (i = 0; i != 10; i++)
65 kfifo_put(&test, i);
66
67 /* show the number of used elements */
68 printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test));
69
70 /* get max of 5 bytes from the fifo */
71 i = kfifo_out(&test, buf, 5);
72 printk(KERN_INFO "buf: %.*s\n", i, buf);
73
74 /* get max of 2 elements from the fifo */
75 ret = kfifo_out(&test, buf, 2);
76 printk(KERN_INFO "ret: %d\n", ret);
77 /* and put it back to the end of the fifo */
78 ret = kfifo_in(&test, buf, ret);
79 printk(KERN_INFO "ret: %d\n", ret);
80
81 /* skip first element of the fifo */
82 printk(KERN_INFO "skip 1st element\n");
83 kfifo_skip(&test);
84
85	/* put values into the fifo until it is full */
86 for (i = 20; kfifo_put(&test, i); i++)
87 ;
88
89 printk(KERN_INFO "queue len: %u\n", kfifo_len(&test));
90
91 /* show the first value without removing from the fifo */
92 if (kfifo_peek(&test, &i))
93 printk(KERN_INFO "%d\n", i);
94
95 /* check the correctness of all values in the fifo */
96 j = 0;
97 while (kfifo_get(&test, &i)) {
98 printk(KERN_INFO "item = %d\n", i);
99 if (i != expected_result[j++]) {
100 printk(KERN_WARNING "value mismatch: test failed\n");
101 return -EIO;
102 }
103 }
104 if (j != ARRAY_SIZE(expected_result)) {
105 printk(KERN_WARNING "size mismatch: test failed\n");
106 return -EIO;
107 }
108 printk(KERN_INFO "test passed\n");
109
110 return 0;
111}
112
113static ssize_t fifo_write(struct file *file, const char __user *buf,
114 size_t count, loff_t *ppos)
115{
116 int ret;
117 unsigned int copied;
118
119 if (mutex_lock_interruptible(&write_lock))
120 return -ERESTARTSYS;
121
122 ret = kfifo_from_user(&test, buf, count, &copied);
123
124 mutex_unlock(&write_lock);
125 if (ret)
126 return ret;
127
128 return copied;
129}
130
131static ssize_t fifo_read(struct file *file, char __user *buf,
132 size_t count, loff_t *ppos)
133{
134 int ret;
135 unsigned int copied;
136
137 if (mutex_lock_interruptible(&read_lock))
138 return -ERESTARTSYS;
139
140 ret = kfifo_to_user(&test, buf, count, &copied);
141
142 mutex_unlock(&read_lock);
143 if (ret)
144 return ret;
145
146 return copied;
147}
148
149static const struct proc_ops fifo_proc_ops = {
150 .proc_read = fifo_read,
151 .proc_write = fifo_write,
152 .proc_lseek = noop_llseek,
153};
154
155static int __init example_init(void)
156{
157#ifdef DYNAMIC
158 int ret;
159
160 ret = kfifo_alloc(&test, FIFO_SIZE, GFP_KERNEL);
161 if (ret) {
162 printk(KERN_ERR "error kfifo_alloc\n");
163 return ret;
164 }
165#else
166 INIT_KFIFO(test);
167#endif
168 if (testfunc() < 0) {
169#ifdef DYNAMIC
170 kfifo_free(&test);
171#endif
172 return -EIO;
173 }
174
175 if (proc_create(PROC_FIFO, 0, NULL, &fifo_proc_ops) == NULL) {
176#ifdef DYNAMIC
177 kfifo_free(&test);
178#endif
179 return -ENOMEM;
180 }
181 return 0;
182}
183
184static void __exit example_exit(void)
185{
186 remove_proc_entry(PROC_FIFO, NULL);
187#ifdef DYNAMIC
188 kfifo_free(&test);
189#endif
190}
191
192module_init(example_init);
193module_exit(example_exit);
194MODULE_LICENSE("GPL");
195MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>");
diff --git a/samples/kfifo/dma-example.c b/samples/kfifo/dma-example.c
new file mode 100644
index 000000000..0cf27483c
--- /dev/null
+++ b/samples/kfifo/dma-example.c
@@ -0,0 +1,141 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Sample fifo dma implementation
4 *
5 * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net>
6 */
7
8#include <linux/init.h>
9#include <linux/module.h>
10#include <linux/kfifo.h>
11
12/*
13 * This module shows how to handle fifo dma operations.
14 */
15
16/* fifo size in elements (bytes) */
17#define FIFO_SIZE 32
18
19static struct kfifo fifo;
20
21static int __init example_init(void)
22{
23 int i;
24 unsigned int ret;
25 unsigned int nents;
26 struct scatterlist sg[10];
27
28 printk(KERN_INFO "DMA fifo test start\n");
29
30 if (kfifo_alloc(&fifo, FIFO_SIZE, GFP_KERNEL)) {
31 printk(KERN_WARNING "error kfifo_alloc\n");
32 return -ENOMEM;
33 }
34
35 printk(KERN_INFO "queue size: %u\n", kfifo_size(&fifo));
36
37 kfifo_in(&fifo, "test", 4);
38
39 for (i = 0; i != 9; i++)
40 kfifo_put(&fifo, i);
41
42 /* kick away first byte */
43 kfifo_skip(&fifo);
44
45 printk(KERN_INFO "queue len: %u\n", kfifo_len(&fifo));
46
47 /*
48 * Configure the kfifo buffer to receive data from DMA input.
49 *
50 * .--------------------------------------.
51 * | 0 | 1 | 2 | ... | 12 | 13 | ... | 31 |
52 * |---|------------------|---------------|
53 * \_/ \________________/ \_____________/
54 * \ \ \
55 * \ \_allocated data \
56 * \_*free space* \_*free space*
57 *
58 * We need two different SG entries: one for the free space area at the
59 * end of the kfifo buffer (19 bytes) and another for the first free
60 * byte at the beginning, after the kfifo_skip().
61 */
62 sg_init_table(sg, ARRAY_SIZE(sg));
63 nents = kfifo_dma_in_prepare(&fifo, sg, ARRAY_SIZE(sg), FIFO_SIZE);
64 printk(KERN_INFO "DMA sgl entries: %d\n", nents);
65 if (!nents) {
66 /* fifo is full and no sgl was created */
67 printk(KERN_WARNING "error kfifo_dma_in_prepare\n");
68 return -EIO;
69 }
70
71 /* receive data */
72 printk(KERN_INFO "scatterlist for receive:\n");
73 for (i = 0; i < nents; i++) {
74 printk(KERN_INFO
75 "sg[%d] -> "
76 "page %p offset 0x%.8x length 0x%.8x\n",
77 i, sg_page(&sg[i]), sg[i].offset, sg[i].length);
78
79 if (sg_is_last(&sg[i]))
80 break;
81 }
82
83	/* put your code to set up and execute the DMA operation here */
84 /* ... */
85
86 /* example: zero bytes received */
87 ret = 0;
88
89 /* finish the dma operation and update the received data */
90 kfifo_dma_in_finish(&fifo, ret);
91
92 /* Prepare to transmit data, example: 8 bytes */
93 nents = kfifo_dma_out_prepare(&fifo, sg, ARRAY_SIZE(sg), 8);
94 printk(KERN_INFO "DMA sgl entries: %d\n", nents);
95 if (!nents) {
96 /* no data was available and no sgl was created */
97 printk(KERN_WARNING "error kfifo_dma_out_prepare\n");
98 return -EIO;
99 }
100
101 printk(KERN_INFO "scatterlist for transmit:\n");
102 for (i = 0; i < nents; i++) {
103 printk(KERN_INFO
104 "sg[%d] -> "
105 "page %p offset 0x%.8x length 0x%.8x\n",
106 i, sg_page(&sg[i]), sg[i].offset, sg[i].length);
107
108 if (sg_is_last(&sg[i]))
109 break;
110 }
111
112	/* put your code to set up and execute the DMA operation here */
113 /* ... */
114
115 /* example: 5 bytes transmitted */
116 ret = 5;
117
118 /* finish the dma operation and update the transmitted data */
119 kfifo_dma_out_finish(&fifo, ret);
120
121 ret = kfifo_len(&fifo);
122 printk(KERN_INFO "queue len: %u\n", kfifo_len(&fifo));
123
124 if (ret != 7) {
125 printk(KERN_WARNING "size mismatch: test failed");
126 return -EIO;
127 }
128 printk(KERN_INFO "test passed\n");
129
130 return 0;
131}
132
133static void __exit example_exit(void)
134{
135 kfifo_free(&fifo);
136}
137
138module_init(example_init);
139module_exit(example_exit);
140MODULE_LICENSE("GPL");
141MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>");
diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c
new file mode 100644
index 000000000..e5403d8c9
--- /dev/null
+++ b/samples/kfifo/inttype-example.c
@@ -0,0 +1,186 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Sample kfifo int type implementation
4 *
5 * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net>
6 */
7
8#include <linux/init.h>
9#include <linux/module.h>
10#include <linux/proc_fs.h>
11#include <linux/mutex.h>
12#include <linux/kfifo.h>
13
14/*
15 * This module shows how to create a int type fifo.
16 */
17
18/* fifo size in elements (ints) */
19#define FIFO_SIZE 32
20
21/* name of the proc entry */
22#define PROC_FIFO "int-fifo"
23
24/* lock for procfs read access */
25static DEFINE_MUTEX(read_lock);
26
27/* lock for procfs write access */
28static DEFINE_MUTEX(write_lock);
29
30/*
31 * define DYNAMIC in this example for a dynamically allocated fifo.
32 *
33 * Otherwise the fifo storage will be a part of the fifo structure.
34 */
35#if 0
36#define DYNAMIC
37#endif
38
39#ifdef DYNAMIC
40static DECLARE_KFIFO_PTR(test, int);
41#else
42static DEFINE_KFIFO(test, int, FIFO_SIZE);
43#endif
44
45static const int expected_result[FIFO_SIZE] = {
46 3, 4, 5, 6, 7, 8, 9, 0,
47 1, 20, 21, 22, 23, 24, 25, 26,
48 27, 28, 29, 30, 31, 32, 33, 34,
49 35, 36, 37, 38, 39, 40, 41, 42,
50};
51
52static int __init testfunc(void)
53{
54 int buf[6];
55 int i, j;
56 unsigned int ret;
57
58 printk(KERN_INFO "int fifo test start\n");
59
60 /* put values into the fifo */
61 for (i = 0; i != 10; i++)
62 kfifo_put(&test, i);
63
64 /* show the number of used elements */
65 printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test));
66
67 /* get max of 2 elements from the fifo */
68 ret = kfifo_out(&test, buf, 2);
69 printk(KERN_INFO "ret: %d\n", ret);
70 /* and put it back to the end of the fifo */
71 ret = kfifo_in(&test, buf, ret);
72 printk(KERN_INFO "ret: %d\n", ret);
73
74 /* skip first element of the fifo */
75 printk(KERN_INFO "skip 1st element\n");
76 kfifo_skip(&test);
77
78	/* put values into the fifo until it is full */
79 for (i = 20; kfifo_put(&test, i); i++)
80 ;
81
82 printk(KERN_INFO "queue len: %u\n", kfifo_len(&test));
83
84 /* show the first value without removing from the fifo */
85 if (kfifo_peek(&test, &i))
86 printk(KERN_INFO "%d\n", i);
87
88 /* check the correctness of all values in the fifo */
89 j = 0;
90 while (kfifo_get(&test, &i)) {
91 printk(KERN_INFO "item = %d\n", i);
92 if (i != expected_result[j++]) {
93 printk(KERN_WARNING "value mismatch: test failed\n");
94 return -EIO;
95 }
96 }
97 if (j != ARRAY_SIZE(expected_result)) {
98 printk(KERN_WARNING "size mismatch: test failed\n");
99 return -EIO;
100 }
101 printk(KERN_INFO "test passed\n");
102
103 return 0;
104}
105
106static ssize_t fifo_write(struct file *file, const char __user *buf,
107 size_t count, loff_t *ppos)
108{
109 int ret;
110 unsigned int copied;
111
112 if (mutex_lock_interruptible(&write_lock))
113 return -ERESTARTSYS;
114
115 ret = kfifo_from_user(&test, buf, count, &copied);
116
117 mutex_unlock(&write_lock);
118 if (ret)
119 return ret;
120
121 return copied;
122}
123
124static ssize_t fifo_read(struct file *file, char __user *buf,
125 size_t count, loff_t *ppos)
126{
127 int ret;
128 unsigned int copied;
129
130 if (mutex_lock_interruptible(&read_lock))
131 return -ERESTARTSYS;
132
133 ret = kfifo_to_user(&test, buf, count, &copied);
134
135 mutex_unlock(&read_lock);
136 if (ret)
137 return ret;
138
139 return copied;
140}
141
142static const struct proc_ops fifo_proc_ops = {
143 .proc_read = fifo_read,
144 .proc_write = fifo_write,
145 .proc_lseek = noop_llseek,
146};
147
148static int __init example_init(void)
149{
150#ifdef DYNAMIC
151 int ret;
152
153 ret = kfifo_alloc(&test, FIFO_SIZE, GFP_KERNEL);
154 if (ret) {
155 printk(KERN_ERR "error kfifo_alloc\n");
156 return ret;
157 }
158#endif
159 if (testfunc() < 0) {
160#ifdef DYNAMIC
161 kfifo_free(&test);
162#endif
163 return -EIO;
164 }
165
166 if (proc_create(PROC_FIFO, 0, NULL, &fifo_proc_ops) == NULL) {
167#ifdef DYNAMIC
168 kfifo_free(&test);
169#endif
170 return -ENOMEM;
171 }
172 return 0;
173}
174
175static void __exit example_exit(void)
176{
177 remove_proc_entry(PROC_FIFO, NULL);
178#ifdef DYNAMIC
179 kfifo_free(&test);
180#endif
181}
182
183module_init(example_init);
184module_exit(example_exit);
185MODULE_LICENSE("GPL");
186MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>");
diff --git a/samples/kfifo/record-example.c b/samples/kfifo/record-example.c
new file mode 100644
index 000000000..f64f3d62d
--- /dev/null
+++ b/samples/kfifo/record-example.c
@@ -0,0 +1,202 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Sample dynamic sized record fifo implementation
4 *
5 * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net>
6 */
7
8#include <linux/init.h>
9#include <linux/module.h>
10#include <linux/proc_fs.h>
11#include <linux/mutex.h>
12#include <linux/kfifo.h>
13
14/*
15 * This module shows how to create a variable sized record fifo.
16 */
17
18/* fifo size in elements (bytes) */
19#define FIFO_SIZE 128
20
21/* name of the proc entry */
22#define PROC_FIFO "record-fifo"
23
24/* lock for procfs read access */
25static DEFINE_MUTEX(read_lock);
26
27/* lock for procfs write access */
28static DEFINE_MUTEX(write_lock);
29
30/*
31 * define DYNAMIC in this example for a dynamically allocated fifo.
32 *
33 * Otherwise the fifo storage will be a part of the fifo structure.
34 */
35#if 0
36#define DYNAMIC
37#endif
38
39/*
40 * struct kfifo_rec_ptr_1 and STRUCT_KFIFO_REC_1 can handle records of a
41 * length between 0 and 255 bytes.
42 *
43 * struct kfifo_rec_ptr_2 and STRUCT_KFIFO_REC_2 can handle records of a
44 * length between 0 and 65535 bytes.
45 */
46
47#ifdef DYNAMIC
48struct kfifo_rec_ptr_1 test;
49
50#else
51typedef STRUCT_KFIFO_REC_1(FIFO_SIZE) mytest;
52
53static mytest test;
54#endif
55
56static const char *expected_result[] = {
57 "a",
58 "bb",
59 "ccc",
60 "dddd",
61 "eeeee",
62 "ffffff",
63 "ggggggg",
64 "hhhhhhhh",
65 "iiiiiiiii",
66 "jjjjjjjjjj",
67};
68
69static int __init testfunc(void)
70{
71 char buf[100];
72 unsigned int i;
73 unsigned int ret;
74 struct { unsigned char buf[6]; } hello = { "hello" };
75
76 printk(KERN_INFO "record fifo test start\n");
77
78 kfifo_in(&test, &hello, sizeof(hello));
79
80 /* show the size of the next record in the fifo */
81 printk(KERN_INFO "fifo peek len: %u\n", kfifo_peek_len(&test));
82
83 /* put in variable length data */
84 for (i = 0; i < 10; i++) {
85 memset(buf, 'a' + i, i + 1);
86 kfifo_in(&test, buf, i + 1);
87 }
88
89 /* skip first element of the fifo */
90 printk(KERN_INFO "skip 1st element\n");
91 kfifo_skip(&test);
92
93 printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test));
94
95 /* show the first record without removing from the fifo */
96 ret = kfifo_out_peek(&test, buf, sizeof(buf));
97 if (ret)
98 printk(KERN_INFO "%.*s\n", ret, buf);
99
100 /* check the correctness of all values in the fifo */
101 i = 0;
102 while (!kfifo_is_empty(&test)) {
103 ret = kfifo_out(&test, buf, sizeof(buf));
104 buf[ret] = '\0';
105 printk(KERN_INFO "item = %.*s\n", ret, buf);
106 if (strcmp(buf, expected_result[i++])) {
107 printk(KERN_WARNING "value mismatch: test failed\n");
108 return -EIO;
109 }
110 }
111 if (i != ARRAY_SIZE(expected_result)) {
112 printk(KERN_WARNING "size mismatch: test failed\n");
113 return -EIO;
114 }
115 printk(KERN_INFO "test passed\n");
116
117 return 0;
118}
119
120static ssize_t fifo_write(struct file *file, const char __user *buf,
121 size_t count, loff_t *ppos)
122{
123 int ret;
124 unsigned int copied;
125
126 if (mutex_lock_interruptible(&write_lock))
127 return -ERESTARTSYS;
128
129 ret = kfifo_from_user(&test, buf, count, &copied);
130
131 mutex_unlock(&write_lock);
132 if (ret)
133 return ret;
134
135 return copied;
136}
137
138static ssize_t fifo_read(struct file *file, char __user *buf,
139 size_t count, loff_t *ppos)
140{
141 int ret;
142 unsigned int copied;
143
144 if (mutex_lock_interruptible(&read_lock))
145 return -ERESTARTSYS;
146
147 ret = kfifo_to_user(&test, buf, count, &copied);
148
149 mutex_unlock(&read_lock);
150 if (ret)
151 return ret;
152
153 return copied;
154}
155
156static const struct proc_ops fifo_proc_ops = {
157 .proc_read = fifo_read,
158 .proc_write = fifo_write,
159 .proc_lseek = noop_llseek,
160};
161
162static int __init example_init(void)
163{
164#ifdef DYNAMIC
165 int ret;
166
167 ret = kfifo_alloc(&test, FIFO_SIZE, GFP_KERNEL);
168 if (ret) {
169 printk(KERN_ERR "error kfifo_alloc\n");
170 return ret;
171 }
172#else
173 INIT_KFIFO(test);
174#endif
175 if (testfunc() < 0) {
176#ifdef DYNAMIC
177 kfifo_free(&test);
178#endif
179 return -EIO;
180 }
181
182 if (proc_create(PROC_FIFO, 0, NULL, &fifo_proc_ops) == NULL) {
183#ifdef DYNAMIC
184 kfifo_free(&test);
185#endif
186 return -ENOMEM;
187 }
188 return 0;
189}
190
191static void __exit example_exit(void)
192{
193 remove_proc_entry(PROC_FIFO, NULL);
194#ifdef DYNAMIC
195 kfifo_free(&test);
196#endif
197}
198
199module_init(example_init);
200module_exit(example_exit);
201MODULE_LICENSE("GPL");
202MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>");
diff --git a/samples/kmemleak/Makefile b/samples/kmemleak/Makefile
new file mode 100644
index 000000000..16b6132c5
--- /dev/null
+++ b/samples/kmemleak/Makefile
@@ -0,0 +1,3 @@
1# SPDX-License-Identifier: GPL-2.0-only
2
3obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/samples/kmemleak/kmemleak-test.c b/samples/kmemleak/kmemleak-test.c
new file mode 100644
index 000000000..7b476eb82
--- /dev/null
+++ b/samples/kmemleak/kmemleak-test.c
@@ -0,0 +1,99 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * samples/kmemleak/kmemleak-test.c
4 *
5 * Copyright (C) 2008 ARM Limited
6 * Written by Catalin Marinas <catalin.marinas@arm.com>
7 */
8
9#define pr_fmt(fmt) "kmemleak: " fmt
10
11#include <linux/init.h>
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/vmalloc.h>
16#include <linux/list.h>
17#include <linux/percpu.h>
18#include <linux/fdtable.h>
19
20#include <linux/kmemleak.h>
21
22struct test_node {
23 long header[25];
24 struct list_head list;
25 long footer[25];
26};
27
28static LIST_HEAD(test_list);
29static DEFINE_PER_CPU(void *, kmemleak_test_pointer);
30
31/*
32 * Some very simple testing. This function needs to be extended for
33 * proper testing.
34 */
35static int __init kmemleak_test_init(void)
36{
37 struct test_node *elem;
38 int i;
39
40 pr_info("Kmemleak testing\n");
41
42 /* make some orphan objects */
43 pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
44 pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
45 pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
46 pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
47 pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
48 pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
49 pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
50 pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
51#ifndef CONFIG_MODULES
52 pr_info("kmem_cache_alloc(files_cachep) = %p\n",
53 kmem_cache_alloc(files_cachep, GFP_KERNEL));
54 pr_info("kmem_cache_alloc(files_cachep) = %p\n",
55 kmem_cache_alloc(files_cachep, GFP_KERNEL));
56#endif
57 pr_info("vmalloc(64) = %p\n", vmalloc(64));
58 pr_info("vmalloc(64) = %p\n", vmalloc(64));
59 pr_info("vmalloc(64) = %p\n", vmalloc(64));
60 pr_info("vmalloc(64) = %p\n", vmalloc(64));
61 pr_info("vmalloc(64) = %p\n", vmalloc(64));
62
63 /*
64 * Add elements to a list. They should only appear as orphan
65 * after the module is removed.
66 */
67 for (i = 0; i < 10; i++) {
68 elem = kzalloc(sizeof(*elem), GFP_KERNEL);
69 pr_info("kzalloc(sizeof(*elem)) = %p\n", elem);
70 if (!elem)
71 return -ENOMEM;
72 INIT_LIST_HEAD(&elem->list);
73 list_add_tail(&elem->list, &test_list);
74 }
75
76 for_each_possible_cpu(i) {
77 per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
78 pr_info("kmalloc(129) = %p\n",
79 per_cpu(kmemleak_test_pointer, i));
80 }
81
82 return 0;
83}
84module_init(kmemleak_test_init);
85
86static void __exit kmemleak_test_exit(void)
87{
88 struct test_node *elem, *tmp;
89
90 /*
91 * Remove the list elements without actually freeing the
92 * memory.
93 */
94 list_for_each_entry_safe(elem, tmp, &test_list, list)
95 list_del(&elem->list);
96}
97module_exit(kmemleak_test_exit);
98
99MODULE_LICENSE("GPL");
diff --git a/samples/kobject/Makefile b/samples/kobject/Makefile
new file mode 100644
index 000000000..bb5d21997
--- /dev/null
+++ b/samples/kobject/Makefile
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0-only
2obj-$(CONFIG_SAMPLE_KOBJECT) += kobject-example.o kset-example.o
diff --git a/samples/kobject/kobject-example.c b/samples/kobject/kobject-example.c
new file mode 100644
index 000000000..9e383fdba
--- /dev/null
+++ b/samples/kobject/kobject-example.c
@@ -0,0 +1,144 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Sample kobject implementation
4 *
5 * Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com>
6 * Copyright (C) 2007 Novell Inc.
7 */
8#include <linux/kobject.h>
9#include <linux/string.h>
10#include <linux/sysfs.h>
11#include <linux/module.h>
12#include <linux/init.h>
13
14/*
15 * This module shows how to create a simple subdirectory in sysfs called
16 * /sys/kernel/kobject_example. In that directory, three files are
17 * created: "foo", "baz", and "bar". An integer written to one of these
18 * files can later be read back from it.
19 */
20
21static int foo;
22static int baz;
23static int bar;
24
25/*
26 * The "foo" file where a static variable is read from and written to.
27 */
28static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr,
29 char *buf)
30{
31 return sprintf(buf, "%d\n", foo);
32}
33
34static ssize_t foo_store(struct kobject *kobj, struct kobj_attribute *attr,
35 const char *buf, size_t count)
36{
37 int ret;
38
39 ret = kstrtoint(buf, 10, &foo);
40 if (ret < 0)
41 return ret;
42
43 return count;
44}
45
46/* Sysfs attributes cannot be world-writable. */
47static struct kobj_attribute foo_attribute =
48 __ATTR(foo, 0664, foo_show, foo_store);
49
50/*
51 * More complex function where we determine which variable is being accessed by
52 * looking at the attribute for the "baz" and "bar" files.
53 */
54static ssize_t b_show(struct kobject *kobj, struct kobj_attribute *attr,
55 char *buf)
56{
57 int var;
58
59 if (strcmp(attr->attr.name, "baz") == 0)
60 var = baz;
61 else
62 var = bar;
63 return sprintf(buf, "%d\n", var);
64}
65
66static ssize_t b_store(struct kobject *kobj, struct kobj_attribute *attr,
67 const char *buf, size_t count)
68{
69 int var, ret;
70
71 ret = kstrtoint(buf, 10, &var);
72 if (ret < 0)
73 return ret;
74
75 if (strcmp(attr->attr.name, "baz") == 0)
76 baz = var;
77 else
78 bar = var;
79 return count;
80}
81
82static struct kobj_attribute baz_attribute =
83 __ATTR(baz, 0664, b_show, b_store);
84static struct kobj_attribute bar_attribute =
85 __ATTR(bar, 0664, b_show, b_store);
86
87
88/*
89 * Create a group of attributes so that we can create and destroy them all
90 * at once.
91 */
92static struct attribute *attrs[] = {
93 &foo_attribute.attr,
94 &baz_attribute.attr,
95 &bar_attribute.attr,
96 NULL, /* need to NULL terminate the list of attributes */
97};
98
99/*
100 * An unnamed attribute group will put all of the attributes directly in
101 * the kobject directory. If we specify a name, a subdirectory will be
102 * created for the attributes with the directory being the name of the
103 * attribute group.
104 */
105static struct attribute_group attr_group = {
106 .attrs = attrs,
107};
108
109static struct kobject *example_kobj;
110
111static int __init example_init(void)
112{
113 int retval;
114
115 /*
116 * Create a simple kobject with the name of "kobject_example",
117 * located under /sys/kernel/
118 *
119 * As this is a simple directory, no uevent will be sent to
120 * userspace. That is why this function should not be used for
121 * any type of dynamic kobjects, where the name and number are
122 * not known ahead of time.
123 */
124 example_kobj = kobject_create_and_add("kobject_example", kernel_kobj);
125 if (!example_kobj)
126 return -ENOMEM;
127
128 /* Create the files associated with this kobject */
129 retval = sysfs_create_group(example_kobj, &attr_group);
130 if (retval)
131 kobject_put(example_kobj);
132
133 return retval;
134}
135
136static void __exit example_exit(void)
137{
138 kobject_put(example_kobj);
139}
140
141module_init(example_init);
142module_exit(example_exit);
143MODULE_LICENSE("GPL v2");
144MODULE_AUTHOR("Greg Kroah-Hartman <greg@kroah.com>");
diff --git a/samples/kobject/kset-example.c b/samples/kobject/kset-example.c
new file mode 100644
index 000000000..c8010f126
--- /dev/null
+++ b/samples/kobject/kset-example.c
@@ -0,0 +1,288 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Sample kset and ktype implementation
4 *
5 * Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com>
6 * Copyright (C) 2007 Novell Inc.
7 */
8#include <linux/kobject.h>
9#include <linux/string.h>
10#include <linux/sysfs.h>
11#include <linux/slab.h>
12#include <linux/module.h>
13#include <linux/init.h>
14
15/*
16 * This module shows how to create a kset in sysfs called
17 * /sys/kernel/kset-example
18 * Then tree kobjects are created and assigned to this kset, "foo", "baz",
19 * and "bar". In those kobjects, attributes of the same name are also
20 * created and if an integer is written to these files, it can be later
21 * read out of it.
22 */
23
24
25/*
26 * This is our "object" that we will create a few of and register them with
27 * sysfs.
28 */
29struct foo_obj {
30 struct kobject kobj;
31 int foo;
32 int baz;
33 int bar;
34};
35#define to_foo_obj(x) container_of(x, struct foo_obj, kobj)
36
37/* a custom attribute that works just for a struct foo_obj. */
38struct foo_attribute {
39 struct attribute attr;
40 ssize_t (*show)(struct foo_obj *foo, struct foo_attribute *attr, char *buf);
41 ssize_t (*store)(struct foo_obj *foo, struct foo_attribute *attr, const char *buf, size_t count);
42};
43#define to_foo_attr(x) container_of(x, struct foo_attribute, attr)
44
45/*
46 * The default show function that must be passed to sysfs. This will be
47 * called by sysfs whenever a show function is called by the user on a
48 * sysfs file associated with the kobjects we have registered. We need to
49 * transpose back from a "default" kobject to our custom struct foo_obj and
50 * then call the show function for that specific object.
51 */
52static ssize_t foo_attr_show(struct kobject *kobj,
53 struct attribute *attr,
54 char *buf)
55{
56 struct foo_attribute *attribute;
57 struct foo_obj *foo;
58
59 attribute = to_foo_attr(attr);
60 foo = to_foo_obj(kobj);
61
62 if (!attribute->show)
63 return -EIO;
64
65 return attribute->show(foo, attribute, buf);
66}
67
68/*
69 * Just like the default show function above, but this one is for when the
70 * sysfs "store" is requested (when a value is written to a file.)
71 */
72static ssize_t foo_attr_store(struct kobject *kobj,
73 struct attribute *attr,
74 const char *buf, size_t len)
75{
76 struct foo_attribute *attribute;
77 struct foo_obj *foo;
78
79 attribute = to_foo_attr(attr);
80 foo = to_foo_obj(kobj);
81
82 if (!attribute->store)
83 return -EIO;
84
85 return attribute->store(foo, attribute, buf, len);
86}
87
88/* Our custom sysfs_ops that we will associate with our ktype later on */
89static const struct sysfs_ops foo_sysfs_ops = {
90 .show = foo_attr_show,
91 .store = foo_attr_store,
92};
93
94/*
95 * The release function for our object. The kernel REQUIRES every
96 * ktype to provide one. We free the memory held by our object here.
97 *
98 * NEVER try to get away with just a "blank" release function to try to be
99 * smarter than the kernel. Turns out, no one ever is...
100 */
101static void foo_release(struct kobject *kobj)
102{
103 struct foo_obj *foo;
104
105 foo = to_foo_obj(kobj);
106 kfree(foo);
107}
108
109/*
110 * The "foo" file where the .foo variable is read from and written to.
111 */
112static ssize_t foo_show(struct foo_obj *foo_obj, struct foo_attribute *attr,
113 char *buf)
114{
115 return sprintf(buf, "%d\n", foo_obj->foo);
116}
117
118static ssize_t foo_store(struct foo_obj *foo_obj, struct foo_attribute *attr,
119 const char *buf, size_t count)
120{
121 int ret;
122
123 ret = kstrtoint(buf, 10, &foo_obj->foo);
124 if (ret < 0)
125 return ret;
126
127 return count;
128}
129
130/* Sysfs attributes cannot be world-writable. */
131static struct foo_attribute foo_attribute =
132 __ATTR(foo, 0664, foo_show, foo_store);
133
134/*
135 * More complex function where we determine which variable is being accessed by
136 * looking at the attribute for the "baz" and "bar" files.
137 */
138static ssize_t b_show(struct foo_obj *foo_obj, struct foo_attribute *attr,
139 char *buf)
140{
141 int var;
142
143 if (strcmp(attr->attr.name, "baz") == 0)
144 var = foo_obj->baz;
145 else
146 var = foo_obj->bar;
147 return sprintf(buf, "%d\n", var);
148}
149
150static ssize_t b_store(struct foo_obj *foo_obj, struct foo_attribute *attr,
151 const char *buf, size_t count)
152{
153 int var, ret;
154
155 ret = kstrtoint(buf, 10, &var);
156 if (ret < 0)
157 return ret;
158
159 if (strcmp(attr->attr.name, "baz") == 0)
160 foo_obj->baz = var;
161 else
162 foo_obj->bar = var;
163 return count;
164}
165
166static struct foo_attribute baz_attribute =
167 __ATTR(baz, 0664, b_show, b_store);
168static struct foo_attribute bar_attribute =
169 __ATTR(bar, 0664, b_show, b_store);
170
171/*
172 * Create a group of attributes so that we can create and destroy them all
173 * at once.
174 */
175static struct attribute *foo_default_attrs[] = {
176 &foo_attribute.attr,
177 &baz_attribute.attr,
178 &bar_attribute.attr,
179 NULL, /* need to NULL terminate the list of attributes */
180};
181ATTRIBUTE_GROUPS(foo_default);
182
183/*
184 * Our own ktype for our kobjects. Here we specify our sysfs ops, the
185 * release function, and the set of default attributes we want created
186 * whenever a kobject of this type is registered with the kernel.
187 */
188static struct kobj_type foo_ktype = {
189 .sysfs_ops = &foo_sysfs_ops,
190 .release = foo_release,
191 .default_groups = foo_default_groups,
192};
193
194static struct kset *example_kset;
195static struct foo_obj *foo_obj;
196static struct foo_obj *bar_obj;
197static struct foo_obj *baz_obj;
198
199static struct foo_obj *create_foo_obj(const char *name)
200{
201 struct foo_obj *foo;
202 int retval;
203
204 /* allocate the memory for the whole object */
205 foo = kzalloc(sizeof(*foo), GFP_KERNEL);
206 if (!foo)
207 return NULL;
208
209 /*
210 * As we have a kset for this kobject, we need to set it before calling
211 * the kobject core.
212 */
213 foo->kobj.kset = example_kset;
214
215 /*
216 * Initialize and add the kobject to the kernel. All the default files
217 * will be created here. As we have already specified a kset for this
218 * kobject, we don't have to set a parent for the kobject, the kobject
219 * will be placed beneath that kset automatically.
220 */
221 retval = kobject_init_and_add(&foo->kobj, &foo_ktype, NULL, "%s", name);
222 if (retval) {
223 kobject_put(&foo->kobj);
224 return NULL;
225 }
226
227 /*
228 * We are always responsible for sending the uevent that the kobject
229 * was added to the system.
230 */
231 kobject_uevent(&foo->kobj, KOBJ_ADD);
232
233 return foo;
234}
235
236static void destroy_foo_obj(struct foo_obj *foo)
237{
238 kobject_put(&foo->kobj);
239}
240
241static int __init example_init(void)
242{
243 /*
244 * Create a kset with the name of "kset_example",
245 * located under /sys/kernel/
246 */
247 example_kset = kset_create_and_add("kset_example", NULL, kernel_kobj);
248 if (!example_kset)
249 return -ENOMEM;
250
251 /*
252 * Create three objects and register them with our kset
253 */
254 foo_obj = create_foo_obj("foo");
255 if (!foo_obj)
256 goto foo_error;
257
258 bar_obj = create_foo_obj("bar");
259 if (!bar_obj)
260 goto bar_error;
261
262 baz_obj = create_foo_obj("baz");
263 if (!baz_obj)
264 goto baz_error;
265
266 return 0;
267
268baz_error:
269 destroy_foo_obj(bar_obj);
270bar_error:
271 destroy_foo_obj(foo_obj);
272foo_error:
273 kset_unregister(example_kset);
274 return -EINVAL;
275}
276
277static void __exit example_exit(void)
278{
279 destroy_foo_obj(baz_obj);
280 destroy_foo_obj(bar_obj);
281 destroy_foo_obj(foo_obj);
282 kset_unregister(example_kset);
283}
284
285module_init(example_init);
286module_exit(example_exit);
287MODULE_LICENSE("GPL v2");
288MODULE_AUTHOR("Greg Kroah-Hartman <greg@kroah.com>");
diff --git a/samples/kprobes/Makefile b/samples/kprobes/Makefile
new file mode 100644
index 000000000..e77459271
--- /dev/null
+++ b/samples/kprobes/Makefile
@@ -0,0 +1,6 @@
1# SPDX-License-Identifier: GPL-2.0-only
2# builds the kprobes example kernel modules;
3# then to use one (as root): insmod <module_name.ko>
4
5obj-$(CONFIG_SAMPLE_KPROBES) += kprobe_example.o
6obj-$(CONFIG_SAMPLE_KRETPROBES) += kretprobe_example.o
diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c
new file mode 100644
index 000000000..365905cb2
--- /dev/null
+++ b/samples/kprobes/kprobe_example.c
@@ -0,0 +1,120 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * NOTE: This example works on x86, powerpc, mips, arm64 and s390.
4 * Here's a sample kernel module showing the use of kprobes to dump a
5 * stack trace and selected registers when kernel_clone() is called.
6 *
7 * For more information on theory of operation of kprobes, see
8 * Documentation/trace/kprobes.rst
9 *
10 * You will see the trace data in /var/log/messages and on the console
11 * whenever kernel_clone() is invoked to create a new process.
12 */
13
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/kprobes.h>
17
18#define MAX_SYMBOL_LEN 64
19static char symbol[MAX_SYMBOL_LEN] = "kernel_clone";
20module_param_string(symbol, symbol, sizeof(symbol), 0644);
21
22/* For each probe you need to allocate a kprobe structure */
23static struct kprobe kp = {
24 .symbol_name = symbol,
25};
26
27/* kprobe pre_handler: called just before the probed instruction is executed */
28static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs)
29{
30#ifdef CONFIG_X86
31 pr_info("<%s> pre_handler: p->addr = 0x%p, ip = %lx, flags = 0x%lx\n",
32 p->symbol_name, p->addr, regs->ip, regs->flags);
33#endif
34#ifdef CONFIG_PPC
35 pr_info("<%s> pre_handler: p->addr = 0x%p, nip = 0x%lx, msr = 0x%lx\n",
36 p->symbol_name, p->addr, regs->nip, regs->msr);
37#endif
38#ifdef CONFIG_MIPS
39 pr_info("<%s> pre_handler: p->addr = 0x%p, epc = 0x%lx, status = 0x%lx\n",
40 p->symbol_name, p->addr, regs->cp0_epc, regs->cp0_status);
41#endif
42#ifdef CONFIG_ARM64
43 pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx,"
44 " pstate = 0x%lx\n",
45 p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate);
46#endif
47#ifdef CONFIG_S390
48 pr_info("<%s> pre_handler: p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n",
49 p->symbol_name, p->addr, regs->psw.addr, regs->flags);
50#endif
51
52 /* A dump_stack() here will give a stack backtrace */
53 return 0;
54}
55
56/* kprobe post_handler: called after the probed instruction is executed */
57static void __kprobes handler_post(struct kprobe *p, struct pt_regs *regs,
58 unsigned long flags)
59{
60#ifdef CONFIG_X86
61 pr_info("<%s> post_handler: p->addr = 0x%p, flags = 0x%lx\n",
62 p->symbol_name, p->addr, regs->flags);
63#endif
64#ifdef CONFIG_PPC
65 pr_info("<%s> post_handler: p->addr = 0x%p, msr = 0x%lx\n",
66 p->symbol_name, p->addr, regs->msr);
67#endif
68#ifdef CONFIG_MIPS
69 pr_info("<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n",
70 p->symbol_name, p->addr, regs->cp0_status);
71#endif
72#ifdef CONFIG_ARM64
73 pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n",
74 p->symbol_name, p->addr, (long)regs->pstate);
75#endif
76#ifdef CONFIG_S390
77 pr_info("<%s> pre_handler: p->addr, 0x%p, flags = 0x%lx\n",
78 p->symbol_name, p->addr, regs->flags);
79#endif
80}
81
82/*
83 * fault_handler: this is called if an exception is generated for any
84 * instruction within the pre- or post-handler, or when Kprobes
85 * single-steps the probed instruction.
86 */
87static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
88{
89 pr_info("fault_handler: p->addr = 0x%p, trap #%dn", p->addr, trapnr);
90 /* Return 0 because we don't handle the fault. */
91 return 0;
92}
93/* NOKPROBE_SYMBOL() is also available */
94NOKPROBE_SYMBOL(handler_fault);
95
96static int __init kprobe_init(void)
97{
98 int ret;
99 kp.pre_handler = handler_pre;
100 kp.post_handler = handler_post;
101 kp.fault_handler = handler_fault;
102
103 ret = register_kprobe(&kp);
104 if (ret < 0) {
105 pr_err("register_kprobe failed, returned %d\n", ret);
106 return ret;
107 }
108 pr_info("Planted kprobe at %p\n", kp.addr);
109 return 0;
110}
111
112static void __exit kprobe_exit(void)
113{
114 unregister_kprobe(&kp);
115 pr_info("kprobe at %p unregistered\n", kp.addr);
116}
117
118module_init(kprobe_init)
119module_exit(kprobe_exit)
120MODULE_LICENSE("GPL");
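The probe target can be changed at load time through the module parameter declared above; a hedged usage sketch (any non-blacklisted, probe-able symbol works, kernel_clone is only the default; the output shape below follows the x86 pre/post handler formats, with addresses elided):

	# insmod kprobe_example.ko symbol=do_sys_open
	# dmesg | tail -2
	<do_sys_open> pre_handler: p->addr = 0x..., ip = 0x..., flags = 0x...
	<do_sys_open> post_handler: p->addr = 0x..., flags = 0x...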
diff --git a/samples/kprobes/kretprobe_example.c b/samples/kprobes/kretprobe_example.c
new file mode 100644
index 000000000..228321ecb
--- /dev/null
+++ b/samples/kprobes/kretprobe_example.c
@@ -0,0 +1,108 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * kretprobe_example.c
4 *
5 * Here's a sample kernel module showing the use of return probes to
6 * report the return value and total time taken for the probed function
7 * to run.
8 *
9 * usage: insmod kretprobe_example.ko func=<func_name>
10 *
11 * If no func_name is specified, kernel_clone is instrumented
12 *
13 * For more information on theory of operation of kretprobes, see
14 * Documentation/trace/kprobes.rst
15 *
16 * Build and insert the kernel module as done in the kprobe example.
17 * You will see the trace data in /var/log/messages and on the console
18 * whenever the probed function returns. (Some messages may be suppressed
19 * if syslogd is configured to eliminate duplicate messages.)
20 */
21
22#include <linux/kernel.h>
23#include <linux/module.h>
24#include <linux/kprobes.h>
25#include <linux/ktime.h>
26#include <linux/limits.h>
27#include <linux/sched.h>
28
29static char func_name[NAME_MAX] = "kernel_clone";
30module_param_string(func, func_name, NAME_MAX, S_IRUGO);
31MODULE_PARM_DESC(func, "Function to kretprobe; this module will report the"
32 " function's execution time");
33
34/* per-instance private data */
35struct my_data {
36 ktime_t entry_stamp;
37};
38
39/* Here we use the entry_handler to timestamp function entry */
40static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
41{
42 struct my_data *data;
43
44 if (!current->mm)
45 return 1; /* Skip kernel threads */
46
47 data = (struct my_data *)ri->data;
48 data->entry_stamp = ktime_get();
49 return 0;
50}
51NOKPROBE_SYMBOL(entry_handler);
52
53/*
54 * Return-probe handler: Log the return value and duration. Duration may turn
55 * out to be zero consistently, depending upon the granularity of time
56 * accounting on the platform.
57 */
58static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
59{
60 unsigned long retval = regs_return_value(regs);
61 struct my_data *data = (struct my_data *)ri->data;
62 s64 delta;
63 ktime_t now;
64
65 now = ktime_get();
66 delta = ktime_to_ns(ktime_sub(now, data->entry_stamp));
67 pr_info("%s returned %lu and took %lld ns to execute\n",
68 func_name, retval, (long long)delta);
69 return 0;
70}
71NOKPROBE_SYMBOL(ret_handler);
72
73static struct kretprobe my_kretprobe = {
74 .handler = ret_handler,
75 .entry_handler = entry_handler,
76 .data_size = sizeof(struct my_data),
77 /* Probe up to 20 instances concurrently. */
78 .maxactive = 20,
79};
80
81static int __init kretprobe_init(void)
82{
83 int ret;
84
85 my_kretprobe.kp.symbol_name = func_name;
86 ret = register_kretprobe(&my_kretprobe);
87 if (ret < 0) {
88 pr_err("register_kretprobe failed, returned %d\n", ret);
89 return ret;
90 }
91 pr_info("Planted return probe at %s: %p\n",
92 my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr);
93 return 0;
94}
95
96static void __exit kretprobe_exit(void)
97{
98 unregister_kretprobe(&my_kretprobe);
99 pr_info("kretprobe at %p unregistered\n", my_kretprobe.kp.addr);
100
101 /* nmissed > 0 suggests that maxactive was set too low. */
102 pr_info("Missed probing %d instances of %s\n",
103 my_kretprobe.nmissed, my_kretprobe.kp.symbol_name);
104}
105
106module_init(kretprobe_init)
107module_exit(kretprobe_exit)
108MODULE_LICENSE("GPL");
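Two practical notes, hedged: the default symbol can be overridden at load time, and had maxactive been left unset (<= 0), register_kretprobe() would pick a CPU-count based default (max(10, 2 * num_possible_cpus()) on preemptible kernels of this vintage). A nonzero nmissed count on unload is the cue to raise maxactive. An illustrative run (the numbers are made up):

	# insmod kretprobe_example.ko func=vfs_read
	# dmesg | tail -1
	vfs_read returned 128 and took 2346 ns to execute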
diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile
new file mode 100644
index 000000000..9f853eeb6
--- /dev/null
+++ b/samples/livepatch/Makefile
@@ -0,0 +1,8 @@
1# SPDX-License-Identifier: GPL-2.0-only
2obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o
3obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-mod.o
4obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix1.o
5obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix2.o
6obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-demo.o
7obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-mod.o
8obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-busymod.o
diff --git a/samples/livepatch/livepatch-callbacks-busymod.c b/samples/livepatch/livepatch-callbacks-busymod.c
new file mode 100644
index 000000000..378e2d402
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-busymod.c
@@ -0,0 +1,60 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
4 */
5
6/*
7 * livepatch-callbacks-busymod.c - (un)patching callbacks demo support module
8 *
9 *
10 * Purpose
11 * -------
12 *
13 * Simple module to demonstrate livepatch (un)patching callbacks.
14 *
15 *
16 * Usage
17 * -----
18 *
19 * This module is not intended to be standalone. See the "Usage"
20 * section of livepatch-callbacks-demo.c.
21 */
22
23#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
24
25#include <linux/module.h>
26#include <linux/kernel.h>
27#include <linux/workqueue.h>
28#include <linux/delay.h>
29
30static int sleep_secs;
31module_param(sleep_secs, int, 0644);
32MODULE_PARM_DESC(sleep_secs, "sleep_secs (default=0)");
33
34static void busymod_work_func(struct work_struct *work);
35static DECLARE_DELAYED_WORK(work, busymod_work_func);
36
37static void busymod_work_func(struct work_struct *work)
38{
39 pr_info("%s, sleeping %d seconds ...\n", __func__, sleep_secs);
40 msleep(sleep_secs * 1000);
41 pr_info("%s exit\n", __func__);
42}
43
44static int livepatch_callbacks_mod_init(void)
45{
46 pr_info("%s\n", __func__);
47 schedule_delayed_work(&work,
48 msecs_to_jiffies(1000 * 0));
49 return 0;
50}
51
52static void livepatch_callbacks_mod_exit(void)
53{
54 cancel_delayed_work_sync(&work);
55 pr_info("%s\n", __func__);
56}
57
58module_init(livepatch_callbacks_mod_init);
59module_exit(livepatch_callbacks_mod_exit);
60MODULE_LICENSE("GPL");
diff --git a/samples/livepatch/livepatch-callbacks-demo.c b/samples/livepatch/livepatch-callbacks-demo.c
new file mode 100644
index 000000000..11c3f4357
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-demo.c
@@ -0,0 +1,196 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
4 */
5
6/*
7 * livepatch-callbacks-demo.c - (un)patching callbacks livepatch demo
8 *
9 *
10 * Purpose
11 * -------
12 *
13 * Demonstration of registering livepatch (un)patching callbacks.
14 *
15 *
16 * Usage
17 * -----
18 *
19 * Step 1 - load the simple module
20 *
21 * insmod samples/livepatch/livepatch-callbacks-mod.ko
22 *
23 *
24 * Step 2 - load the demonstration livepatch (with callbacks)
25 *
26 * insmod samples/livepatch/livepatch-callbacks-demo.ko
27 *
28 *
29 * Step 3 - cleanup
30 *
31 * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
32 * rmmod livepatch_callbacks_demo
33 * rmmod livepatch_callbacks_mod
34 *
35 * Watch dmesg output to see livepatch enablement, callback execution
36 * and patching operations for both vmlinux and module targets.
37 *
38 * NOTE: swap the insmod order of livepatch-callbacks-mod.ko and
39 * livepatch-callbacks-demo.ko to observe what happens when a
40 * target module is loaded after a livepatch with callbacks.
41 *
42 * NOTE: 'pre_patch_ret' is a module parameter that sets the pre-patch
43 * callback return status. Try setting up a non-zero status
44 * such as -19 (-ENODEV):
45 *
46 * # Load demo livepatch, vmlinux is patched
47 * insmod samples/livepatch/livepatch-callbacks-demo.ko
48 *
49 * # Setup next pre-patch callback to return -ENODEV
50 * echo -19 > /sys/module/livepatch_callbacks_demo/parameters/pre_patch_ret
51 *
52 * # Module loader refuses to load the target module
53 * insmod samples/livepatch/livepatch-callbacks-mod.ko
54 * insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-mod.ko: No such device
55 *
56 * NOTE: There is a second target module,
57 * livepatch-callbacks-busymod.ko, available for experimenting
58 * with livepatch (un)patch callbacks. This module contains
59 * a 'sleep_secs' parameter that parks the module on one of the
60 * functions that the livepatch demo module wants to patch.
61 * Modifying this value and tweaking the order of module loads can
62 * effectively demonstrate stalled patch transitions:
63 *
64 * # Load a target module, let it park on 'busymod_work_func' for
65 * # thirty seconds
66 * insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=30
67 *
68 * # Meanwhile load the livepatch
69 * insmod samples/livepatch/livepatch-callbacks-demo.ko
70 *
71 * # ... then load and unload another target module while the
72 * # transition is in progress
73 * insmod samples/livepatch/livepatch-callbacks-mod.ko
74 * rmmod samples/livepatch/livepatch-callbacks-mod.ko
75 *
76 * # Finally cleanup
77 * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
78 * rmmod samples/livepatch/livepatch-callbacks-demo.ko
79 */
80
81#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
82
83#include <linux/module.h>
84#include <linux/kernel.h>
85#include <linux/livepatch.h>
86
87static int pre_patch_ret;
88module_param(pre_patch_ret, int, 0644);
89MODULE_PARM_DESC(pre_patch_ret, "pre_patch_ret (default=0)");
90
91static const char *const module_state[] = {
92 [MODULE_STATE_LIVE] = "[MODULE_STATE_LIVE] Normal state",
93	[MODULE_STATE_COMING] = "[MODULE_STATE_COMING] Fully formed, running module_init",
94 [MODULE_STATE_GOING] = "[MODULE_STATE_GOING] Going away",
95 [MODULE_STATE_UNFORMED] = "[MODULE_STATE_UNFORMED] Still setting it up",
96};
97
98static void callback_info(const char *callback, struct klp_object *obj)
99{
100 if (obj->mod)
101 pr_info("%s: %s -> %s\n", callback, obj->mod->name,
102 module_state[obj->mod->state]);
103 else
104 pr_info("%s: vmlinux\n", callback);
105}
106
107/* Executed on object patching (ie, patch enablement) */
108static int pre_patch_callback(struct klp_object *obj)
109{
110 callback_info(__func__, obj);
111 return pre_patch_ret;
112}
113
114/* Executed on object unpatching (ie, patch disablement) */
115static void post_patch_callback(struct klp_object *obj)
116{
117 callback_info(__func__, obj);
118}
119
120/* Executed on object unpatching (ie, patch disablement) */
121static void pre_unpatch_callback(struct klp_object *obj)
122{
123 callback_info(__func__, obj);
124}
125
126/* Executed on object unpatching (ie, patch disablement) */
127static void post_unpatch_callback(struct klp_object *obj)
128{
129 callback_info(__func__, obj);
130}
131
132static void patched_work_func(struct work_struct *work)
133{
134 pr_info("%s\n", __func__);
135}
136
137static struct klp_func no_funcs[] = {
138 { }
139};
140
141static struct klp_func busymod_funcs[] = {
142 {
143 .old_name = "busymod_work_func",
144 .new_func = patched_work_func,
145 }, { }
146};
147
148static struct klp_object objs[] = {
149 {
150 .name = NULL, /* vmlinux */
151 .funcs = no_funcs,
152 .callbacks = {
153 .pre_patch = pre_patch_callback,
154 .post_patch = post_patch_callback,
155 .pre_unpatch = pre_unpatch_callback,
156 .post_unpatch = post_unpatch_callback,
157 },
158 }, {
159 .name = "livepatch_callbacks_mod",
160 .funcs = no_funcs,
161 .callbacks = {
162 .pre_patch = pre_patch_callback,
163 .post_patch = post_patch_callback,
164 .pre_unpatch = pre_unpatch_callback,
165 .post_unpatch = post_unpatch_callback,
166 },
167 }, {
168 .name = "livepatch_callbacks_busymod",
169 .funcs = busymod_funcs,
170 .callbacks = {
171 .pre_patch = pre_patch_callback,
172 .post_patch = post_patch_callback,
173 .pre_unpatch = pre_unpatch_callback,
174 .post_unpatch = post_unpatch_callback,
175 },
176 }, { }
177};
178
179static struct klp_patch patch = {
180 .mod = THIS_MODULE,
181 .objs = objs,
182};
183
184static int livepatch_callbacks_demo_init(void)
185{
186 return klp_enable_patch(&patch);
187}
188
189static void livepatch_callbacks_demo_exit(void)
190{
191}
192
193module_init(livepatch_callbacks_demo_init);
194module_exit(livepatch_callbacks_demo_exit);
195MODULE_LICENSE("GPL");
196MODULE_INFO(livepatch, "Y");
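While a transition is stalled (for instance, parked on busymod's sleep as described above), its state is visible through the livepatch sysfs interface; a hedged sketch:

	# cat /sys/kernel/livepatch/livepatch_callbacks_demo/transition
	1
	# after busymod_work_func returns, the transition completes:
	# cat /sys/kernel/livepatch/livepatch_callbacks_demo/transition
	0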
diff --git a/samples/livepatch/livepatch-callbacks-mod.c b/samples/livepatch/livepatch-callbacks-mod.c
new file mode 100644
index 000000000..2a074f422
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-mod.c
@@ -0,0 +1,41 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
4 */
5
6/*
7 * livepatch-callbacks-mod.c - (un)patching callbacks demo support module
8 *
9 *
10 * Purpose
11 * -------
12 *
13 * Simple module to demonstrate livepatch (un)patching callbacks.
14 *
15 *
16 * Usage
17 * -----
18 *
19 * This module is not intended to be standalone. See the "Usage"
20 * section of livepatch-callbacks-demo.c.
21 */
22
23#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
24
25#include <linux/module.h>
26#include <linux/kernel.h>
27
28static int livepatch_callbacks_mod_init(void)
29{
30 pr_info("%s\n", __func__);
31 return 0;
32}
33
34static void livepatch_callbacks_mod_exit(void)
35{
36 pr_info("%s\n", __func__);
37}
38
39module_init(livepatch_callbacks_mod_init);
40module_exit(livepatch_callbacks_mod_exit);
41MODULE_LICENSE("GPL");
diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c
new file mode 100644
index 000000000..cd76d7ebe
--- /dev/null
+++ b/samples/livepatch/livepatch-sample.c
@@ -0,0 +1,70 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * livepatch-sample.c - Kernel Live Patching Sample Module
4 *
5 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
6 */
7
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
10#include <linux/module.h>
11#include <linux/kernel.h>
12#include <linux/livepatch.h>
13
14/*
15 * This (dumb) live patch overrides the function that prints the
16 * kernel boot cmdline when /proc/cmdline is read.
17 *
18 * Example:
19 *
20 * $ cat /proc/cmdline
21 * <your cmdline>
22 *
23 * $ insmod livepatch-sample.ko
24 * $ cat /proc/cmdline
25 * this has been live patched
26 *
27 * $ echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled
28 * $ cat /proc/cmdline
29 * <your cmdline>
30 */
31
32#include <linux/seq_file.h>
33static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
34{
35 seq_printf(m, "%s\n", "this has been live patched");
36 return 0;
37}
38
39static struct klp_func funcs[] = {
40 {
41 .old_name = "cmdline_proc_show",
42 .new_func = livepatch_cmdline_proc_show,
43 }, { }
44};
45
46static struct klp_object objs[] = {
47 {
48 /* name being NULL means vmlinux */
49 .funcs = funcs,
50 }, { }
51};
52
53static struct klp_patch patch = {
54 .mod = THIS_MODULE,
55 .objs = objs,
56};
57
58static int livepatch_init(void)
59{
60 return klp_enable_patch(&patch);
61}
62
63static void livepatch_exit(void)
64{
65}
66
67module_init(livepatch_init);
68module_exit(livepatch_exit);
69MODULE_LICENSE("GPL");
70MODULE_INFO(livepatch, "Y");
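One field the sample leaves at its default is worth a mention: when the old symbol name is not unique in the target object (e.g. duplicated static function names), struct klp_func provides old_sympos to select a particular occurrence, counted in kallsyms order (0 means the symbol must be unique). A minimal, hedged variation of the funcs[] table above:

	static struct klp_func funcs[] = {
		{
			.old_name = "cmdline_proc_show",
			.new_func = livepatch_cmdline_proc_show,
			/* pick the 1st kallsyms occurrence; 0 would require
			 * the symbol to be unique (the sample's default) */
			.old_sympos = 1,
		}, { }
	};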
diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c
new file mode 100644
index 000000000..918ce17b4
--- /dev/null
+++ b/samples/livepatch/livepatch-shadow-fix1.c
@@ -0,0 +1,173 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
4 */
5
6/*
7 * livepatch-shadow-fix1.c - Shadow variables, livepatch demo
8 *
9 * Purpose
10 * -------
11 *
12 * Fixes the memory leak introduced in livepatch-shadow-mod through the
13 * use of a shadow variable. This fix demonstrates the "extending" of
14 * short-lived data structures by patching their allocation and release
15 * functions.
16 *
17 *
18 * Usage
19 * -----
20 *
21 * This module is not intended to be standalone. See the "Usage"
22 * section of livepatch-shadow-mod.c.
23 */
24
25#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26
27#include <linux/module.h>
28#include <linux/kernel.h>
29#include <linux/livepatch.h>
30#include <linux/slab.h>
31
32/* Shadow variable enums */
33#define SV_LEAK 1
34
35/* Allocate new dummies every second */
36#define ALLOC_PERIOD 1
37/* Check for expired dummies after a few new ones have been allocated */
38#define CLEANUP_PERIOD (3 * ALLOC_PERIOD)
39/* Dummies expire after a few cleanup instances */
40#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD)
41
42struct dummy {
43 struct list_head list;
44 unsigned long jiffies_expire;
45};
46
47/*
48 * The constructor makes more sense together with klp_shadow_get_or_alloc().
49 * In this example, it would be safe to assign the pointer also to the shadow
50 * variable returned by klp_shadow_alloc(). But we wanted to show the more
51 * complicated use of the API.
52 */
53static int shadow_leak_ctor(void *obj, void *shadow_data, void *ctor_data)
54{
55 int **shadow_leak = shadow_data;
56 int **leak = ctor_data;
57
58 if (!ctor_data)
59 return -EINVAL;
60
61 *shadow_leak = *leak;
62 return 0;
63}
64
65static struct dummy *livepatch_fix1_dummy_alloc(void)
66{
67 struct dummy *d;
68 int *leak;
69 int **shadow_leak;
70
71 d = kzalloc(sizeof(*d), GFP_KERNEL);
72 if (!d)
73 return NULL;
74
75 d->jiffies_expire = jiffies +
76 msecs_to_jiffies(1000 * EXPIRE_PERIOD);
77
78 /*
79 * Patch: save the extra memory location into a SV_LEAK shadow
80 * variable. A patched dummy_free routine can later fetch this
81 * pointer to handle resource release.
82 */
83 leak = kzalloc(sizeof(*leak), GFP_KERNEL);
84 if (!leak)
85 goto err_leak;
86
87 shadow_leak = klp_shadow_alloc(d, SV_LEAK, sizeof(leak), GFP_KERNEL,
88 shadow_leak_ctor, &leak);
89 if (!shadow_leak) {
90 pr_err("%s: failed to allocate shadow variable for the leaking pointer: dummy @ %p, leak @ %p\n",
91 __func__, d, leak);
92 goto err_shadow;
93 }
94
95 pr_info("%s: dummy @ %p, expires @ %lx\n",
96 __func__, d, d->jiffies_expire);
97
98 return d;
99
100err_shadow:
101 kfree(leak);
102err_leak:
103 kfree(d);
104 return NULL;
105}
106
107static void livepatch_fix1_dummy_leak_dtor(void *obj, void *shadow_data)
108{
109 void *d = obj;
110 int **shadow_leak = shadow_data;
111
112 kfree(*shadow_leak);
113 pr_info("%s: dummy @ %p, prevented leak @ %p\n",
114 __func__, d, *shadow_leak);
115}
116
117static void livepatch_fix1_dummy_free(struct dummy *d)
118{
119 int **shadow_leak;
120
121 /*
122 * Patch: fetch the saved SV_LEAK shadow variable, detach and
123 * free it. Note: handle cases where this shadow variable does
124 * not exist (ie, dummy structures allocated before this livepatch
125 * was loaded.)
126 */
127 shadow_leak = klp_shadow_get(d, SV_LEAK);
128 if (shadow_leak)
129 klp_shadow_free(d, SV_LEAK, livepatch_fix1_dummy_leak_dtor);
130 else
131 pr_info("%s: dummy @ %p leaked!\n", __func__, d);
132
133 kfree(d);
134}
135
136static struct klp_func funcs[] = {
137 {
138 .old_name = "dummy_alloc",
139 .new_func = livepatch_fix1_dummy_alloc,
140 },
141 {
142 .old_name = "dummy_free",
143 .new_func = livepatch_fix1_dummy_free,
144 }, { }
145};
146
147static struct klp_object objs[] = {
148 {
149 .name = "livepatch_shadow_mod",
150 .funcs = funcs,
151 }, { }
152};
153
154static struct klp_patch patch = {
155 .mod = THIS_MODULE,
156 .objs = objs,
157};
158
159static int livepatch_shadow_fix1_init(void)
160{
161 return klp_enable_patch(&patch);
162}
163
164static void livepatch_shadow_fix1_exit(void)
165{
166 /* Cleanup any existing SV_LEAK shadow variables */
167 klp_shadow_free_all(SV_LEAK, livepatch_fix1_dummy_leak_dtor);
168}
169
170module_init(livepatch_shadow_fix1_init);
171module_exit(livepatch_shadow_fix1_exit);
172MODULE_LICENSE("GPL");
173MODULE_INFO(livepatch, "Y");
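The comment above shadow_leak_ctor() notes that the constructor is only there to show the more involved API; a minimal sketch of the simpler path it alludes to, reusing this file's struct dummy *d, int *leak and SV_LEAK id (assumption: no concurrent attach here, so the ctor-free form is safe):

	/* allocate the shadow variable without a constructor and fill it
	 * in directly through the returned pointer */
	shadow_leak = klp_shadow_alloc(d, SV_LEAK, sizeof(leak), GFP_KERNEL,
				       NULL, NULL);
	if (!shadow_leak)
		goto err_shadow;
	*shadow_leak = leak;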
diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c
new file mode 100644
index 000000000..29fe5cd42
--- /dev/null
+++ b/samples/livepatch/livepatch-shadow-fix2.c
@@ -0,0 +1,132 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
4 */
5
6/*
7 * livepatch-shadow-fix2.c - Shadow variables, livepatch demo
8 *
9 * Purpose
10 * -------
11 *
12 * Adds functionality to livepatch-shadow-mod's in-flight data
13 * structures through a shadow variable. The livepatch patches a
14 * routine that periodically inspects data structures, incrementing a
15 * per-data-structure counter, creating the counter if needed.
16 *
17 *
18 * Usage
19 * -----
20 *
21 * This module is not intended to be standalone. See the "Usage"
22 * section of livepatch-shadow-mod.c.
23 */
24
25#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26
27#include <linux/module.h>
28#include <linux/kernel.h>
29#include <linux/livepatch.h>
30#include <linux/slab.h>
31
32/* Shadow variable enums */
33#define SV_LEAK 1
34#define SV_COUNTER 2
35
36struct dummy {
37 struct list_head list;
38 unsigned long jiffies_expire;
39};
40
41static bool livepatch_fix2_dummy_check(struct dummy *d, unsigned long jiffies)
42{
43 int *shadow_count;
44
45 /*
46 * Patch: handle in-flight dummy structures, if they do not
47 * already have a SV_COUNTER shadow variable, then attach a
48 * new one.
49 */
50 shadow_count = klp_shadow_get_or_alloc(d, SV_COUNTER,
51 sizeof(*shadow_count), GFP_NOWAIT,
52 NULL, NULL);
53 if (shadow_count)
54 *shadow_count += 1;
55
56 return time_after(jiffies, d->jiffies_expire);
57}
58
59static void livepatch_fix2_dummy_leak_dtor(void *obj, void *shadow_data)
60{
61 void *d = obj;
62 int **shadow_leak = shadow_data;
63
64 kfree(*shadow_leak);
65 pr_info("%s: dummy @ %p, prevented leak @ %p\n",
66 __func__, d, *shadow_leak);
67}
68
69static void livepatch_fix2_dummy_free(struct dummy *d)
70{
71 int **shadow_leak;
72 int *shadow_count;
73
74 /* Patch: copy the memory leak patch from the fix1 module. */
75 shadow_leak = klp_shadow_get(d, SV_LEAK);
76 if (shadow_leak)
77 klp_shadow_free(d, SV_LEAK, livepatch_fix2_dummy_leak_dtor);
78 else
79 pr_info("%s: dummy @ %p leaked!\n", __func__, d);
80
81 /*
82 * Patch: fetch the SV_COUNTER shadow variable and display
83 * the final count. Detach the shadow variable.
84 */
85 shadow_count = klp_shadow_get(d, SV_COUNTER);
86 if (shadow_count) {
87 pr_info("%s: dummy @ %p, check counter = %d\n",
88 __func__, d, *shadow_count);
89 klp_shadow_free(d, SV_COUNTER, NULL);
90 }
91
92 kfree(d);
93}
94
95static struct klp_func funcs[] = {
96 {
97 .old_name = "dummy_check",
98 .new_func = livepatch_fix2_dummy_check,
99 },
100 {
101 .old_name = "dummy_free",
102 .new_func = livepatch_fix2_dummy_free,
103 }, { }
104};
105
106static struct klp_object objs[] = {
107 {
108 .name = "livepatch_shadow_mod",
109 .funcs = funcs,
110 }, { }
111};
112
113static struct klp_patch patch = {
114 .mod = THIS_MODULE,
115 .objs = objs,
116};
117
118static int livepatch_shadow_fix2_init(void)
119{
120 return klp_enable_patch(&patch);
121}
122
123static void livepatch_shadow_fix2_exit(void)
124{
125 /* Cleanup any existing SV_COUNTER shadow variables */
126 klp_shadow_free_all(SV_COUNTER, NULL);
127}
128
129module_init(livepatch_shadow_fix2_init);
130module_exit(livepatch_shadow_fix2_exit);
131MODULE_LICENSE("GPL");
132MODULE_INFO(livepatch, "Y");
diff --git a/samples/livepatch/livepatch-shadow-mod.c b/samples/livepatch/livepatch-shadow-mod.c
new file mode 100644
index 000000000..7e753b0d2
--- /dev/null
+++ b/samples/livepatch/livepatch-shadow-mod.c
@@ -0,0 +1,217 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
4 */
5
6/*
7 * livepatch-shadow-mod.c - Shadow variables, buggy module demo
8 *
9 * Purpose
10 * -------
11 *
12 * As a demonstration of livepatch shadow variable API, this module
13 * introduces memory leak behavior that livepatch modules
14 * livepatch-shadow-fix1.ko and livepatch-shadow-fix2.ko correct and
15 * enhance.
16 *
17 * WARNING - even though the livepatch-shadow-fix modules patch the
18 * memory leak, please load these modules at your own risk -- some
19 * amount of memory may be leaked before the bug is patched.
20 *
21 *
22 * Usage
23 * -----
24 *
25 * Step 1 - Load the buggy demonstration module:
26 *
27 * insmod samples/livepatch/livepatch-shadow-mod.ko
28 *
29 * Watch dmesg output for a few moments to see new dummies being allocated
30 * and a periodic cleanup check. (Note: a small amount of memory is
31 * being leaked.)
32 *
33 *
34 * Step 2 - Load livepatch fix1:
35 *
36 * insmod samples/livepatch/livepatch-shadow-fix1.ko
37 *
38 * Continue watching dmesg and note that now livepatch_fix1_dummy_free()
39 * and livepatch_fix1_dummy_alloc() are logging messages about leaked
40 * memory and, eventually, about leaks being prevented.
41 *
42 *
43 * Step 3 - Load livepatch fix2 (on top of fix1):
44 *
45 * insmod samples/livepatch/livepatch-shadow-fix2.ko
46 *
47 * This module extends functionality through shadow variables, as a new
48 * "check" counter is added to the dummy structure. Periodic dmesg
49 * messages will log these as dummies are cleaned up.
50 *
51 *
52 * Step 4 - Cleanup
53 *
54 * Unwind the demonstration by disabling the livepatch fix modules, then
55 * removing them and the demo module:
56 *
57 * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix2/enabled
58 * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix1/enabled
59 * rmmod livepatch-shadow-fix2
60 * rmmod livepatch-shadow-fix1
61 * rmmod livepatch-shadow-mod
62 */
63
64
65#include <linux/kernel.h>
66#include <linux/module.h>
67#include <linux/sched.h>
68#include <linux/slab.h>
69#include <linux/stat.h>
70#include <linux/workqueue.h>
71
72MODULE_LICENSE("GPL");
73MODULE_AUTHOR("Joe Lawrence <joe.lawrence@redhat.com>");
74MODULE_DESCRIPTION("Buggy module for shadow variable demo");
75
76/* Allocate new dummies every second */
77#define ALLOC_PERIOD 1
78/* Check for expired dummies after a few new ones have been allocated */
79#define CLEANUP_PERIOD (3 * ALLOC_PERIOD)
80/* Dummies expire after a few cleanup instances */
81#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD)
82
83/*
84 * Keep a list of all the dummies so we can clean up any residual ones
85 * on module exit
86 */
87static LIST_HEAD(dummy_list);
88static DEFINE_MUTEX(dummy_list_mutex);
89
90struct dummy {
91 struct list_head list;
92 unsigned long jiffies_expire;
93};
94
95static __used noinline struct dummy *dummy_alloc(void)
96{
97 struct dummy *d;
98 int *leak;
99
100 d = kzalloc(sizeof(*d), GFP_KERNEL);
101 if (!d)
102 return NULL;
103
104 d->jiffies_expire = jiffies +
105 msecs_to_jiffies(1000 * EXPIRE_PERIOD);
106
107 /* Oops, forgot to save leak! */
108 leak = kzalloc(sizeof(*leak), GFP_KERNEL);
109 if (!leak) {
110 kfree(d);
111 return NULL;
112 }
113
114 pr_info("%s: dummy @ %p, expires @ %lx\n",
115 __func__, d, d->jiffies_expire);
116
117 return d;
118}
119
120static __used noinline void dummy_free(struct dummy *d)
121{
122 pr_info("%s: dummy @ %p, expired = %lx\n",
123 __func__, d, d->jiffies_expire);
124
125 kfree(d);
126}
127
128static __used noinline bool dummy_check(struct dummy *d,
129 unsigned long jiffies)
130{
131 return time_after(jiffies, d->jiffies_expire);
132}
133
134/*
135 * alloc_work_func: allocates new dummy structures plus additional
136 * memory, aptly named "leak", but doesn't keep a
137 * permanent record of it.
138 */
139
140static void alloc_work_func(struct work_struct *work);
141static DECLARE_DELAYED_WORK(alloc_dwork, alloc_work_func);
142
143static void alloc_work_func(struct work_struct *work)
144{
145 struct dummy *d;
146
147 d = dummy_alloc();
148 if (!d)
149 return;
150
151 mutex_lock(&dummy_list_mutex);
152 list_add(&d->list, &dummy_list);
153 mutex_unlock(&dummy_list_mutex);
154
155 schedule_delayed_work(&alloc_dwork,
156 msecs_to_jiffies(1000 * ALLOC_PERIOD));
157}
158
159/*
160 * cleanup_work_func: frees dummy structures. Without knowledge of
161 * "leak", it leaks the additional memory that
162 * alloc_work_func created.
163 */
164
165static void cleanup_work_func(struct work_struct *work);
166static DECLARE_DELAYED_WORK(cleanup_dwork, cleanup_work_func);
167
168static void cleanup_work_func(struct work_struct *work)
169{
170 struct dummy *d, *tmp;
171 unsigned long j;
172
173 j = jiffies;
174 pr_info("%s: jiffies = %lx\n", __func__, j);
175
176 mutex_lock(&dummy_list_mutex);
177 list_for_each_entry_safe(d, tmp, &dummy_list, list) {
178
179 /* Kick out and free any expired dummies */
180 if (dummy_check(d, j)) {
181 list_del(&d->list);
182 dummy_free(d);
183 }
184 }
185 mutex_unlock(&dummy_list_mutex);
186
187 schedule_delayed_work(&cleanup_dwork,
188 msecs_to_jiffies(1000 * CLEANUP_PERIOD));
189}
190
191static int livepatch_shadow_mod_init(void)
192{
193 schedule_delayed_work(&alloc_dwork,
194 msecs_to_jiffies(1000 * ALLOC_PERIOD));
195 schedule_delayed_work(&cleanup_dwork,
196 msecs_to_jiffies(1000 * CLEANUP_PERIOD));
197
198 return 0;
199}
200
201static void livepatch_shadow_mod_exit(void)
202{
203 struct dummy *d, *tmp;
204
205 /* Wait for any dummies at work */
206 cancel_delayed_work_sync(&alloc_dwork);
207 cancel_delayed_work_sync(&cleanup_dwork);
208
209 /* Cleanup residual dummies */
210 list_for_each_entry_safe(d, tmp, &dummy_list, list) {
211 list_del(&d->list);
212 dummy_free(d);
213 }
214}
215
216module_init(livepatch_shadow_mod_init);
217module_exit(livepatch_shadow_mod_exit);
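A hedged sketch of the dmesg flow the Usage steps describe (the pr_fmt prefix is the module name; pointer and jiffies values are illustrative placeholders):

	livepatch_shadow_mod: dummy_alloc: dummy @ <ptr>, expires @ <jiffies>
	livepatch_shadow_mod: cleanup_work_func: jiffies = <jiffies>
	livepatch_shadow_mod: dummy_free: dummy @ <ptr>, expired = <jiffies>

	# after fix1 is loaded, dummies allocated before the patch report:
	livepatch_shadow_fix1: livepatch_fix1_dummy_free: dummy @ <ptr> leaked!
	# while post-patch allocations are cleaned up:
	livepatch_shadow_fix1: livepatch_fix1_dummy_leak_dtor: dummy @ <ptr>, prevented leak @ <ptr>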
diff --git a/samples/mei/.gitignore b/samples/mei/.gitignore
new file mode 100644
index 000000000..db5e802f0
--- /dev/null
+++ b/samples/mei/.gitignore
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0-only
2mei-amt-version
diff --git a/samples/mei/Makefile b/samples/mei/Makefile
new file mode 100644
index 000000000..c54b8a0ab
--- /dev/null
+++ b/samples/mei/Makefile
@@ -0,0 +1,5 @@
1# SPDX-License-Identifier: GPL-2.0
2# Copyright (c) 2012-2019, Intel Corporation. All rights reserved.
3userprogs-always-y += mei-amt-version
4
5userccflags += -I usr/include
diff --git a/samples/mei/mei-amt-version.c b/samples/mei/mei-amt-version.c
new file mode 100644
index 000000000..ad3e56042
--- /dev/null
+++ b/samples/mei/mei-amt-version.c
@@ -0,0 +1,479 @@
1/******************************************************************************
2 * Intel Management Engine Interface (Intel MEI) Linux driver
3 * Intel MEI Interface Header
4 *
5 * This file is provided under a dual BSD/GPLv2 license. When using or
6 * redistributing this file, you may do so under either license.
7 *
8 * GPL LICENSE SUMMARY
9 *
10 * Copyright(c) 2012 Intel Corporation. All rights reserved.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of version 2 of the GNU General Public License as
14 * published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110,
24 * USA
25 *
26 * The full GNU General Public License is included in this distribution
27 * in the file called LICENSE.GPL.
28 *
29 * Contact Information:
30 * Intel Corporation.
31 * linux-mei@linux.intel.com
32 * http://www.intel.com
33 *
34 * BSD LICENSE
35 *
36 * Copyright(c) 2003 - 2012 Intel Corporation. All rights reserved.
37 * All rights reserved.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 *
43 * * Redistributions of source code must retain the above copyright
44 * notice, this list of conditions and the following disclaimer.
45 * * Redistributions in binary form must reproduce the above copyright
46 * notice, this list of conditions and the following disclaimer in
47 * the documentation and/or other materials provided with the
48 * distribution.
49 * * Neither the name Intel Corporation nor the names of its
50 * contributors may be used to endorse or promote products derived
51 * from this software without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
54 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
55 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
56 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
57 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
58 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
59 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
60 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
61 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
62 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
63 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64 *
65 *****************************************************************************/
66
67#include <stdio.h>
68#include <stdlib.h>
69#include <string.h>
70#include <fcntl.h>
71#include <sys/ioctl.h>
72#include <unistd.h>
73#include <errno.h>
74#include <stdint.h>
75#include <stdbool.h>
76#include <bits/wordsize.h>
77#include <linux/mei.h>
78
79/*****************************************************************************
80 * Intel Management Engine Interface
81 *****************************************************************************/
82
83#define mei_msg(_me, fmt, ARGS...) do { \
84 if (_me->verbose) \
85 fprintf(stderr, fmt, ##ARGS); \
86} while (0)
87
88#define mei_err(_me, fmt, ARGS...) do { \
89 fprintf(stderr, "Error: " fmt, ##ARGS); \
90} while (0)
91
92struct mei {
93 uuid_le guid;
94 bool initialized;
95 bool verbose;
96 unsigned int buf_size;
97 unsigned char prot_ver;
98 int fd;
99};
100
101static void mei_deinit(struct mei *cl)
102{
103 if (cl->fd != -1)
104 close(cl->fd);
105 cl->fd = -1;
106 cl->buf_size = 0;
107 cl->prot_ver = 0;
108 cl->initialized = false;
109}
110
111static bool mei_init(struct mei *me, const uuid_le *guid,
112 unsigned char req_protocol_version, bool verbose)
113{
114 int result;
115 struct mei_client *cl;
116 struct mei_connect_client_data data;
117
118 me->verbose = verbose;
119
120 me->fd = open("/dev/mei0", O_RDWR);
121 if (me->fd == -1) {
122 mei_err(me, "Cannot establish a handle to the Intel MEI driver\n");
123 goto err;
124 }
125 memcpy(&me->guid, guid, sizeof(*guid));
126 memset(&data, 0, sizeof(data));
127 me->initialized = true;
128
129 memcpy(&data.in_client_uuid, &me->guid, sizeof(me->guid));
130 result = ioctl(me->fd, IOCTL_MEI_CONNECT_CLIENT, &data);
131 if (result) {
132 mei_err(me, "IOCTL_MEI_CONNECT_CLIENT receive message. err=%d\n", result);
133 goto err;
134 }
135 cl = &data.out_client_properties;
136 mei_msg(me, "max_message_length %d\n", cl->max_msg_length);
137 mei_msg(me, "protocol_version %d\n", cl->protocol_version);
138
139 if ((req_protocol_version > 0) &&
140 (cl->protocol_version != req_protocol_version)) {
141 mei_err(me, "Intel MEI protocol version not supported\n");
142 goto err;
143 }
144
145 me->buf_size = cl->max_msg_length;
146 me->prot_ver = cl->protocol_version;
147
148 return true;
149err:
150 mei_deinit(me);
151 return false;
152}
153
154static ssize_t mei_recv_msg(struct mei *me, unsigned char *buffer,
155 ssize_t len, unsigned long timeout)
156{
157 ssize_t rc;
158
159 mei_msg(me, "call read length = %zd\n", len);
160
161 rc = read(me->fd, buffer, len);
162 if (rc < 0) {
163 mei_err(me, "read failed with status %zd %s\n",
164 rc, strerror(errno));
165 mei_deinit(me);
166 } else {
167 mei_msg(me, "read succeeded with result %zd\n", rc);
168 }
169 return rc;
170}
171
172static ssize_t mei_send_msg(struct mei *me, const unsigned char *buffer,
173 ssize_t len, unsigned long timeout)
174{
175 struct timeval tv;
176 ssize_t written;
177 ssize_t rc;
178 fd_set set;
179
180 tv.tv_sec = timeout / 1000;
181	tv.tv_usec = (timeout % 1000) * 1000; /* ms -> us */
182
183 mei_msg(me, "call write length = %zd\n", len);
184
185 written = write(me->fd, buffer, len);
186 if (written < 0) {
187 rc = -errno;
188 mei_err(me, "write failed with status %zd %s\n",
189 written, strerror(errno));
190 goto out;
191 }
192
193 FD_ZERO(&set);
194 FD_SET(me->fd, &set);
195	rc = select(me->fd + 1, &set, NULL, NULL, &tv);
196 if (rc > 0 && FD_ISSET(me->fd, &set)) {
197 mei_msg(me, "write success\n");
198 } else if (rc == 0) {
199 mei_err(me, "write failed on timeout with status\n");
200 goto out;
201 } else { /* rc < 0 */
202 mei_err(me, "write failed on select with status %zd\n", rc);
203 goto out;
204 }
205
206 rc = written;
207out:
208 if (rc < 0)
209 mei_deinit(me);
210
211 return rc;
212}
213
214/***************************************************************************
215 * Intel Advanced Management Technology ME Client
216 ***************************************************************************/
217
218#define AMT_MAJOR_VERSION 1
219#define AMT_MINOR_VERSION 1
220
221#define AMT_STATUS_SUCCESS 0x0
222#define AMT_STATUS_INTERNAL_ERROR 0x1
223#define AMT_STATUS_NOT_READY 0x2
224#define AMT_STATUS_INVALID_AMT_MODE 0x3
225#define AMT_STATUS_INVALID_MESSAGE_LENGTH 0x4
226
227#define AMT_STATUS_HOST_IF_EMPTY_RESPONSE 0x4000
228#define AMT_STATUS_SDK_RESOURCES 0x1004
229
230
231#define AMT_BIOS_VERSION_LEN 65
232#define AMT_VERSIONS_NUMBER 50
233#define AMT_UNICODE_STRING_LEN 20
234
235struct amt_unicode_string {
236 uint16_t length;
237 char string[AMT_UNICODE_STRING_LEN];
238} __attribute__((packed));
239
240struct amt_version_type {
241 struct amt_unicode_string description;
242 struct amt_unicode_string version;
243} __attribute__((packed));
244
245struct amt_version {
246 uint8_t major;
247 uint8_t minor;
248} __attribute__((packed));
249
250struct amt_code_versions {
251 uint8_t bios[AMT_BIOS_VERSION_LEN];
252 uint32_t count;
253 struct amt_version_type versions[AMT_VERSIONS_NUMBER];
254} __attribute__((packed));
255
256/***************************************************************************
257 * Intel Advanced Management Technology Host Interface
258 ***************************************************************************/
259
260struct amt_host_if_msg_header {
261 struct amt_version version;
262 uint16_t _reserved;
263 uint32_t command;
264 uint32_t length;
265} __attribute__((packed));
266
267struct amt_host_if_resp_header {
268 struct amt_host_if_msg_header header;
269 uint32_t status;
270 unsigned char data[];
271} __attribute__((packed));
272
273const uuid_le MEI_IAMTHIF = UUID_LE(0x12f80028, 0xb4b7, 0x4b2d, \
274 0xac, 0xa8, 0x46, 0xe0, 0xff, 0x65, 0x81, 0x4c);
275
276#define AMT_HOST_IF_CODE_VERSIONS_REQUEST 0x0400001A
277#define AMT_HOST_IF_CODE_VERSIONS_RESPONSE 0x0480001A
278
279const struct amt_host_if_msg_header CODE_VERSION_REQ = {
280 .version = {AMT_MAJOR_VERSION, AMT_MINOR_VERSION},
281 ._reserved = 0,
282 .command = AMT_HOST_IF_CODE_VERSIONS_REQUEST,
283 .length = 0
284};
285
286
287struct amt_host_if {
288 struct mei mei_cl;
289 unsigned long send_timeout;
290 bool initialized;
291};
292
293
294static bool amt_host_if_init(struct amt_host_if *acmd,
295 unsigned long send_timeout, bool verbose)
296{
297 acmd->send_timeout = (send_timeout) ? send_timeout : 20000;
298 acmd->initialized = mei_init(&acmd->mei_cl, &MEI_IAMTHIF, 0, verbose);
299 return acmd->initialized;
300}
301
302static void amt_host_if_deinit(struct amt_host_if *acmd)
303{
304 mei_deinit(&acmd->mei_cl);
305 acmd->initialized = false;
306}
307
308static uint32_t amt_verify_code_versions(const struct amt_host_if_resp_header *resp)
309{
310 uint32_t status = AMT_STATUS_SUCCESS;
311 struct amt_code_versions *code_ver;
312 size_t code_ver_len;
313 uint32_t ver_type_cnt;
314 uint32_t len;
315 uint32_t i;
316
317 code_ver = (struct amt_code_versions *)resp->data;
318 /* length - sizeof(status) */
319 code_ver_len = resp->header.length - sizeof(uint32_t);
320 ver_type_cnt = code_ver_len -
321 sizeof(code_ver->bios) -
322 sizeof(code_ver->count);
323 if (code_ver->count != ver_type_cnt / sizeof(struct amt_version_type)) {
324 status = AMT_STATUS_INTERNAL_ERROR;
325 goto out;
326 }
327
328 for (i = 0; i < code_ver->count; i++) {
329 len = code_ver->versions[i].description.length;
330
331 if (len > AMT_UNICODE_STRING_LEN) {
332 status = AMT_STATUS_INTERNAL_ERROR;
333 goto out;
334 }
335
336 len = code_ver->versions[i].version.length;
337	if (len >= AMT_UNICODE_STRING_LEN || code_ver->versions[i].version.string[len] != '\0' ||
338 len != strlen(code_ver->versions[i].version.string)) {
339 status = AMT_STATUS_INTERNAL_ERROR;
340 goto out;
341 }
342 }
343out:
344 return status;
345}
346
347static uint32_t amt_verify_response_header(uint32_t command,
348 const struct amt_host_if_msg_header *resp_hdr,
349 uint32_t response_size)
350{
351 if (response_size < sizeof(struct amt_host_if_resp_header)) {
352 return AMT_STATUS_INTERNAL_ERROR;
353 } else if (response_size != (resp_hdr->length +
354 sizeof(struct amt_host_if_msg_header))) {
355 return AMT_STATUS_INTERNAL_ERROR;
356 } else if (resp_hdr->command != command) {
357 return AMT_STATUS_INTERNAL_ERROR;
358 } else if (resp_hdr->_reserved != 0) {
359 return AMT_STATUS_INTERNAL_ERROR;
360 } else if (resp_hdr->version.major != AMT_MAJOR_VERSION ||
361 resp_hdr->version.minor < AMT_MINOR_VERSION) {
362 return AMT_STATUS_INTERNAL_ERROR;
363 }
364 return AMT_STATUS_SUCCESS;
365}
366
367static uint32_t amt_host_if_call(struct amt_host_if *acmd,
368 const unsigned char *command, ssize_t command_sz,
369 uint8_t **read_buf, uint32_t rcmd,
370 unsigned int expected_sz)
371{
372 uint32_t in_buf_sz;
373 ssize_t out_buf_sz;
374 ssize_t written;
375 uint32_t status;
376 struct amt_host_if_resp_header *msg_hdr;
377
378 in_buf_sz = acmd->mei_cl.buf_size;
379 *read_buf = (uint8_t *)malloc(sizeof(uint8_t) * in_buf_sz);
380 if (*read_buf == NULL)
381 return AMT_STATUS_SDK_RESOURCES;
382 memset(*read_buf, 0, in_buf_sz);
383 msg_hdr = (struct amt_host_if_resp_header *)*read_buf;
384
385 written = mei_send_msg(&acmd->mei_cl,
386 command, command_sz, acmd->send_timeout);
387 if (written != command_sz)
388 return AMT_STATUS_INTERNAL_ERROR;
389
390 out_buf_sz = mei_recv_msg(&acmd->mei_cl, *read_buf, in_buf_sz, 2000);
391 if (out_buf_sz <= 0)
392 return AMT_STATUS_HOST_IF_EMPTY_RESPONSE;
393
394 status = msg_hdr->status;
395 if (status != AMT_STATUS_SUCCESS)
396 return status;
397
398 status = amt_verify_response_header(rcmd,
399 &msg_hdr->header, out_buf_sz);
400 if (status != AMT_STATUS_SUCCESS)
401 return status;
402
403 if (expected_sz && expected_sz != out_buf_sz)
404 return AMT_STATUS_INTERNAL_ERROR;
405
406 return AMT_STATUS_SUCCESS;
407}
408
409
410static uint32_t amt_get_code_versions(struct amt_host_if *cmd,
411 struct amt_code_versions *versions)
412{
413 struct amt_host_if_resp_header *response = NULL;
414 uint32_t status;
415
416 status = amt_host_if_call(cmd,
417 (const unsigned char *)&CODE_VERSION_REQ,
418 sizeof(CODE_VERSION_REQ),
419 (uint8_t **)&response,
420 AMT_HOST_IF_CODE_VERSIONS_RESPONSE, 0);
421
422 if (status != AMT_STATUS_SUCCESS)
423 goto out;
424
425 status = amt_verify_code_versions(response);
426 if (status != AMT_STATUS_SUCCESS)
427 goto out;
428
429 memcpy(versions, response->data, sizeof(struct amt_code_versions));
430out:
431 if (response != NULL)
432 free(response);
433
434 return status;
435}
436
437/************************** end of amt_host_if_command ***********************/
438int main(int argc, char **argv)
439{
440 struct amt_code_versions ver;
441 struct amt_host_if acmd;
442 unsigned int i;
443 uint32_t status;
444 int ret;
445 bool verbose;
446
447 verbose = (argc > 1 && strcmp(argv[1], "-v") == 0);
448
449 if (!amt_host_if_init(&acmd, 5000, verbose)) {
450 ret = 1;
451 goto out;
452 }
453
454 status = amt_get_code_versions(&acmd, &ver);
455
456 amt_host_if_deinit(&acmd);
457
458 switch (status) {
459 case AMT_STATUS_HOST_IF_EMPTY_RESPONSE:
460 printf("Intel AMT: DISABLED\n");
461 ret = 0;
462 break;
463 case AMT_STATUS_SUCCESS:
464 printf("Intel AMT: ENABLED\n");
465 for (i = 0; i < ver.count; i++) {
466 printf("%s:\t%s\n", ver.versions[i].description.string,
467 ver.versions[i].version.string);
468 }
469 ret = 0;
470 break;
471 default:
472 printf("An error has occurred\n");
473 ret = 1;
474 break;
475 }
476
477out:
478 return ret;
479}
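A hedged run sketch (needs root, the mei driver bound and /dev/mei0 present; the firmware-reported description/version pairs below are purely illustrative):

	# ./mei-amt-version
	Intel AMT: ENABLED
	Flash:	7.1.20
	Netstack:	7.1.20

	# ./mei-amt-version -v
	call write length = 12
	write success
	call read length = <buf_size>
	read succeeded with result <n>
	Intel AMT: ENABLED
	...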
diff --git a/samples/nitro_enclaves/.gitignore b/samples/nitro_enclaves/.gitignore
new file mode 100644
index 000000000..827934129
--- /dev/null
+++ b/samples/nitro_enclaves/.gitignore
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0
2ne_ioctl_sample
diff --git a/samples/nitro_enclaves/Makefile b/samples/nitro_enclaves/Makefile
new file mode 100644
index 000000000..a3ec78fef
--- /dev/null
+++ b/samples/nitro_enclaves/Makefile
@@ -0,0 +1,16 @@
1# SPDX-License-Identifier: GPL-2.0
2#
3# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4
5# Enclave lifetime management support for Nitro Enclaves (NE) - ioctl sample
6# usage.
7
8.PHONY: all clean
9
10CFLAGS += -Wall
11
12all:
13 $(CC) $(CFLAGS) -o ne_ioctl_sample ne_ioctl_sample.c -lpthread
14
15clean:
16 rm -f ne_ioctl_sample
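A hedged preparation sketch matching the defaults in ne_ioctl_sample.c below (NE_DEFAULT_NR_MEM_REGIONS regions of 2 MiB, i.e. 256 huge pages, and 2 vCPUs; the CPU list "2,3" is only an example and must consist of full cores from one NUMA node, excluding CPU 0 and its siblings):

	insmod drivers/virt/nitro_enclaves/nitro_enclaves.ko
	echo 2,3 > /sys/module/nitro_enclaves/parameters/ne_cpus
	echo 256 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
	make -C samples/nitro_enclaves
	./samples/nitro_enclaves/ne_ioctl_sample <path_to_enclave_image>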
diff --git a/samples/nitro_enclaves/ne_ioctl_sample.c b/samples/nitro_enclaves/ne_ioctl_sample.c
new file mode 100644
index 000000000..480b76314
--- /dev/null
+++ b/samples/nitro_enclaves/ne_ioctl_sample.c
@@ -0,0 +1,883 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 */
5
6/**
7 * DOC: Sample flow of using the ioctl interface provided by the Nitro Enclaves (NE)
8 * kernel driver.
9 *
10 * Usage
11 * -----
12 *
13 * Load the nitro_enclaves module, setting also the enclave CPU pool. The
14 * enclave CPUs need to be full cores from the same NUMA node. CPU 0 and its
15 * siblings have to remain available for the primary / parent VM, so they
16 * cannot be included in the enclave CPU pool.
17 *
18 * See the cpu list section from the kernel documentation.
19 * https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists
20 *
21 * insmod drivers/virt/nitro_enclaves/nitro_enclaves.ko
22 * lsmod
23 *
24 * The CPU pool can be set at runtime, after the kernel module is loaded.
25 *
26 * echo <cpu-list> > /sys/module/nitro_enclaves/parameters/ne_cpus
27 *
28 * NUMA and CPU siblings information can be found using:
29 *
30 * lscpu
31 * /proc/cpuinfo
32 *
33 * Check the online / offline CPU list. The CPUs from the pool should be
34 * offlined.
35 *
36 * lscpu
37 *
38 * Check dmesg for any warnings / errors through the NE driver lifetime / usage.
39 * The NE logs contain the "nitro_enclaves" or "pci 0000:00:02.0" pattern.
40 *
41 * dmesg
42 *
43 * Setup hugetlbfs huge pages. The memory needs to be from the same NUMA node as
44 * the enclave CPUs.
45 *
46 * https://www.kernel.org/doc/html/latest/admin-guide/mm/hugetlbpage.html
47 *
48 * By default, the allocation of hugetlb pages is distributed across all possible
49 * NUMA nodes. Use the following configuration files to set the number of huge
50 * pages from a NUMA node:
51 *
52 * /sys/devices/system/node/node<X>/hugepages/hugepages-2048kB/nr_hugepages
53 * /sys/devices/system/node/node<X>/hugepages/hugepages-1048576kB/nr_hugepages
54 *
55 * or, on a system without multiple NUMA nodes, you can also set the number
56 * of 2 MiB / 1 GiB huge pages using
57 *
58 * /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
59 * /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
60 *
61 * In this example 256 hugepages of 2 MiB are used.
62 *
63 * Build and run the NE sample.
64 *
65 * make -C samples/nitro_enclaves clean
66 * make -C samples/nitro_enclaves
67 * ./samples/nitro_enclaves/ne_ioctl_sample <path_to_enclave_image>
68 *
69 * Unload the nitro_enclaves module.
70 *
71 * rmmod nitro_enclaves
72 * lsmod
73 */
74
75#include <stdio.h>
76#include <stdlib.h>
77#include <errno.h>
78#include <fcntl.h>
79#include <limits.h>
80#include <poll.h>
81#include <pthread.h>
82#include <string.h>
83#include <sys/eventfd.h>
84#include <sys/ioctl.h>
85#include <sys/mman.h>
86#include <sys/socket.h>
87#include <sys/stat.h>
88#include <sys/types.h>
89#include <unistd.h>
90
91#include <linux/mman.h>
92#include <linux/nitro_enclaves.h>
93#include <linux/vm_sockets.h>
94
95/**
96 * NE_DEV_NAME - Nitro Enclaves (NE) misc device that provides the ioctl interface.
97 */
98#define NE_DEV_NAME "/dev/nitro_enclaves"
99
100/**
101 * NE_POLL_WAIT_TIME - Timeout in seconds for each poll event.
102 */
103#define NE_POLL_WAIT_TIME (60)
104/**
105 * NE_POLL_WAIT_TIME_MS - Timeout in milliseconds for each poll event.
106 */
107#define NE_POLL_WAIT_TIME_MS (NE_POLL_WAIT_TIME * 1000)
108
109/**
110 * NE_SLEEP_TIME - Amount of time in seconds for the process to keep the enclave alive.
111 */
112#define NE_SLEEP_TIME (300)
113
114/**
115 * NE_DEFAULT_NR_VCPUS - Default number of vCPUs set for an enclave.
116 */
117#define NE_DEFAULT_NR_VCPUS (2)
118
119/**
120 * NE_MIN_MEM_REGION_SIZE - Minimum size of a memory region - 2 MiB.
121 */
122#define NE_MIN_MEM_REGION_SIZE (2 * 1024 * 1024)
123
124/**
125 * NE_DEFAULT_NR_MEM_REGIONS - Default number of memory regions of 2 MiB set for
126 * an enclave.
127 */
128#define NE_DEFAULT_NR_MEM_REGIONS (256)
129
130/**
131 * NE_IMAGE_LOAD_HEARTBEAT_CID - Vsock CID for enclave image loading heartbeat logic.
132 */
133#define NE_IMAGE_LOAD_HEARTBEAT_CID (3)
134/**
135 * NE_IMAGE_LOAD_HEARTBEAT_PORT - Vsock port for enclave image loading heartbeat logic.
136 */
137#define NE_IMAGE_LOAD_HEARTBEAT_PORT (9000)
138/**
139 * NE_IMAGE_LOAD_HEARTBEAT_VALUE - Heartbeat value for enclave image loading.
140 */
141#define NE_IMAGE_LOAD_HEARTBEAT_VALUE (0xb7)
142
143/**
144 * struct ne_user_mem_region - User space memory region set for an enclave.
145 * @userspace_addr: Address of the user space memory region.
146 * @memory_size: Size of the user space memory region.
147 */
148struct ne_user_mem_region {
149 void *userspace_addr;
150 size_t memory_size;
151};
152
153/**
154 * ne_create_vm() - Create a slot for the enclave VM.
155 * @ne_dev_fd: The file descriptor of the NE misc device.
156 * @slot_uid: The generated slot uid for the enclave.
157 * @enclave_fd : The generated file descriptor for the enclave.
158 *
159 * Context: Process context.
160 * Return:
161 * * 0 on success.
162 * * Negative return value on failure.
163 */
164static int ne_create_vm(int ne_dev_fd, unsigned long *slot_uid, int *enclave_fd)
165{
166 int rc = -EINVAL;
167 *enclave_fd = ioctl(ne_dev_fd, NE_CREATE_VM, slot_uid);
168
169 if (*enclave_fd < 0) {
170 rc = *enclave_fd;
171 switch (errno) {
172 case NE_ERR_NO_CPUS_AVAIL_IN_POOL: {
173 printf("Error in create VM, no CPUs available in the NE CPU pool\n");
174
175 break;
176 }
177
178 default:
179 printf("Error in create VM [%m]\n");
180 }
181
182 return rc;
183 }
184
185 return 0;
186}
187
188
189/**
190 * ne_poll_enclave_fd() - Thread function for polling the enclave fd.
191 * @data: Argument provided for the polling function.
192 *
193 * Context: Process context.
194 * Return:
195 * * NULL on success / failure.
196 */
197void *ne_poll_enclave_fd(void *data)
198{
199 int enclave_fd = *(int *)data;
200 struct pollfd fds[1] = {};
201 int i = 0;
202 int rc = -EINVAL;
203
204 printf("Running from poll thread, enclave fd %d\n", enclave_fd);
205
206 fds[0].fd = enclave_fd;
207 fds[0].events = POLLIN | POLLERR | POLLHUP;
208
209 /* Keep on polling until the current process is terminated. */
210 while (1) {
211 printf("[iter %d] Polling ...\n", i);
212
213 rc = poll(fds, 1, NE_POLL_WAIT_TIME_MS);
214 if (rc < 0) {
215 printf("Error in poll [%m]\n");
216
217 return NULL;
218 }
219
220 i++;
221
222 if (!rc) {
223 printf("Poll: %d seconds elapsed\n",
224 i * NE_POLL_WAIT_TIME);
225
226 continue;
227 }
228
229 printf("Poll received value 0x%x\n", fds[0].revents);
230
231 if (fds[0].revents & POLLHUP) {
232 printf("Received POLLHUP\n");
233
234 return NULL;
235 }
236
237 if (fds[0].revents & POLLNVAL) {
238 printf("Received POLLNVAL\n");
239
240 return NULL;
241 }
242 }
243
244 return NULL;
245}
246
247/**
248 * ne_alloc_user_mem_region() - Allocate a user space memory region for an enclave.
249 * @ne_user_mem_region: User space memory region allocated using hugetlbfs.
250 *
251 * Context: Process context.
252 * Return:
253 * * 0 on success.
254 * * Negative return value on failure.
255 */
256static int ne_alloc_user_mem_region(struct ne_user_mem_region *ne_user_mem_region)
257{
258 /**
259 * Check available hugetlb encodings for different huge page sizes in
260 * include/uapi/linux/mman.h.
261 */
262 ne_user_mem_region->userspace_addr = mmap(NULL, ne_user_mem_region->memory_size,
263 PROT_READ | PROT_WRITE,
264 MAP_PRIVATE | MAP_ANONYMOUS |
265 MAP_HUGETLB | MAP_HUGE_2MB, -1, 0);
266 if (ne_user_mem_region->userspace_addr == MAP_FAILED) {
267 printf("Error in mmap memory [%m]\n");
268
269 return -1;
270 }
271
272 return 0;
273}
274
275/**
276 * ne_load_enclave_image() - Place the enclave image in the enclave memory.
277 * @enclave_fd : The file descriptor associated with the enclave.
278 * @ne_user_mem_regions: User space memory regions allocated for the enclave.
279 * @enclave_image_path : The file path of the enclave image.
280 *
281 * Context: Process context.
282 * Return:
283 * * 0 on success.
284 * * Negative return value on failure.
285 */
286static int ne_load_enclave_image(int enclave_fd, struct ne_user_mem_region ne_user_mem_regions[],
287 char *enclave_image_path)
288{
289 unsigned char *enclave_image = NULL;
290 int enclave_image_fd = -1;
291 size_t enclave_image_size = 0;
292 size_t enclave_memory_size = 0;
293 unsigned long i = 0;
294 size_t image_written_bytes = 0;
295 struct ne_image_load_info image_load_info = {
296 .flags = NE_EIF_IMAGE,
297 };
298 struct stat image_stat_buf = {};
299 int rc = -EINVAL;
300 size_t temp_image_offset = 0;
301
302 for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++)
303 enclave_memory_size += ne_user_mem_regions[i].memory_size;
304
305 rc = stat(enclave_image_path, &image_stat_buf);
306 if (rc < 0) {
307 printf("Error in get image stat info [%m]\n");
308
309 return rc;
310 }
311
312 enclave_image_size = image_stat_buf.st_size;
313
314 if (enclave_memory_size < enclave_image_size) {
315 printf("The enclave memory is smaller than the enclave image size\n");
316
317 return -ENOMEM;
318 }
319
320 rc = ioctl(enclave_fd, NE_GET_IMAGE_LOAD_INFO, &image_load_info);
321 if (rc < 0) {
322 switch (errno) {
323 case NE_ERR_NOT_IN_INIT_STATE: {
324 printf("Error in get image load info, enclave not in init state\n");
325
326 break;
327 }
328
329 case NE_ERR_INVALID_FLAG_VALUE: {
330 printf("Error in get image load info, provided invalid flag\n");
331
332 break;
333 }
334
335 default:
336 printf("Error in get image load info [%m]\n");
337 }
338
339 return rc;
340 }
341
342 printf("Enclave image offset in enclave memory is %lld\n",
343 image_load_info.memory_offset);
344
345 enclave_image_fd = open(enclave_image_path, O_RDONLY);
346 if (enclave_image_fd < 0) {
347 printf("Error in open enclave image file [%m]\n");
348
349 return enclave_image_fd;
350 }
351
352 enclave_image = mmap(NULL, enclave_image_size, PROT_READ,
353 MAP_PRIVATE, enclave_image_fd, 0);
354 if (enclave_image == MAP_FAILED) {
355 printf("Error in mmap enclave image [%m]\n");
356
357 return -1;
358 }
359
360 temp_image_offset = image_load_info.memory_offset;
361
362 for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) {
363 size_t bytes_to_write = 0;
364 size_t memory_offset = 0;
365 size_t memory_size = ne_user_mem_regions[i].memory_size;
366 size_t remaining_bytes = 0;
367 void *userspace_addr = ne_user_mem_regions[i].userspace_addr;
368
369 if (temp_image_offset >= memory_size) {
370 temp_image_offset -= memory_size;
371
372 continue;
373 } else if (temp_image_offset != 0) {
374 memory_offset = temp_image_offset;
375 memory_size -= temp_image_offset;
376 temp_image_offset = 0;
377 }
378
379 remaining_bytes = enclave_image_size - image_written_bytes;
380 bytes_to_write = memory_size < remaining_bytes ?
381 memory_size : remaining_bytes;
382
383 memcpy(userspace_addr + memory_offset,
384 enclave_image + image_written_bytes, bytes_to_write);
385
386 image_written_bytes += bytes_to_write;
387
388 if (image_written_bytes == enclave_image_size)
389 break;
390 }
391
392 munmap(enclave_image, enclave_image_size);
393
394 close(enclave_image_fd);
395
396 return 0;
397}
398
399/**
400 * ne_set_user_mem_region() - Set a user space memory region for the given enclave.
401 * @enclave_fd : The file descriptor associated with the enclave.
402 * @ne_user_mem_region : User space memory region to be set for the enclave.
403 *
404 * Context: Process context.
405 * Return:
406 * * 0 on success.
407 * * Negative return value on failure.
408 */
409static int ne_set_user_mem_region(int enclave_fd, struct ne_user_mem_region ne_user_mem_region)
410{
411 struct ne_user_memory_region mem_region = {
412 .flags = NE_DEFAULT_MEMORY_REGION,
413 .memory_size = ne_user_mem_region.memory_size,
414 .userspace_addr = (__u64)ne_user_mem_region.userspace_addr,
415 };
416 int rc = -EINVAL;
417
418 rc = ioctl(enclave_fd, NE_SET_USER_MEMORY_REGION, &mem_region);
419 if (rc < 0) {
420 switch (errno) {
421 case NE_ERR_NOT_IN_INIT_STATE: {
422 printf("Error in set user memory region, enclave not in init state\n");
423
424 break;
425 }
426
427 case NE_ERR_INVALID_MEM_REGION_SIZE: {
428 printf("Error in set user memory region, mem size not multiple of 2 MiB\n");
429
430 break;
431 }
432
433 case NE_ERR_INVALID_MEM_REGION_ADDR: {
434 printf("Error in set user memory region, invalid user space address\n");
435
436 break;
437 }
438
439 case NE_ERR_UNALIGNED_MEM_REGION_ADDR: {
440 printf("Error in set user memory region, unaligned user space address\n");
441
442 break;
443 }
444
445 case NE_ERR_MEM_REGION_ALREADY_USED: {
446 printf("Error in set user memory region, memory region already used\n");
447
448 break;
449 }
450
451 case NE_ERR_MEM_NOT_HUGE_PAGE: {
452 printf("Error in set user memory region, not backed by huge pages\n");
453
454 break;
455 }
456
457 case NE_ERR_MEM_DIFFERENT_NUMA_NODE: {
458 printf("Error in set user memory region, different NUMA node than CPUs\n");
459
460 break;
461 }
462
463 case NE_ERR_MEM_MAX_REGIONS: {
464 printf("Error in set user memory region, max memory regions reached\n");
465
466 break;
467 }
468
469 case NE_ERR_INVALID_PAGE_SIZE: {
470 printf("Error in set user memory region, has page not multiple of 2 MiB\n");
471
472 break;
473 }
474
475 case NE_ERR_INVALID_FLAG_VALUE: {
476 printf("Error in set user memory region, provided invalid flag\n");
477
478 break;
479 }
480
481 default:
482 printf("Error in set user memory region [%m]\n");
483 }
484
485 return rc;
486 }
487
488 return 0;
489}
490
491/**
492 * ne_free_mem_regions() - Unmap all the user space memory regions that were set
493 * aside for the enclave.
494 * @ne_user_mem_regions: The user space memory regions associated with an enclave.
495 *
496 * Context: Process context.
497 */
498static void ne_free_mem_regions(struct ne_user_mem_region ne_user_mem_regions[])
499{
500 unsigned int i = 0;
501
502 for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++)
503 munmap(ne_user_mem_regions[i].userspace_addr,
504 ne_user_mem_regions[i].memory_size);
505}
506
507/**
508 * ne_add_vcpu() - Add a vCPU to the given enclave.
509 * @enclave_fd : The file descriptor associated with the enclave.
510 * @vcpu_id: vCPU id to be set for the enclave, either provided or
511 * auto-generated (if provided vCPU id is 0).
512 *
513 * Context: Process context.
514 * Return:
515 * * 0 on success.
516 * * Negative return value on failure.
517 */
518static int ne_add_vcpu(int enclave_fd, unsigned int *vcpu_id)
519{
520 int rc = -EINVAL;
521
522 rc = ioctl(enclave_fd, NE_ADD_VCPU, vcpu_id);
523 if (rc < 0) {
524 switch (errno) {
525 case NE_ERR_NO_CPUS_AVAIL_IN_POOL: {
526 printf("Error in add vcpu, no CPUs available in the NE CPU pool\n");
527
528 break;
529 }
530
531 case NE_ERR_VCPU_ALREADY_USED: {
532 printf("Error in add vcpu, the provided vCPU is already used\n");
533
534 break;
535 }
536
537 case NE_ERR_VCPU_NOT_IN_CPU_POOL: {
538 printf("Error in add vcpu, the provided vCPU is not in the NE CPU pool\n");
539
540 break;
541 }
542
543 case NE_ERR_VCPU_INVALID_CPU_CORE: {
544 printf("Error in add vcpu, the core id of the provided vCPU is invalid\n");
545
546 break;
547 }
548
549 case NE_ERR_NOT_IN_INIT_STATE: {
550 printf("Error in add vcpu, enclave not in init state\n");
551
552 break;
553 }
554
555 case NE_ERR_INVALID_VCPU: {
556 printf("Error in add vcpu, the provided vCPU is out of avail CPUs range\n");
557
558 break;
559 }
560
561 default:
562 printf("Error in add vcpu [%m]\n");
563
564 }
565 return rc;
566 }
567
568 return 0;
569}
570
571/**
572 * ne_start_enclave() - Start the given enclave.
573 * @enclave_fd : The file descriptor associated with the enclave.
574 * @enclave_start_info : Enclave metadata used for starting, e.g. the vsock CID.
575 *
576 * Context: Process context.
577 * Return:
578 * * 0 on success.
579 * * Negative return value on failure.
580 */
581static int ne_start_enclave(int enclave_fd, struct ne_enclave_start_info *enclave_start_info)
582{
583 int rc = -EINVAL;
584
585 rc = ioctl(enclave_fd, NE_START_ENCLAVE, enclave_start_info);
586 if (rc < 0) {
587 switch (errno) {
588 case NE_ERR_NOT_IN_INIT_STATE: {
589 printf("Error in start enclave, enclave not in init state\n");
590
591 break;
592 }
593
594 case NE_ERR_NO_MEM_REGIONS_ADDED: {
595 printf("Error in start enclave, no memory regions have been added\n");
596
597 break;
598 }
599
600 case NE_ERR_NO_VCPUS_ADDED: {
601 printf("Error in start enclave, no vCPUs have been added\n");
602
603 break;
604 }
605
606 case NE_ERR_FULL_CORES_NOT_USED: {
607 printf("Error in start enclave, enclave has no full cores set\n");
608
609 break;
610 }
611
612 case NE_ERR_ENCLAVE_MEM_MIN_SIZE: {
613 printf("Error in start enclave, enclave memory is less than min size\n");
614
615 break;
616 }
617
618 case NE_ERR_INVALID_FLAG_VALUE: {
619 printf("Error in start enclave, provided invalid flag\n");
620
621 break;
622 }
623
624 case NE_ERR_INVALID_ENCLAVE_CID: {
625 printf("Error in start enclave, provided invalid enclave CID\n");
626
627 break;
628 }
629
630 default:
631 printf("Error in start enclave [%m]\n");
632 }
633
634 return rc;
635 }
636
637 return 0;
638}
639
640/**
641 * ne_start_enclave_check_booted() - Start the enclave and wait for a heartbeat
642 * from it, on a newly created vsock channel,
643 * to check it has booted.
644 * @enclave_fd : The file descriptor associated with the enclave.
645 *
646 * Context: Process context.
647 * Return:
648 * * 0 on success.
649 * * Negative return value on failure.
650 */
651static int ne_start_enclave_check_booted(int enclave_fd)
652{
653 struct sockaddr_vm client_vsock_addr = {};
654 int client_vsock_fd = -1;
655 socklen_t client_vsock_len = sizeof(client_vsock_addr);
656 struct ne_enclave_start_info enclave_start_info = {};
657 struct pollfd fds[1] = {};
658 int rc = -EINVAL;
659 unsigned char recv_buf = 0;
660 struct sockaddr_vm server_vsock_addr = {
661 .svm_family = AF_VSOCK,
662 .svm_cid = NE_IMAGE_LOAD_HEARTBEAT_CID,
663 .svm_port = NE_IMAGE_LOAD_HEARTBEAT_PORT,
664 };
665 int server_vsock_fd = -1;
666
667 server_vsock_fd = socket(AF_VSOCK, SOCK_STREAM, 0);
668 if (server_vsock_fd < 0) {
669 rc = server_vsock_fd;
670
671 printf("Error in socket [%m]\n");
672
673 return rc;
674 }
675
676 rc = bind(server_vsock_fd, (struct sockaddr *)&server_vsock_addr,
677 sizeof(server_vsock_addr));
678 if (rc < 0) {
679 printf("Error in bind [%m]\n");
680
681 goto out;
682 }
683
684 rc = listen(server_vsock_fd, 1);
685 if (rc < 0) {
686 printf("Error in listen [%m]\n");
687
688 goto out;
689 }
690
691 rc = ne_start_enclave(enclave_fd, &enclave_start_info);
692 if (rc < 0)
693 goto out;
694
695 printf("Enclave started, CID %llu\n", enclave_start_info.enclave_cid);
696
697 fds[0].fd = server_vsock_fd;
698 fds[0].events = POLLIN;
699
700 rc = poll(fds, 1, NE_POLL_WAIT_TIME_MS);
701 if (rc < 0) {
702 printf("Error in poll [%m]\n");
703
704 goto out;
705 }
706
707 if (!rc) {
708 printf("Poll timeout, %d seconds elapsed\n", NE_POLL_WAIT_TIME);
709
710 rc = -ETIMEDOUT;
711
712 goto out;
713 }
714
715 if ((fds[0].revents & POLLIN) == 0) {
716 printf("Poll received value %d\n", fds[0].revents);
717
718 rc = -EINVAL;
719
720 goto out;
721 }
722
723 rc = accept(server_vsock_fd, (struct sockaddr *)&client_vsock_addr,
724 &client_vsock_len);
725 if (rc < 0) {
726 printf("Error in accept [%m]\n");
727
728 goto out;
729 }
730
731 client_vsock_fd = rc;
732
733 /*
734 * Read the heartbeat value that the init process in the enclave sends
735 * after vsock connect.
736 */
737 rc = read(client_vsock_fd, &recv_buf, sizeof(recv_buf));
738 if (rc < 0) {
739 printf("Error in read [%m]\n");
740
741 goto out;
742 }
743
744 if (rc != sizeof(recv_buf) || recv_buf != NE_IMAGE_LOAD_HEARTBEAT_VALUE) {
745 printf("Read %d instead of %d\n", recv_buf,
746 NE_IMAGE_LOAD_HEARTBEAT_VALUE);
747
748 goto out;
749 }
750
751 /* Write the heartbeat value back. */
752 rc = write(client_vsock_fd, &recv_buf, sizeof(recv_buf));
753 if (rc < 0) {
754 printf("Error in write [%m]\n");
755
756 goto out;
757 }
758
759 rc = 0;
760
761out:
762 close(server_vsock_fd);
763
764 return rc;
765}
766
767int main(int argc, char *argv[])
768{
769 int enclave_fd = -1;
770 unsigned int i = 0;
771 int ne_dev_fd = -1;
772 struct ne_user_mem_region ne_user_mem_regions[NE_DEFAULT_NR_MEM_REGIONS] = {};
773 unsigned int ne_vcpus[NE_DEFAULT_NR_VCPUS] = {};
774 int rc = -EINVAL;
775 pthread_t thread_id = 0;
776 unsigned long slot_uid = 0;
777
778 if (argc != 2) {
779 printf("Usage: %s <path_to_enclave_image>\n", argv[0]);
780
781 exit(EXIT_FAILURE);
782 }
783
784 if (strlen(argv[1]) >= PATH_MAX) {
785		printf("The size of the path to the enclave image exceeds the max path length\n");
786
787 exit(EXIT_FAILURE);
788 }
789
790 ne_dev_fd = open(NE_DEV_NAME, O_RDWR | O_CLOEXEC);
791 if (ne_dev_fd < 0) {
792 printf("Error in open NE device [%m]\n");
793
794 exit(EXIT_FAILURE);
795 }
796
797 printf("Creating enclave slot ...\n");
798
799 rc = ne_create_vm(ne_dev_fd, &slot_uid, &enclave_fd);
800
801 close(ne_dev_fd);
802
803 if (rc < 0)
804 exit(EXIT_FAILURE);
805
806 printf("Enclave fd %d\n", enclave_fd);
807
808 rc = pthread_create(&thread_id, NULL, ne_poll_enclave_fd, (void *)&enclave_fd);
809 if (rc < 0) {
810 printf("Error in thread create [%m]\n");
811
812 close(enclave_fd);
813
814 exit(EXIT_FAILURE);
815 }
816
817 for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) {
818 ne_user_mem_regions[i].memory_size = NE_MIN_MEM_REGION_SIZE;
819
820 rc = ne_alloc_user_mem_region(&ne_user_mem_regions[i]);
821 if (rc < 0) {
822 printf("Error in alloc userspace memory region, iter %d\n", i);
823
824 goto release_enclave_fd;
825 }
826 }
827
828 rc = ne_load_enclave_image(enclave_fd, ne_user_mem_regions, argv[1]);
829 if (rc < 0)
830 goto release_enclave_fd;
831
832 for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) {
833 rc = ne_set_user_mem_region(enclave_fd, ne_user_mem_regions[i]);
834 if (rc < 0) {
835 printf("Error in set memory region, iter %d\n", i);
836
837 goto release_enclave_fd;
838 }
839 }
840
841 printf("Enclave memory regions were added\n");
842
843 for (i = 0; i < NE_DEFAULT_NR_VCPUS; i++) {
844 /*
845 * The vCPU is chosen from the enclave vCPU pool, if the value
846 * of the vcpu_id is 0.
847 */
848 ne_vcpus[i] = 0;
849 rc = ne_add_vcpu(enclave_fd, &ne_vcpus[i]);
850 if (rc < 0) {
851 printf("Error in add vcpu, iter %d\n", i);
852
853 goto release_enclave_fd;
854 }
855
856 printf("Added vCPU %d to the enclave\n", ne_vcpus[i]);
857 }
858
859 printf("Enclave vCPUs were added\n");
860
861 rc = ne_start_enclave_check_booted(enclave_fd);
862 if (rc < 0) {
863 printf("Error in the enclave start / image loading heartbeat logic [rc=%d]\n", rc);
864
865 goto release_enclave_fd;
866 }
867
868 printf("Entering sleep for %d seconds ...\n", NE_SLEEP_TIME);
869
870 sleep(NE_SLEEP_TIME);
871
872 close(enclave_fd);
873
874 ne_free_mem_regions(ne_user_mem_regions);
875
876 exit(EXIT_SUCCESS);
877
878release_enclave_fd:
879 close(enclave_fd);
880 ne_free_mem_regions(ne_user_mem_regions);
881
882 exit(EXIT_FAILURE);
883}
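
/*
 * For reference, a minimal sketch (guarded out; an illustration, not
 * part of the sample) of the enclave-side counterpart that the
 * heartbeat logic above expects: the init process inside the enclave
 * connects over vsock to the parent VM, sends the heartbeat byte and
 * reads it back, using the same constants as above.
 */
#if 0
static int ne_enclave_init_heartbeat(void)
{
	unsigned char buf = NE_IMAGE_LOAD_HEARTBEAT_VALUE;
	struct sockaddr_vm vsock_addr = {
		.svm_family = AF_VSOCK,
		.svm_cid = NE_IMAGE_LOAD_HEARTBEAT_CID,
		.svm_port = NE_IMAGE_LOAD_HEARTBEAT_PORT,
	};
	int fd = socket(AF_VSOCK, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;

	/* Connect to the parent VM, send the heartbeat, read it back. */
	if (connect(fd, (struct sockaddr *)&vsock_addr, sizeof(vsock_addr)) < 0 ||
	    write(fd, &buf, sizeof(buf)) != sizeof(buf) ||
	    read(fd, &buf, sizeof(buf)) != sizeof(buf)) {
		close(fd);
		return -1;
	}

	close(fd);
	return 0;
}
#endif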
diff --git a/samples/pidfd/.gitignore b/samples/pidfd/.gitignore
new file mode 100644
index 000000000..eea857fca
--- /dev/null
+++ b/samples/pidfd/.gitignore
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0-only
2pidfd-metadata
diff --git a/samples/pidfd/Makefile b/samples/pidfd/Makefile
new file mode 100644
index 000000000..9754e2d81
--- /dev/null
+++ b/samples/pidfd/Makefile
@@ -0,0 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0
2userprogs-always-y += pidfd-metadata
3
4userccflags += -I usr/include
diff --git a/samples/pidfd/pidfd-metadata.c b/samples/pidfd/pidfd-metadata.c
new file mode 100644
index 000000000..c459155da
--- /dev/null
+++ b/samples/pidfd/pidfd-metadata.c
@@ -0,0 +1,120 @@
1// SPDX-License-Identifier: GPL-2.0
2
3#define _GNU_SOURCE
4#include <err.h>
5#include <errno.h>
6#include <fcntl.h>
7#include <inttypes.h>
8#include <limits.h>
9#include <sched.h>
10#include <signal.h>
11#include <stdio.h>
12#include <stdlib.h>
13#include <string.h>
14#include <sys/stat.h>
15#include <sys/syscall.h>
16#include <sys/types.h>
17#include <sys/wait.h>
18#include <unistd.h>
19
20#ifndef CLONE_PIDFD
21#define CLONE_PIDFD 0x00001000
22#endif
23
24#ifndef __NR_pidfd_send_signal
25#define __NR_pidfd_send_signal -1
26#endif
27
28static int do_child(void *args)
29{
30 printf("%d\n", getpid());
31 _exit(EXIT_SUCCESS);
32}
33
34static pid_t pidfd_clone(int flags, int *pidfd)
35{
36 size_t stack_size = 1024;
37 char *stack[1024] = { 0 };
38
39#ifdef __ia64__
40 return __clone2(do_child, stack, stack_size, flags | SIGCHLD, NULL, pidfd);
41#else
42 return clone(do_child, stack + stack_size, flags | SIGCHLD, NULL, pidfd);
43#endif
44}
45
46static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
47 unsigned int flags)
48{
49 return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
50}
51
52static int pidfd_metadata_fd(pid_t pid, int pidfd)
53{
54 int procfd, ret;
55 char path[100];
56
57 snprintf(path, sizeof(path), "/proc/%d", pid);
58 procfd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
59 if (procfd < 0) {
60 warn("Failed to open %s\n", path);
61 return -1;
62 }
63
64 /*
65 * Verify that the pid has not been recycled and our /proc/<pid> handle
66 * is still valid.
67 */
68 ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0);
69 if (ret < 0) {
70 switch (errno) {
71 case EPERM:
72 /* Process exists, just not allowed to signal it. */
73 break;
74 default:
75 warn("Failed to signal process\n");
76 close(procfd);
77 procfd = -1;
78 }
79 }
80
81 return procfd;
82}
83
84int main(int argc, char *argv[])
85{
86 int pidfd = -1, ret = EXIT_FAILURE;
87 char buf[4096] = { 0 };
88 pid_t pid;
89 int procfd, statusfd;
90 ssize_t bytes;
91
92 pid = pidfd_clone(CLONE_PIDFD, &pidfd);
93 if (pid < 0)
94 err(ret, "CLONE_PIDFD");
95 if (pidfd == -1) {
96 warnx("CLONE_PIDFD is not supported by the kernel");
97 goto out;
98 }
99
100 procfd = pidfd_metadata_fd(pid, pidfd);
101 close(pidfd);
102 if (procfd < 0)
103 goto out;
104
105 statusfd = openat(procfd, "status", O_RDONLY | O_CLOEXEC);
106 close(procfd);
107 if (statusfd < 0)
108 goto out;
109
110 bytes = read(statusfd, buf, sizeof(buf));
111 if (bytes > 0)
112 bytes = write(STDOUT_FILENO, buf, bytes);
113 close(statusfd);
114 ret = EXIT_SUCCESS;
115
116out:
117 (void)wait(NULL);
118
119 exit(ret);
120}
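
/*
 * A related sketch (guarded out; an illustration under the assumption
 * that the kernel provides pidfd_open(2), v5.3+): a pidfd for an
 * already-running process can also be obtained without CLONE_PIDFD.
 * The fallback syscall number below is the arch-generic one.
 */
#if 0
#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434
#endif

static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
{
	return syscall(__NR_pidfd_open, pid, flags);
}
#endif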
diff --git a/samples/pktgen/README.rst b/samples/pktgen/README.rst
new file mode 100644
index 000000000..f9c53ca5c
--- /dev/null
+++ b/samples/pktgen/README.rst
@@ -0,0 +1,46 @@
1Sample and benchmark scripts for pktgen (packet generator)
2==========================================================
3This directory contains some pktgen sample and benchmark scripts that
4can easily be copied and adjusted for your own use case.
5
6General doc is located in kernel: Documentation/networking/pktgen.rst
7
8Helper include files
9====================
10This directory contains two helper shell files that can be "included"
11by shell sourcing, namely "functions.sh" and "parameters.sh".
12
13Common parameters
14-----------------
15The parameters.sh file supports easy and consistent parameter parsing
16across the sample scripts. A usage example is printed on errors::
17
18 Usage: ./pktgen_sample01_simple.sh [-vx] -i ethX
19 -i : ($DEV) output interface/device (required)
20 -s : ($PKT_SIZE) packet size
21 -d : ($DEST_IP) destination IP. CIDR (e.g. 198.18.0.0/15) is also allowed
22 -m : ($DST_MAC) destination MAC-addr
23 -p : ($DST_PORT) destination PORT range (e.g. 433-444) is also allowed
24 -t : ($THREADS) threads to start
25 -f : ($F_THREAD) index of first thread (zero indexed CPU number)
27 -c : ($CLONE_SKB) SKB clones sent before allocating a new SKB
27 -n : ($COUNT) num messages to send per thread, 0 means indefinitely
28 -b : ($BURST) HW level bursting of SKBs
29 -v : ($VERBOSE) verbose
30 -x : ($DEBUG) debug
31
32The global variable being set is also listed. E.g. the required
33interface/device parameter "-i" sets variable $DEV.
34
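For example, a typical invocation could look like the following (the
interface name and MAC address are assumptions)::

 # ./pktgen_sample01_simple.sh -i eth0 -m 90:e2:ba:ff:ff:ff -n 100000
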
35Common functions
36----------------
37The functions.sh file provides three different shell functions for
38configuring the different components of pktgen: pg_ctrl(), pg_thread()
39and pg_set().
40
41These functions correspond to pktgen's different components.
42 * pg_ctrl() control "pgctrl" (/proc/net/pktgen/pgctrl)
43 * pg_thread() control the kernel threads and binding to devices
44 * pg_set() control setup of individual devices
45
46See sample scripts for usage examples.
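
As a rough sketch, a minimal flow with these functions could be (the
device name eth0 is an assumption)::

 pg_thread 0 "rem_device_all"
 pg_thread 0 "add_device" eth0
 pg_set eth0 "pkt_size 64"
 pg_ctrl "start"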
diff --git a/samples/pktgen/functions.sh b/samples/pktgen/functions.sh
new file mode 100644
index 000000000..dae06d5b3
--- /dev/null
+++ b/samples/pktgen/functions.sh
@@ -0,0 +1,334 @@
1#
2# Common functions used by pktgen scripts
3# - Depending on bash 3 (or higher) syntax
4#
5# Author: Jesper Dangaard Brouer
6# License: GPL
7
8set -o errexit
9
10## -- General shell logging cmds --
11function err() {
12 local exitcode=$1
13 shift
14 echo "ERROR: $@" >&2
15 exit $exitcode
16}
17
18function warn() {
19 echo "WARN : $@" >&2
20}
21
22function info() {
23 if [[ -n "$VERBOSE" ]]; then
24 echo "INFO : $@" >&2
25 fi
26}
27
28## -- Pktgen proc config commands -- ##
29export PROC_DIR=/proc/net/pktgen
30#
31# Three different shell functions for configuring the different
32# components of pktgen:
33# pg_ctrl(), pg_thread() and pg_set().
34#
35# These functions correspond to pktgen's different components.
36# * pg_ctrl() control "pgctrl" (/proc/net/pktgen/pgctrl)
37# * pg_thread() control the kernel threads and binding to devices
38# * pg_set() control setup of individual devices
39function pg_ctrl() {
40 local proc_file="pgctrl"
41 proc_cmd ${proc_file} "$@"
42}
43
44function pg_thread() {
45 local thread=$1
46 local proc_file="kpktgend_${thread}"
47 shift
48 proc_cmd ${proc_file} "$@"
49}
50
51function pg_set() {
52 local dev=$1
53 local proc_file="$dev"
54 shift
55 proc_cmd ${proc_file} "$@"
56}
57
58# More generic replacement for pgset() that does not depend on a
59# global variable for the proc file.
60function proc_cmd() {
61 local result
62 local proc_file=$1
63 local status=0
64 # after shift, the remaining args are contained in $@
65 shift
66 local proc_ctrl=${PROC_DIR}/$proc_file
67 if [[ ! -e "$proc_ctrl" ]]; then
68		err 3 "proc file:$proc_ctrl does not exist (dev added to thread?)"
69 else
70 if [[ ! -w "$proc_ctrl" ]]; then
71 err 4 "proc file:$proc_ctrl not writable, not root?!"
72 fi
73 fi
74
75 if [[ "$DEBUG" == "yes" ]]; then
76 echo "cmd: $@ > $proc_ctrl"
77 fi
78 # Quoting of "$@" is important for space expansion
79 echo "$@" > "$proc_ctrl" || status=$?
80
81 if [[ "$proc_file" != "pgctrl" ]]; then
82 result=$(grep "Result: OK:" $proc_ctrl) || true
83 if [[ "$result" == "" ]]; then
84 grep "Result:" $proc_ctrl >&2
85 fi
86 fi
87 if (( $status != 0 )); then
88 err 5 "Write error($status) occurred cmd: \"$@ > $proc_ctrl\""
89 fi
90}
91
92# Old obsolete "pgset" function, with slightly improved err handling
93function pgset() {
94 local result
95
96 if [[ "$DEBUG" == "yes" ]]; then
97 echo "cmd: $1 > $PGDEV"
98 fi
99 echo $1 > $PGDEV
100 local status=$?
101
102 result=`cat $PGDEV | fgrep "Result: OK:"`
103 if [[ "$result" == "" ]]; then
104 cat $PGDEV | fgrep Result:
105 fi
106 if (( $status != 0 )); then
107 err 5 "Write error($status) occurred cmd: \"$1 > $PGDEV\""
108 fi
109}
110
111[[ $EUID -eq 0 ]] && trap 'pg_ctrl "reset"' EXIT
112
113## -- General shell tricks --
114
115function root_check_run_with_sudo() {
116	# Trick so the program can be run as a normal user; it will just use "sudo"
117	# call as: root_check_run_with_sudo "$@"
118 if [ "$EUID" -ne 0 ]; then
119 if [ -x $0 ]; then # Directly executable use sudo
120 info "Not root, running with sudo"
121 sudo "$0" "$@"
122 exit $?
123 fi
124 err 4 "cannot perform sudo run of $0"
125 fi
126}
127
128# Extract the input device's NUMA node info
129function get_iface_node()
130{
131 local node=$(</sys/class/net/$1/device/numa_node)
132 if [[ $node == -1 ]]; then
133 echo 0
134 else
135 echo $node
136 fi
137}
138
139# Given a dev/iface, get its queues' irq numbers
140function get_iface_irqs()
141{
142 local IFACE=$1
143 local queues="${IFACE}-.*TxRx"
144
145 irqs=$(grep "$queues" /proc/interrupts | cut -f1 -d:)
146 [ -z "$irqs" ] && irqs=$(grep $IFACE /proc/interrupts | cut -f1 -d:)
147 [ -z "$irqs" ] && irqs=$(for i in `ls -Ux /sys/class/net/$IFACE/device/msi_irqs` ;\
148 do grep "$i:.*TxRx" /proc/interrupts | grep -v fdir | cut -f 1 -d : ;\
149 done)
150 [ -z "$irqs" ] && err 3 "Could not find interrupts for $IFACE"
151
152 echo $irqs
153}
154
155# Given a NUMA node, return cpu ids belonging to it.
156function get_node_cpus()
157{
158 local node=$1
159 local node_cpu_list
160 local node_cpu_range_list=`cut -f1- -d, --output-delimiter=" " \
161 /sys/devices/system/node/node$node/cpulist`
162
163 for cpu_range in $node_cpu_range_list
164 do
165 node_cpu_list="$node_cpu_list "`seq -s " " ${cpu_range//-/ }`
166 done
167
168 echo $node_cpu_list
169}
170
171# Check $1 is in between $2, $3 ($2 <= $1 <= $3)
172function in_between() { [[ ($1 -ge $2) && ($1 -le $3) ]] ; }
173
174# Extend shrunken IPv6 address.
175# fe80::42:bcff:fe84:e10a => fe80:0:0:0:42:bcff:fe84:e10a
176function extend_addr6()
177{
178 local addr=$1
179 local sep=: sep2=::
180 local sep_cnt=$(tr -cd $sep <<< $1 | wc -c)
181 local shrink
182
183 # separator count should be (2 <= $sep_cnt <= 7)
184 if ! (in_between $sep_cnt 2 7); then
185 err 5 "Invalid IP6 address: $1"
186 fi
187
188	# if the '::' shrink occurs multiple times, it's malformed.
189 shrink=( $(egrep -o "$sep{2,}" <<< $addr) )
190 if [[ ${#shrink[@]} -ne 0 ]]; then
191 if [[ ${#shrink[@]} -gt 1 || ( ${shrink[0]} != $sep2 ) ]]; then
192 err 5 "Invalid IP6 address: $1"
193 fi
194 fi
195
196 # add 0 at begin & end, and extend addr by adding :0
197 [[ ${addr:0:1} == $sep ]] && addr=0${addr}
198 [[ ${addr: -1} == $sep ]] && addr=${addr}0
199 echo "${addr/$sep2/$(printf ':0%.s' $(seq $[8-sep_cnt])):}"
200}
201
202# Given a single IP(v4/v6) address, check whether it is valid.
203function validate_addr()
204{
205 # check function is called with (funcname)6
206 [[ ${FUNCNAME[1]: -1} == 6 ]] && local IP6=6
207 local bitlen=$[ IP6 ? 128 : 32 ]
208 local len=$[ IP6 ? 8 : 4 ]
209 local max=$[ 2**(len*2)-1 ]
210 local net prefix
211 local addr sep
212
213 IFS='/' read net prefix <<< $1
214 [[ $IP6 ]] && net=$(extend_addr6 $net)
215
216 # if prefix exists, check (0 <= $prefix <= $bitlen)
217 if [[ -n $prefix ]]; then
218 if ! (in_between $prefix 0 $bitlen); then
219 err 5 "Invalid prefix: /$prefix"
220 fi
221 fi
222
223 # set separator for each IP(v4/v6)
224 [[ $IP6 ]] && sep=: || sep=.
225 IFS=$sep read -a addr <<< $net
226
227 # array length
228 if [[ ${#addr[@]} != $len ]]; then
229 err 5 "Invalid IP$IP6 address: $1"
230 fi
231
232 # check each digit (0 <= $digit <= $max)
233 for digit in "${addr[@]}"; do
234 [[ $IP6 ]] && digit=$[ 16#$digit ]
235 if ! (in_between $digit 0 $max); then
236 err 5 "Invalid IP$IP6 address: $1"
237 fi
238 done
239
240 return 0
241}
242
243function validate_addr6() { validate_addr $@ ; }
244
245# Given a single IP(v4/v6) or CIDR, return minimum and maximum IP addr.
246function parse_addr()
247{
248 # check function is called with (funcname)6
249 [[ ${FUNCNAME[1]: -1} == 6 ]] && local IP6=6
250 local net prefix
251 local min_ip max_ip
252
253 IFS='/' read net prefix <<< $1
254 [[ $IP6 ]] && net=$(extend_addr6 $net)
255
256 if [[ -z $prefix ]]; then
257 min_ip=$net
258 max_ip=$net
259 else
260 # defining array for converting Decimal 2 Binary
261 # 00000000 00000001 00000010 00000011 00000100 ...
262 local d2b='{0..1}{0..1}{0..1}{0..1}{0..1}{0..1}{0..1}{0..1}'
263 [[ $IP6 ]] && d2b+=$d2b
264 eval local D2B=($d2b)
265
266 local bitlen=$[ IP6 ? 128 : 32 ]
267 local remain=$[ bitlen-prefix ]
268 local octet=$[ IP6 ? 16 : 8 ]
269 local min_mask max_mask
270 local min max
271 local ip_bit
272 local ip sep
273
274 # set separator for each IP(v4/v6)
275 [[ $IP6 ]] && sep=: || sep=.
276 IFS=$sep read -ra ip <<< $net
277
278 min_mask="$(printf '1%.s' $(seq $prefix))$(printf '0%.s' $(seq $remain))"
279 max_mask="$(printf '0%.s' $(seq $prefix))$(printf '1%.s' $(seq $remain))"
280
281 # calculate min/max ip with &,| operator
282 for i in "${!ip[@]}"; do
283 digit=$[ IP6 ? 16#${ip[$i]} : ${ip[$i]} ]
284 ip_bit=${D2B[$digit]}
285
286 idx=$[ octet*i ]
287 min[$i]=$[ 2#$ip_bit & 2#${min_mask:$idx:$octet} ]
288 max[$i]=$[ 2#$ip_bit | 2#${max_mask:$idx:$octet} ]
289 [[ $IP6 ]] && { min[$i]=$(printf '%X' ${min[$i]});
290 max[$i]=$(printf '%X' ${max[$i]}); }
291 done
292
293 min_ip=$(IFS=$sep; echo "${min[*]}")
294 max_ip=$(IFS=$sep; echo "${max[*]}")
295 fi
296
297 echo $min_ip $max_ip
298}
299
300function parse_addr6() { parse_addr $@ ; }
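# Example (illustrative, using the CIDR from these samples):
#   parse_addr 198.18.0.0/15 -> "198.18.0.0 198.19.255.255"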
301
302# Given a single port or a range of ports, return the minimum and maximum port number.
303function parse_ports()
304{
305 local port_str=$1
306 local port_list
307 local min_port
308 local max_port
309
310 IFS="-" read -ra port_list <<< $port_str
311
312 min_port=${port_list[0]}
313 max_port=${port_list[1]:-$min_port}
314
315 echo $min_port $max_port
316}
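# Examples (illustrative): parse_ports 433-444 -> "433 444",
# parse_ports 9 -> "9 9"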
317
318# Given a minimum and maximum port, verify the port numbers.
319function validate_ports()
320{
321 local min_port=$1
322 local max_port=$2
323
324 # 1 <= port <= 65535
325 if (in_between $min_port 1 65535); then
326 if (in_between $max_port 1 65535); then
327 if [[ $min_port -le $max_port ]]; then
328 return 0
329 fi
330 fi
331 fi
332
333 err 5 "Invalid port(s): $min_port-$max_port"
334}
diff --git a/samples/pktgen/parameters.sh b/samples/pktgen/parameters.sh
new file mode 100644
index 000000000..ff0ed474f
--- /dev/null
+++ b/samples/pktgen/parameters.sh
@@ -0,0 +1,121 @@
1#
2# SPDX-License-Identifier: GPL-2.0
3# Common parameter parsing for pktgen scripts
4#
5
6function usage() {
7 echo ""
8 echo "Usage: $0 [-vx] -i ethX"
9 echo " -i : (\$DEV) output interface/device (required)"
10 echo " -s : (\$PKT_SIZE) packet size"
11 echo " -d : (\$DEST_IP) destination IP. CIDR (e.g. 198.18.0.0/15) is also allowed"
12 echo " -m : (\$DST_MAC) destination MAC-addr"
13 echo " -p : (\$DST_PORT) destination PORT range (e.g. 433-444) is also allowed"
14 echo " -t : (\$THREADS) threads to start"
15 echo " -f : (\$F_THREAD) index of first thread (zero indexed CPU number)"
16	echo " -c : (\$CLONE_SKB) SKB clones sent before allocating a new SKB"
17 echo " -n : (\$COUNT) num messages to send per thread, 0 means indefinitely"
18 echo " -b : (\$BURST) HW level bursting of SKBs"
19 echo " -v : (\$VERBOSE) verbose"
20 echo " -x : (\$DEBUG) debug"
21 echo " -6 : (\$IP6) IPv6"
22 echo ""
23}
24
25## --- Parse command line arguments / parameters ---
26## echo "Commandline options:"
27while getopts "s:i:d:m:p:f:t:c:n:b:vxh6" option; do
28 case $option in
29 i) # interface
30 export DEV=$OPTARG
31 info "Output device set to: DEV=$DEV"
32 ;;
33 s)
34 export PKT_SIZE=$OPTARG
35 info "Packet size set to: PKT_SIZE=$PKT_SIZE bytes"
36 ;;
37 d) # destination IP
38 export DEST_IP=$OPTARG
39 info "Destination IP set to: DEST_IP=$DEST_IP"
40 ;;
41 m) # MAC
42 export DST_MAC=$OPTARG
43 info "Destination MAC set to: DST_MAC=$DST_MAC"
44 ;;
45 p) # PORT
46 export DST_PORT=$OPTARG
47 info "Destination PORT set to: DST_PORT=$DST_PORT"
48 ;;
49 f)
50 export F_THREAD=$OPTARG
51 info "Index of first thread (zero indexed CPU number): $F_THREAD"
52 ;;
53 t)
54 export THREADS=$OPTARG
55 info "Number of threads to start: $THREADS"
56 ;;
57 c)
58 export CLONE_SKB=$OPTARG
59 info "CLONE_SKB=$CLONE_SKB"
60 ;;
61 n)
62 export COUNT=$OPTARG
63 info "COUNT=$COUNT"
64 ;;
65 b)
66 export BURST=$OPTARG
67 info "SKB bursting: BURST=$BURST"
68 ;;
69 v)
70 export VERBOSE=yes
71 info "Verbose mode: VERBOSE=$VERBOSE"
72 ;;
73 x)
74 export DEBUG=yes
75 info "Debug mode: DEBUG=$DEBUG"
76 ;;
77 6)
78 export IP6=6
79 info "IP6: IP6=$IP6"
80 ;;
81 h|?|*)
82 usage;
83 err 2 "[ERROR] Unknown parameters!!!"
84 esac
85done
86shift $(( $OPTIND - 1 ))
87
88if [ -z "$PKT_SIZE" ]; then
89 # NIC adds 4 bytes CRC
90 export PKT_SIZE=60
91	info "Default packet size set to: $PKT_SIZE bytes"
92fi
93
94if [ -z "$F_THREAD" ]; then
95	# First thread (F_THREAD) references the zero-indexed CPU number
96 export F_THREAD=0
97fi
98
99if [ -z "$THREADS" ]; then
100 export THREADS=1
101fi
102
103export L_THREAD=$(( THREADS + F_THREAD - 1 ))
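# Example: "-f 2 -t 4" gives F_THREAD=2 and L_THREAD=5, i.e. threads 2..5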
104
105if [ -z "$DEV" ]; then
106 usage
107 err 2 "Please specify output device"
108fi
109
110if [ -z "$DST_MAC" ]; then
111 warn "Missing destination MAC address"
112fi
113
114if [ -z "$DEST_IP" ]; then
115 warn "Missing destination IP address"
116fi
117
118if [ ! -d /proc/net/pktgen ]; then
119 info "Loading kernel module: pktgen"
120 modprobe pktgen
121fi
diff --git a/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh
new file mode 100755
index 000000000..1b6204125
--- /dev/null
+++ b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh
@@ -0,0 +1,105 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3#
4# Benchmark script:
5# - developed for benchmarking ingress qdisc path
6#
7# Script for injecting packets into RX path of the stack with pktgen
8# "xmit_mode netif_receive". With an invalid dst_mac this will only
9# measure the ingress code path as packets get dropped in ip_rcv().
10#
11# This script doesn't really need any hardware. It benchmarks the
12# software RX path just after the NIC driver level. Bursting also
13# "removes" the SKB alloc/free overhead.
14#
15# Setup scenarios for measuring ingress qdisc (with invalid dst_mac):
16# ------------------------------------------------------------------
17# (1) no ingress (uses static_key_false(&ingress_needed))
18#
19# (2) ingress on another dev (changes ingress_needed and calls
20#     handle_ing() but exits early)
21#
22# config: tc qdisc add dev $SOMEDEV handle ffff: ingress
23#
24# (3) ingress on this dev, handle_ing() -> tc_classify()
25#
26# config: tc qdisc add dev $DEV handle ffff: ingress
27#
28# (4) ingress on this dev + drop at u32 classifier/action.
29#
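#     (a possible config, shown as an illustration:
#      tc filter add dev $DEV parent ffff: prio 1 u32 \
#         match u32 0 0 flowid 1:1 action drop)
#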
30basedir=`dirname $0`
31source ${basedir}/functions.sh
32root_check_run_with_sudo "$@"
33
34# Parameter parsing via include
35source ${basedir}/parameters.sh
36# Using invalid DST_MAC will cause the packets to get dropped in
37# ip_rcv() which is part of the test
38if [ -z "$DEST_IP" ]; then
39 [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
40fi
41[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
42[ -z "$BURST" ] && BURST=1024
43[ -z "$COUNT" ] && COUNT="10000000" # Zero means indefinitely
44if [ -n "$DEST_IP" ]; then
45 validate_addr${IP6} $DEST_IP
46 read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
47fi
48if [ -n "$DST_PORT" ]; then
49 read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
50 validate_ports $UDP_DST_MIN $UDP_DST_MAX
51fi
52
53# Base Config
54DELAY="0" # Zero means max speed
55
56# General cleanup everything since last run
57pg_ctrl "reset"
58
59# Threads are specified with parameter -t value in $THREADS
60for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
61	# The device name is extended with @name, using the thread number
62	# to make them unique, but any name will do.
63	dev=${DEV}@${thread}
64
65	# Remove all other devices and add $dev to the thread
66 pg_thread $thread "rem_device_all"
67 pg_thread $thread "add_device" $dev
68
69 # Base config of dev
70 pg_set $dev "flag QUEUE_MAP_CPU"
71 pg_set $dev "count $COUNT"
72 pg_set $dev "pkt_size $PKT_SIZE"
73 pg_set $dev "delay $DELAY"
74 pg_set $dev "flag NO_TIMESTAMP"
75
76 # Destination
77 pg_set $dev "dst_mac $DST_MAC"
78 pg_set $dev "dst${IP6}_min $DST_MIN"
79 pg_set $dev "dst${IP6}_max $DST_MAX"
80
81 if [ -n "$DST_PORT" ]; then
82 # Single destination port or random port range
83 pg_set $dev "flag UDPDST_RND"
84 pg_set $dev "udp_dst_min $UDP_DST_MIN"
85 pg_set $dev "udp_dst_max $UDP_DST_MAX"
86 fi
87
88 # Inject packet into RX path of stack
89 pg_set $dev "xmit_mode netif_receive"
90
91 # Burst allow us to avoid measuring SKB alloc/free overhead
92 pg_set $dev "burst $BURST"
93done
94
95# start_run
96echo "Running... ctrl^C to stop" >&2
97pg_ctrl "start"
98echo "Done" >&2
99
100# Print results
101for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
102 dev=${DEV}@${thread}
103 echo "Device: $dev"
104 cat /proc/net/pktgen/$dev | grep -A2 "Result:"
105done
diff --git a/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh
new file mode 100755
index 000000000..e607cb369
--- /dev/null
+++ b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh
@@ -0,0 +1,85 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3#
4# Benchmark script:
5# - developed for benchmarking egress qdisc path, derived (more
6# like cut'n'pasted) from ingress benchmark script.
7#
8# Script for injecting packets into egress qdisc path of the stack
9# with pktgen "xmit_mode queue_xmit".
10#
11basedir=`dirname $0`
12source ${basedir}/functions.sh
13root_check_run_with_sudo "$@"
14
15# Parameter parsing via include
16source ${basedir}/parameters.sh
17if [ -z "$DEST_IP" ]; then
18 [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
19fi
20[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
21
22# A burst greater than 1 is invalid for queue_xmit mode
23if [[ -n "$BURST" ]]; then
24 err 1 "Bursting not supported for this mode"
25fi
26[ -z "$COUNT" ] && COUNT="10000000" # Zero means indefinitely
27if [ -n "$DEST_IP" ]; then
28 validate_addr${IP6} $DEST_IP
29 read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
30fi
31if [ -n "$DST_PORT" ]; then
32 read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
33 validate_ports $UDP_DST_MIN $UDP_DST_MAX
34fi
35
36# Base Config
37DELAY="0" # Zero means max speed
38
39# General cleanup everything since last run
40pg_ctrl "reset"
41
42# Threads are specified with parameter -t value in $THREADS
43for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
44	# The device name is extended with @name, using the thread number
45	# to make them unique, but any name will do.
46	dev=${DEV}@${thread}
47
48	# Remove all other devices and add $dev to the thread
49 pg_thread $thread "rem_device_all"
50 pg_thread $thread "add_device" $dev
51
52 # Base config of dev
53 pg_set $dev "flag QUEUE_MAP_CPU"
54 pg_set $dev "count $COUNT"
55 pg_set $dev "pkt_size $PKT_SIZE"
56 pg_set $dev "delay $DELAY"
57 pg_set $dev "flag NO_TIMESTAMP"
58
59 # Destination
60 pg_set $dev "dst_mac $DST_MAC"
61 pg_set $dev "dst${IP6}_min $DST_MIN"
62 pg_set $dev "dst${IP6}_max $DST_MAX"
63
64 if [ -n "$DST_PORT" ]; then
65 # Single destination port or random port range
66 pg_set $dev "flag UDPDST_RND"
67 pg_set $dev "udp_dst_min $UDP_DST_MIN"
68 pg_set $dev "udp_dst_max $UDP_DST_MAX"
69 fi
70
71 # Inject packet into TX qdisc egress path of stack
72 pg_set $dev "xmit_mode queue_xmit"
73done
74
75# start_run
76echo "Running... ctrl^C to stop" >&2
77pg_ctrl "start"
78echo "Done" >&2
79
80# Print results
81for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
82 dev=${DEV}@${thread}
83 echo "Device: $dev"
84 cat /proc/net/pktgen/$dev | grep -A2 "Result:"
85done
diff --git a/samples/pktgen/pktgen_sample01_simple.sh b/samples/pktgen/pktgen_sample01_simple.sh
new file mode 100755
index 000000000..a4e250b45
--- /dev/null
+++ b/samples/pktgen/pktgen_sample01_simple.sh
@@ -0,0 +1,90 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3#
4# Simple example:
5# * pktgen sending with single thread and single interface
6# * flow variation via random UDP source port
7#
8basedir=`dirname $0`
9source ${basedir}/functions.sh
10root_check_run_with_sudo "$@"
11
12# Parameter parsing via include
13# - go look in parameters.sh to see which setting are avail
14# - required param is the interface "-i" stored in $DEV
15source ${basedir}/parameters.sh
16#
17# Set some default params, if they didn't get set
18if [ -z "$DEST_IP" ]; then
19 [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
20fi
21[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
22# Example enforce param "-m" for dst_mac
23[ -z "$DST_MAC" ] && usage && err 2 "Must specify -m dst_mac"
24[ -z "$COUNT" ] && COUNT="100000" # Zero means indefinitely
25if [ -n "$DEST_IP" ]; then
26 validate_addr${IP6} $DEST_IP
27 read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
28fi
29if [ -n "$DST_PORT" ]; then
30 read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
31 validate_ports $UDP_DST_MIN $UDP_DST_MAX
32fi
33
34# Base Config
35DELAY="0" # Zero means max speed
36
37# Flow variation random source port between min and max
38UDP_SRC_MIN=9
39UDP_SRC_MAX=109
40
41# General cleanup everything since last run
42# (especially important if other threads were configured by other scripts)
43pg_ctrl "reset"
44
45# Remove all other devices and add $DEV to thread 0
46thread=0
47pg_thread $thread "rem_device_all"
48pg_thread $thread "add_device" $DEV
49
50# How many packets to send (zero means indefinitely)
51pg_set $DEV "count $COUNT"
52
53# Reduce alloc cost by sending same SKB many times
54# - this obviously affects the randomness within the packet
55pg_set $DEV "clone_skb $CLONE_SKB"
56
57# Set packet size
58pg_set $DEV "pkt_size $PKT_SIZE"
59
60# Delay between packets (zero means max speed)
61pg_set $DEV "delay $DELAY"
62
63# Flag example disabling timestamping
64pg_set $DEV "flag NO_TIMESTAMP"
65
66# Destination
67pg_set $DEV "dst_mac $DST_MAC"
68pg_set $DEV "dst${IP6}_min $DST_MIN"
69pg_set $DEV "dst${IP6}_max $DST_MAX"
70
71if [ -n "$DST_PORT" ]; then
72 # Single destination port or random port range
73 pg_set $DEV "flag UDPDST_RND"
74 pg_set $DEV "udp_dst_min $UDP_DST_MIN"
75 pg_set $DEV "udp_dst_max $UDP_DST_MAX"
76fi
77
78# Setup random UDP port src range
79pg_set $DEV "flag UDPSRC_RND"
80pg_set $DEV "udp_src_min $UDP_SRC_MIN"
81pg_set $DEV "udp_src_max $UDP_SRC_MAX"
82
83# start_run
84echo "Running... ctrl^C to stop" >&2
85pg_ctrl "start"
86echo "Done" >&2
87
88# Print results
89echo "Result device: $DEV"
90cat /proc/net/pktgen/$DEV
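
# A successful run ends with a "Result:" line in the proc output,
# similar to (numbers are illustrative):
#   Result: OK: 13101142(c12220741+d880401) usec, 100000 (60byte,0frags)
#   7632pps 3Mb/sec (3664128bps) errors: 0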
diff --git a/samples/pktgen/pktgen_sample02_multiqueue.sh b/samples/pktgen/pktgen_sample02_multiqueue.sh
new file mode 100755
index 000000000..cb2495fcd
--- /dev/null
+++ b/samples/pktgen/pktgen_sample02_multiqueue.sh
@@ -0,0 +1,95 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3#
4# Multiqueue: Using pktgen threads for sending on multiple CPUs
5# * adding devices to kernel threads
6# * notice the naming scheme for keeping device names unique
7# * naming scheme: dev@thread_number
8# * flow variation via random UDP source port
9#
10basedir=`dirname $0`
11source ${basedir}/functions.sh
12root_check_run_with_sudo "$@"
13#
14# Required param: -i dev in $DEV
15source ${basedir}/parameters.sh
16
17[ -z "$COUNT" ] && COUNT="100000" # Zero means indefinitely
18
19# Base Config
20DELAY="0" # Zero means max speed
21[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
22
23# Flow variation random source port between min and max
24UDP_SRC_MIN=9
25UDP_SRC_MAX=109
26
27# (example of setting default params in your script)
28if [ -z "$DEST_IP" ]; then
29 [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
30fi
31[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
32if [ -n "$DEST_IP" ]; then
33 validate_addr${IP6} $DEST_IP
34 read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
35fi
36if [ -n "$DST_PORT" ]; then
37 read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
38 validate_ports $UDP_DST_MIN $UDP_DST_MAX
39fi
40
41# General cleanup everything since last run
42pg_ctrl "reset"
43
44# Threads are specified with parameter -t value in $THREADS
45for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
46	# The device name is extended with @name, using the thread number
47	# to make them unique, but any name will do.
48	dev=${DEV}@${thread}
49
50	# Remove all other devices and add $dev to the thread
51 pg_thread $thread "rem_device_all"
52 pg_thread $thread "add_device" $dev
53
54 # Notice config queue to map to cpu (mirrors smp_processor_id())
55 # It is beneficial to map IRQ /proc/irq/*/smp_affinity 1:1 to CPU number
56 pg_set $dev "flag QUEUE_MAP_CPU"
57
58 # Base config of dev
59 pg_set $dev "count $COUNT"
60 pg_set $dev "clone_skb $CLONE_SKB"
61 pg_set $dev "pkt_size $PKT_SIZE"
62 pg_set $dev "delay $DELAY"
63
64 # Flag example disabling timestamping
65 pg_set $dev "flag NO_TIMESTAMP"
66
67 # Destination
68 pg_set $dev "dst_mac $DST_MAC"
69 pg_set $dev "dst${IP6}_min $DST_MIN"
70 pg_set $dev "dst${IP6}_max $DST_MAX"
71
72 if [ -n "$DST_PORT" ]; then
73 # Single destination port or random port range
74 pg_set $dev "flag UDPDST_RND"
75 pg_set $dev "udp_dst_min $UDP_DST_MIN"
76 pg_set $dev "udp_dst_max $UDP_DST_MAX"
77 fi
78
79 # Setup random UDP port src range
80 pg_set $dev "flag UDPSRC_RND"
81 pg_set $dev "udp_src_min $UDP_SRC_MIN"
82 pg_set $dev "udp_src_max $UDP_SRC_MAX"
83done
84
85# start_run
86echo "Running... ctrl^C to stop" >&2
87pg_ctrl "start"
88echo "Done" >&2
89
90# Print results
91for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
92 dev=${DEV}@${thread}
93 echo "Device: $dev"
94 cat /proc/net/pktgen/$dev | grep -A2 "Result:"
95done
diff --git a/samples/pktgen/pktgen_sample03_burst_single_flow.sh b/samples/pktgen/pktgen_sample03_burst_single_flow.sh
new file mode 100755
index 000000000..fff50765a
--- /dev/null
+++ b/samples/pktgen/pktgen_sample03_burst_single_flow.sh
@@ -0,0 +1,101 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3#
4# Script for max single flow performance
5# - If correctly tuned[1], single CPU 10G wirespeed small pkts is possible[2]
6#
7# Using pktgen "burst" option (use -b $N)
8# - To boost max performance
9# - Avail since: kernel v3.18
10# * commit 38b2cf2982dc73 ("net: pktgen: packet bursting via skb->xmit_more")
11# - This avoids writing the HW tailptr on every driver xmit
12# - The performance boost is impressive, see commit and blog [2]
13#
14# Notice: this on purpose generates a single (UDP) flow towards the
15# target; the reason is to only overload/activate a single CPU on the
16# target host. Having no randomness in pktgen also makes it faster.
17#
18# Tuning see:
19# [1] http://netoptimizer.blogspot.dk/2014/06/pktgen-for-network-overload-testing.html
20# [2] http://netoptimizer.blogspot.dk/2014/10/unlocked-10gbps-tx-wirespeed-smallest.html
21#
22basedir=`dirname $0`
23source ${basedir}/functions.sh
24root_check_run_with_sudo "$@"
25
26# Parameter parsing via include
27source ${basedir}/parameters.sh
28# Set some default params, if they didn't get set
29if [ -z "$DEST_IP" ]; then
30 [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
31fi
32[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
33[ -z "$BURST" ] && BURST=32
34[ -z "$CLONE_SKB" ] && CLONE_SKB="0" # No need for clones when bursting
35[ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely
36if [ -n "$DEST_IP" ]; then
37 validate_addr${IP6} $DEST_IP
38 read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
39fi
40if [ -n "$DST_PORT" ]; then
41 read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
42 validate_ports $UDP_DST_MIN $UDP_DST_MAX
43fi
44
45# Base Config
46DELAY="0" # Zero means max speed
47
48# General cleanup everything since last run
49pg_ctrl "reset"
50
51# Threads are specified with parameter -t value in $THREADS
52for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
53 dev=${DEV}@${thread}
54
55 # Add remove all other devices and add_device $dev to thread
56 pg_thread $thread "rem_device_all"
57 pg_thread $thread "add_device" $dev
58
59 # Base config
60 pg_set $dev "flag QUEUE_MAP_CPU"
61 pg_set $dev "count $COUNT"
62 pg_set $dev "clone_skb $CLONE_SKB"
63 pg_set $dev "pkt_size $PKT_SIZE"
64 pg_set $dev "delay $DELAY"
65 pg_set $dev "flag NO_TIMESTAMP"
66
67 # Destination
68 pg_set $dev "dst_mac $DST_MAC"
69 pg_set $dev "dst${IP6}_min $DST_MIN"
70 pg_set $dev "dst${IP6}_max $DST_MAX"
71
72 if [ -n "$DST_PORT" ]; then
73 # Single destination port or random port range
74 pg_set $dev "flag UDPDST_RND"
75 pg_set $dev "udp_dst_min $UDP_DST_MIN"
76 pg_set $dev "udp_dst_max $UDP_DST_MAX"
77 fi
78
79	# Setup burst; for easy testing, -b 0 disables bursting
80	# (internally in pktgen the default and minimum burst=1)
81 if [[ ${BURST} -ne 0 ]]; then
82 pg_set $dev "burst $BURST"
83 else
84 info "$dev: Not using burst"
85 fi
86done
87
88# Run if user hits control-c
89function control_c() {
90 # Print results
91 for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
92 dev=${DEV}@${thread}
93 echo "Device: $dev"
94 cat /proc/net/pktgen/$dev | grep -A2 "Result:"
95 done
96}
97# trap keyboard interrupt (Ctrl-C)
98trap control_c SIGINT
99
100echo "Running... ctrl^C to stop" >&2
101pg_ctrl "start"
diff --git a/samples/pktgen/pktgen_sample04_many_flows.sh b/samples/pktgen/pktgen_sample04_many_flows.sh
new file mode 100755
index 000000000..9db1ecf8d
--- /dev/null
+++ b/samples/pktgen/pktgen_sample04_many_flows.sh
@@ -0,0 +1,115 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3#
4# Script example for many flows testing
5#
6# Number of simultaneous flows limited by variable $FLOWS
7# and number of packets per flow controlled by variable $FLOWLEN
8#
9basedir=`dirname $0`
10source ${basedir}/functions.sh
11root_check_run_with_sudo "$@"
12
13# Parameter parsing via include
14source ${basedir}/parameters.sh
15# Set some default params, if they didn't get set
16if [ -z "$DEST_IP" ]; then
17 [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
18fi
19[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
20[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
21[ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely
22if [ -n "$DEST_IP" ]; then
23 validate_addr${IP6} $DEST_IP
24 read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
25fi
26if [ -n "$DST_PORT" ]; then
27 read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
28 validate_ports $UDP_DST_MIN $UDP_DST_MAX
29fi
30
31# NOTICE: Script specific settings
32# =======
33# Limiting the number of concurrent flows ($FLOWS)
34# and also set how many packets each flow contains ($FLOWLEN)
35#
36[ -z "$FLOWS" ] && FLOWS="8000"
37[ -z "$FLOWLEN" ] && FLOWLEN="10"
38
39# Base Config
40DELAY="0" # Zero means max speed
41
42if [[ -n "$BURST" ]]; then
43 err 1 "Bursting not supported for this mode"
44fi
45
46# 198.18.0.0 / 198.19.255.255
47read -r SRC_MIN SRC_MAX <<< $(parse_addr 198.18.0.0/15)
48
49# General cleanup everything since last run
50pg_ctrl "reset"
51
52# Threads are specified with parameter -t value in $THREADS
53for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
54 dev=${DEV}@${thread}
55
56 # Add remove all other devices and add_device $dev to thread
57 pg_thread $thread "rem_device_all"
58 pg_thread $thread "add_device" $dev
59
60 # Base config
61 pg_set $dev "flag QUEUE_MAP_CPU"
62 pg_set $dev "count $COUNT"
63 pg_set $dev "clone_skb $CLONE_SKB"
64 pg_set $dev "pkt_size $PKT_SIZE"
65 pg_set $dev "delay $DELAY"
66 pg_set $dev "flag NO_TIMESTAMP"
67
68 # Single destination
69 pg_set $dev "dst_mac $DST_MAC"
70 pg_set $dev "dst${IP6}_min $DST_MIN"
71 pg_set $dev "dst${IP6}_max $DST_MAX"
72
73 if [ -n "$DST_PORT" ]; then
74 # Single destination port or random port range
75 pg_set $dev "flag UDPDST_RND"
76 pg_set $dev "udp_dst_min $UDP_DST_MIN"
77 pg_set $dev "udp_dst_max $UDP_DST_MAX"
78 fi
79
80 # Randomize source IP-addresses
81 pg_set $dev "flag IPSRC_RND"
82 pg_set $dev "src_min $SRC_MIN"
83 pg_set $dev "src_max $SRC_MAX"
84
85 # Limit number of flows (max 65535)
86 pg_set $dev "flows $FLOWS"
87 #
88	# How many packets a flow will send before the flow "entry" is
89	# re-generated/set up.
90 pg_set $dev "flowlen $FLOWLEN"
91 #
92	# Flag FLOW_SEQ will cause $FLOWLEN packets from the same flow to
93	# be sent back-to-back, before the next flow is selected
94	# incrementally. This helps lookup caches, and is more realistic.
95 #
96 pg_set $dev "flag FLOW_SEQ"
97
98done
99
100# Print results (also runs after the user hits control-c)
101function print_result() {
102 # Print results
103 for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
104 dev=${DEV}@${thread}
105 echo "Device: $dev"
106 cat /proc/net/pktgen/$dev | grep -A2 "Result:"
107 done
108}
109# trap keyboard interrupt (Ctrl-C)
110trap true SIGINT
111
112echo "Running... ctrl^C to stop" >&2
113pg_ctrl "start"
114
115print_result
diff --git a/samples/pktgen/pktgen_sample05_flow_per_thread.sh b/samples/pktgen/pktgen_sample05_flow_per_thread.sh
new file mode 100755
index 000000000..9fc6c6da0
--- /dev/null
+++ b/samples/pktgen/pktgen_sample05_flow_per_thread.sh
@@ -0,0 +1,99 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3#
4# Script will generate one flow per thread (-t N)
5# - Same destination IP
6# - Fake source IPs for each flow (fixed based on thread number)
7#
8# Useful for scale testing on receiver, to see whether silo'ing flows
9# works and scales. For optimal scalability (on receiver) each
10# separate-flow should not access shared variables/data. This script
11# helps magnify any of these scaling issues by overloading the receiver.
12#
13basedir=`dirname $0`
14source ${basedir}/functions.sh
15root_check_run_with_sudo "$@"
16
17# Parameter parsing via include
18source ${basedir}/parameters.sh
19# Set some default params, if they didn't get set
20if [ -z "$DEST_IP" ]; then
21 [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
22fi
23[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
24[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
25[ -z "$BURST" ] && BURST=32
26[ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely
27if [ -n "$DEST_IP" ]; then
28 validate_addr${IP6} $DEST_IP
29 read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
30fi
31if [ -n "$DST_PORT" ]; then
32 read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
33 validate_ports $UDP_DST_MIN $UDP_DST_MAX
34fi
35
36# Base Config
37DELAY="0" # Zero means max speed
38
39# General cleanup everything since last run
40pg_ctrl "reset"
41
42# Threads are specified with parameter -t value in $THREADS
43for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
44 dev=${DEV}@${thread}
45
46	# Remove all other devices and add $dev to the thread
47 pg_thread $thread "rem_device_all"
48 pg_thread $thread "add_device" $dev
49
50 # Base config
51 pg_set $dev "flag QUEUE_MAP_CPU"
52 pg_set $dev "count $COUNT"
53 pg_set $dev "clone_skb $CLONE_SKB"
54 pg_set $dev "pkt_size $PKT_SIZE"
55 pg_set $dev "delay $DELAY"
56 pg_set $dev "flag NO_TIMESTAMP"
57
58 # Single destination
59 pg_set $dev "dst_mac $DST_MAC"
60 pg_set $dev "dst${IP6}_min $DST_MIN"
61 pg_set $dev "dst${IP6}_max $DST_MAX"
62
63 if [ -n "$DST_PORT" ]; then
64 # Single destination port or random port range
65 pg_set $dev "flag UDPDST_RND"
66 pg_set $dev "udp_dst_min $UDP_DST_MIN"
67 pg_set $dev "udp_dst_max $UDP_DST_MAX"
68 fi
69
70 # Setup source IP-addresses based on thread number
71 pg_set $dev "src_min 198.18.$((thread+1)).1"
72 pg_set $dev "src_max 198.18.$((thread+1)).1"
73
74	# Setup burst; for easy testing, -b 0 disables bursting
75	# (internally in pktgen the default and minimum burst=1)
76 if [[ ${BURST} -ne 0 ]]; then
77 pg_set $dev "burst $BURST"
78 else
79 info "$dev: Not using burst"
80 fi
81
82done
83
84# Print results (also runs after the user hits control-c)
85function print_result() {
86 # Print results
87 for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do
88 dev=${DEV}@${thread}
89 echo "Device: $dev"
90 cat /proc/net/pktgen/$dev | grep -A2 "Result:"
91 done
92}
93# trap keyboard interrupt (Ctrl-C)
94trap true SIGINT
95
96echo "Running... ctrl^C to stop" >&2
97pg_ctrl "start"
98
99print_result
diff --git a/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh
new file mode 100755
index 000000000..728106060
--- /dev/null
+++ b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh
@@ -0,0 +1,113 @@
1#!/bin/bash
2#
3# Multiqueue: Using pktgen threads for sending on multiple CPUs
4# * adding devices to kernel threads which are in the same NUMA node
5# * bind the device queues' irq affinity to the threads, 1:1 mapping
6# * notice the naming scheme for keeping device names unique
7# * naming scheme: dev@thread_number
8# * flow variation via random UDP source port
9#
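# The NUMA node of the NIC can be checked up front, e.g. (the device
# name is an assumption):
#   cat /sys/class/net/eth0/device/numa_node
#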
10basedir=`dirname $0`
11source ${basedir}/functions.sh
12root_check_run_with_sudo "$@"
13#
14# Required param: -i dev in $DEV
15source ${basedir}/parameters.sh
16
17# Base Config
18DELAY="0" # Zero means max speed
19[ -z "$COUNT" ] && COUNT="20000000" # Zero means indefinitely
20[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
21
22# Flow variation random source port between min and max
23UDP_SRC_MIN=9
24UDP_SRC_MAX=109
25
26node=`get_iface_node $DEV`
27irq_array=(`get_iface_irqs $DEV`)
28cpu_array=(`get_node_cpus $node`)
29
30[ $THREADS -gt ${#irq_array[*]} -o $THREADS -gt ${#cpu_array[*]} ] && \
31 err 1 "Thread count $THREADS exceeds min(${#irq_array[*]},${#cpu_array[*]}) available IRQs/CPUs"
32
33# (example of setting default params in your script)
34if [ -z "$DEST_IP" ]; then
35 [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
36fi
37[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
38if [ -n "$DEST_IP" ]; then
39 validate_addr${IP6} $DEST_IP
40 read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP)
41fi
42if [ -n "$DST_PORT" ]; then
43 read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT)
44 validate_ports $UDP_DST_MIN $UDP_DST_MAX
45fi
46
47# General cleanup: reset everything from the last run
48pg_ctrl "reset"
49
50# Threads are specified with the -t parameter and stored in $THREADS
51for ((i = 0; i < $THREADS; i++)); do
52 # The device name is extended with @name, using thread number to
53 # make them unique, but any name will do.
54 # Set the queue's irq affinity to this $thread (processor)
55 # if '-f' is given, offset the cpu id accordingly
56 thread=${cpu_array[$((i+F_THREAD))]}
57 dev=${DEV}@${thread}
58 echo $thread > /proc/irq/${irq_array[$i]}/smp_affinity_list
59 info "irq ${irq_array[$i]} affinity set to `cat /proc/irq/${irq_array[$i]}/smp_affinity_list`"
60
61 # Remove all other devices from the thread, then add $dev to it
62 pg_thread $thread "rem_device_all"
63 pg_thread $thread "add_device" $dev
64
65 # select queue and bind the queue and $dev in 1:1 relationship
66 queue_num=$i
67 info "queue number is $queue_num"
68 pg_set $dev "queue_map_min $queue_num"
69 pg_set $dev "queue_map_max $queue_num"
70
71 # Configure the queue to map to the cpu (mirrors smp_processor_id())
72 # It is beneficial to map IRQ /proc/irq/*/smp_affinity 1:1 to CPU number
73 pg_set $dev "flag QUEUE_MAP_CPU"
74
75 # Base config of dev
76 pg_set $dev "count $COUNT"
77 pg_set $dev "clone_skb $CLONE_SKB"
78 pg_set $dev "pkt_size $PKT_SIZE"
79 pg_set $dev "delay $DELAY"
80
81 # Flag example disabling timestamping
82 pg_set $dev "flag NO_TIMESTAMP"
83
84 # Destination
85 pg_set $dev "dst_mac $DST_MAC"
86 pg_set $dev "dst${IP6}_min $DST_MIN"
87 pg_set $dev "dst${IP6}_max $DST_MAX"
88
89 if [ -n "$DST_PORT" ]; then
90 # Single destination port or random port range
91 pg_set $dev "flag UDPDST_RND"
92 pg_set $dev "udp_dst_min $UDP_DST_MIN"
93 pg_set $dev "udp_dst_max $UDP_DST_MAX"
94 fi
95
96 # Setup random UDP port src range
97 pg_set $dev "flag UDPSRC_RND"
98 pg_set $dev "udp_src_min $UDP_SRC_MIN"
99 pg_set $dev "udp_src_max $UDP_SRC_MAX"
100done
101
102# start_run
103echo "Running... ctrl^C to stop" >&2
104pg_ctrl "start"
105echo "Done" >&2
106
107# Print results
108for ((i = 0; i < $THREADS; i++)); do
109 thread=${cpu_array[$((i+F_THREAD))]}
110 dev=${DEV}@${thread}
111 echo "Device: $dev"
112 cat /proc/net/pktgen/$dev | grep -A2 "Result:"
113done
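
The NUMA-aware part of the script above boils down to one write per queue IRQ: put the chosen CPU id into /proc/irq/<irq>/smp_affinity_list before binding the queue to the same CPU. A minimal sketch of that single step, with a hypothetical IRQ number and CPU id (the script derives the real ones from get_iface_irqs() and get_node_cpus()):

#include <stdio.h>

int main(void)
{
	/* Hypothetical values: pin IRQ 42 to CPU 3. Needs root. */
	const char *path = "/proc/irq/42/smp_affinity_list";
	char buf[64];
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "3\n");
	fclose(f);

	f = fopen(path, "r");		/* read back, as the info line does */
	if (f && fgets(buf, sizeof(buf), f))
		printf("irq 42 now on CPU(s) %s", buf);
	if (f)
		fclose(f);
	return 0;
}
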
diff --git a/samples/qmi/Makefile b/samples/qmi/Makefile
new file mode 100644
index 000000000..641943d40
--- /dev/null
+++ b/samples/qmi/Makefile
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0-only
2obj-$(CONFIG_SAMPLE_QMI_CLIENT) += qmi_sample_client.o
diff --git a/samples/qmi/qmi_sample_client.c b/samples/qmi/qmi_sample_client.c
new file mode 100644
index 000000000..c9e7276c3
--- /dev/null
+++ b/samples/qmi/qmi_sample_client.c
@@ -0,0 +1,622 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Sample in-kernel QMI client driver
4 *
5 * Copyright (c) 2013-2014, The Linux Foundation. All rights reserved.
6 * Copyright (C) 2017 Linaro Ltd.
7 */
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/debugfs.h>
11#include <linux/device.h>
12#include <linux/platform_device.h>
13#include <linux/qrtr.h>
14#include <linux/net.h>
15#include <linux/completion.h>
16#include <linux/idr.h>
17#include <linux/string.h>
18#include <net/sock.h>
19#include <linux/soc/qcom/qmi.h>
20
21#define PING_REQ1_TLV_TYPE 0x1
22#define PING_RESP1_TLV_TYPE 0x2
23#define PING_OPT1_TLV_TYPE 0x10
24#define PING_OPT2_TLV_TYPE 0x11
25
26#define DATA_REQ1_TLV_TYPE 0x1
27#define DATA_RESP1_TLV_TYPE 0x2
28#define DATA_OPT1_TLV_TYPE 0x10
29#define DATA_OPT2_TLV_TYPE 0x11
30
31#define TEST_MED_DATA_SIZE_V01 8192
32#define TEST_MAX_NAME_SIZE_V01 255
33
34#define TEST_PING_REQ_MSG_ID_V01 0x20
35#define TEST_DATA_REQ_MSG_ID_V01 0x21
36
37#define TEST_PING_REQ_MAX_MSG_LEN_V01 266
38#define TEST_DATA_REQ_MAX_MSG_LEN_V01 8456
39
40struct test_name_type_v01 {
41 u32 name_len;
42 char name[TEST_MAX_NAME_SIZE_V01];
43};
44
45static struct qmi_elem_info test_name_type_v01_ei[] = {
46 {
47 .data_type = QMI_DATA_LEN,
48 .elem_len = 1,
49 .elem_size = sizeof(u8),
50 .array_type = NO_ARRAY,
51 .tlv_type = QMI_COMMON_TLV_TYPE,
52 .offset = offsetof(struct test_name_type_v01,
53 name_len),
54 },
55 {
56 .data_type = QMI_UNSIGNED_1_BYTE,
57 .elem_len = TEST_MAX_NAME_SIZE_V01,
58 .elem_size = sizeof(char),
59 .array_type = VAR_LEN_ARRAY,
60 .tlv_type = QMI_COMMON_TLV_TYPE,
61 .offset = offsetof(struct test_name_type_v01,
62 name),
63 },
64 {}
65};
66
67struct test_ping_req_msg_v01 {
68 char ping[4];
69
70 u8 client_name_valid;
71 struct test_name_type_v01 client_name;
72};
73
74static struct qmi_elem_info test_ping_req_msg_v01_ei[] = {
75 {
76 .data_type = QMI_UNSIGNED_1_BYTE,
77 .elem_len = 4,
78 .elem_size = sizeof(char),
79 .array_type = STATIC_ARRAY,
80 .tlv_type = PING_REQ1_TLV_TYPE,
81 .offset = offsetof(struct test_ping_req_msg_v01,
82 ping),
83 },
84 {
85 .data_type = QMI_OPT_FLAG,
86 .elem_len = 1,
87 .elem_size = sizeof(u8),
88 .array_type = NO_ARRAY,
89 .tlv_type = PING_OPT1_TLV_TYPE,
90 .offset = offsetof(struct test_ping_req_msg_v01,
91 client_name_valid),
92 },
93 {
94 .data_type = QMI_STRUCT,
95 .elem_len = 1,
96 .elem_size = sizeof(struct test_name_type_v01),
97 .array_type = NO_ARRAY,
98 .tlv_type = PING_OPT1_TLV_TYPE,
99 .offset = offsetof(struct test_ping_req_msg_v01,
100 client_name),
101 .ei_array = test_name_type_v01_ei,
102 },
103 {}
104};
105
106struct test_ping_resp_msg_v01 {
107 struct qmi_response_type_v01 resp;
108
109 u8 pong_valid;
110 char pong[4];
111
112 u8 service_name_valid;
113 struct test_name_type_v01 service_name;
114};
115
116static struct qmi_elem_info test_ping_resp_msg_v01_ei[] = {
117 {
118 .data_type = QMI_STRUCT,
119 .elem_len = 1,
120 .elem_size = sizeof(struct qmi_response_type_v01),
121 .array_type = NO_ARRAY,
122 .tlv_type = PING_RESP1_TLV_TYPE,
123 .offset = offsetof(struct test_ping_resp_msg_v01,
124 resp),
125 .ei_array = qmi_response_type_v01_ei,
126 },
127 {
128 .data_type = QMI_OPT_FLAG,
129 .elem_len = 1,
130 .elem_size = sizeof(u8),
131 .array_type = NO_ARRAY,
132 .tlv_type = PING_OPT1_TLV_TYPE,
133 .offset = offsetof(struct test_ping_resp_msg_v01,
134 pong_valid),
135 },
136 {
137 .data_type = QMI_UNSIGNED_1_BYTE,
138 .elem_len = 4,
139 .elem_size = sizeof(char),
140 .array_type = STATIC_ARRAY,
141 .tlv_type = PING_OPT1_TLV_TYPE,
142 .offset = offsetof(struct test_ping_resp_msg_v01,
143 pong),
144 },
145 {
146 .data_type = QMI_OPT_FLAG,
147 .elem_len = 1,
148 .elem_size = sizeof(u8),
149 .array_type = NO_ARRAY,
150 .tlv_type = PING_OPT2_TLV_TYPE,
151 .offset = offsetof(struct test_ping_resp_msg_v01,
152 service_name_valid),
153 },
154 {
155 .data_type = QMI_STRUCT,
156 .elem_len = 1,
157 .elem_size = sizeof(struct test_name_type_v01),
158 .array_type = NO_ARRAY,
159 .tlv_type = PING_OPT2_TLV_TYPE,
160 .offset = offsetof(struct test_ping_resp_msg_v01,
161 service_name),
162 .ei_array = test_name_type_v01_ei,
163 },
164 {}
165};
166
167struct test_data_req_msg_v01 {
168 u32 data_len;
169 u8 data[TEST_MED_DATA_SIZE_V01];
170
171 u8 client_name_valid;
172 struct test_name_type_v01 client_name;
173};
174
175static struct qmi_elem_info test_data_req_msg_v01_ei[] = {
176 {
177 .data_type = QMI_DATA_LEN,
178 .elem_len = 1,
179 .elem_size = sizeof(u32),
180 .array_type = NO_ARRAY,
181 .tlv_type = DATA_REQ1_TLV_TYPE,
182 .offset = offsetof(struct test_data_req_msg_v01,
183 data_len),
184 },
185 {
186 .data_type = QMI_UNSIGNED_1_BYTE,
187 .elem_len = TEST_MED_DATA_SIZE_V01,
188 .elem_size = sizeof(u8),
189 .array_type = VAR_LEN_ARRAY,
190 .tlv_type = DATA_REQ1_TLV_TYPE,
191 .offset = offsetof(struct test_data_req_msg_v01,
192 data),
193 },
194 {
195 .data_type = QMI_OPT_FLAG,
196 .elem_len = 1,
197 .elem_size = sizeof(u8),
198 .array_type = NO_ARRAY,
199 .tlv_type = DATA_OPT1_TLV_TYPE,
200 .offset = offsetof(struct test_data_req_msg_v01,
201 client_name_valid),
202 },
203 {
204 .data_type = QMI_STRUCT,
205 .elem_len = 1,
206 .elem_size = sizeof(struct test_name_type_v01),
207 .array_type = NO_ARRAY,
208 .tlv_type = DATA_OPT1_TLV_TYPE,
209 .offset = offsetof(struct test_data_req_msg_v01,
210 client_name),
211 .ei_array = test_name_type_v01_ei,
212 },
213 {}
214};
215
216struct test_data_resp_msg_v01 {
217 struct qmi_response_type_v01 resp;
218
219 u8 data_valid;
220 u32 data_len;
221 u8 data[TEST_MED_DATA_SIZE_V01];
222
223 u8 service_name_valid;
224 struct test_name_type_v01 service_name;
225};
226
227static struct qmi_elem_info test_data_resp_msg_v01_ei[] = {
228 {
229 .data_type = QMI_STRUCT,
230 .elem_len = 1,
231 .elem_size = sizeof(struct qmi_response_type_v01),
232 .array_type = NO_ARRAY,
233 .tlv_type = DATA_RESP1_TLV_TYPE,
234 .offset = offsetof(struct test_data_resp_msg_v01,
235 resp),
236 .ei_array = qmi_response_type_v01_ei,
237 },
238 {
239 .data_type = QMI_OPT_FLAG,
240 .elem_len = 1,
241 .elem_size = sizeof(u8),
242 .array_type = NO_ARRAY,
243 .tlv_type = DATA_OPT1_TLV_TYPE,
244 .offset = offsetof(struct test_data_resp_msg_v01,
245 data_valid),
246 },
247 {
248 .data_type = QMI_DATA_LEN,
249 .elem_len = 1,
250 .elem_size = sizeof(u32),
251 .array_type = NO_ARRAY,
252 .tlv_type = DATA_OPT1_TLV_TYPE,
253 .offset = offsetof(struct test_data_resp_msg_v01,
254 data_len),
255 },
256 {
257 .data_type = QMI_UNSIGNED_1_BYTE,
258 .elem_len = TEST_MED_DATA_SIZE_V01,
259 .elem_size = sizeof(u8),
260 .array_type = VAR_LEN_ARRAY,
261 .tlv_type = DATA_OPT1_TLV_TYPE,
262 .offset = offsetof(struct test_data_resp_msg_v01,
263 data),
264 },
265 {
266 .data_type = QMI_OPT_FLAG,
267 .elem_len = 1,
268 .elem_size = sizeof(u8),
269 .array_type = NO_ARRAY,
270 .tlv_type = DATA_OPT2_TLV_TYPE,
271 .offset = offsetof(struct test_data_resp_msg_v01,
272 service_name_valid),
273 },
274 {
275 .data_type = QMI_STRUCT,
276 .elem_len = 1,
277 .elem_size = sizeof(struct test_name_type_v01),
278 .array_type = NO_ARRAY,
279 .tlv_type = DATA_OPT2_TLV_TYPE,
280 .offset = offsetof(struct test_data_resp_msg_v01,
281 service_name),
282 .ei_array = test_name_type_v01_ei,
283 },
284 {}
285};
286
287/*
288 * ping_write() - ping_pong debugfs file write handler
289 * @file: debugfs file context
290 * @user_buf: reference to the user data (ignored)
291 * @count: number of bytes in @user_buf
292 * @ppos: offset in @file to write
293 *
294 * This function allows user space to send out a ping_pong QMI encoded message
295 * to the associated remote test service and will return with the result of the
296 * transaction. It serves as an example of how to provide a custom response
297 * handler.
298 *
299 * Return: @count, or negative errno on failure.
300 */
301static ssize_t ping_write(struct file *file, const char __user *user_buf,
302 size_t count, loff_t *ppos)
303{
304 struct qmi_handle *qmi = file->private_data;
305 struct test_ping_req_msg_v01 req = {};
306 struct qmi_txn txn;
307 int ret;
308
309 memcpy(req.ping, "ping", sizeof(req.ping));
310
311 ret = qmi_txn_init(qmi, &txn, NULL, NULL);
312 if (ret < 0)
313 return ret;
314
315 ret = qmi_send_request(qmi, NULL, &txn,
316 TEST_PING_REQ_MSG_ID_V01,
317 TEST_PING_REQ_MAX_MSG_LEN_V01,
318 test_ping_req_msg_v01_ei, &req);
319 if (ret < 0) {
320 qmi_txn_cancel(&txn);
321 return ret;
322 }
323
324 ret = qmi_txn_wait(&txn, 5 * HZ);
325 if (ret < 0)
326 count = ret;
327
328 return count;
329}
330
331static const struct file_operations ping_fops = {
332 .open = simple_open,
333 .write = ping_write,
334};
335
336static void ping_pong_cb(struct qmi_handle *qmi, struct sockaddr_qrtr *sq,
337 struct qmi_txn *txn, const void *data)
338{
339 const struct test_ping_resp_msg_v01 *resp = data;
340
341 if (!txn) {
342 pr_err("spurious ping response\n");
343 return;
344 }
345
346 if (resp->resp.result == QMI_RESULT_FAILURE_V01)
347 txn->result = -ENXIO;
348 else if (!resp->pong_valid || memcmp(resp->pong, "pong", 4))
349 txn->result = -EINVAL;
350
351 complete(&txn->completion);
352}
353
354/*
355 * data_write() - data debugfs file write handler
356 * @file: debugfs file context
357 * @user_buf: reference to the user data
358 * @count: number of bytes in @user_buf
359 * @ppos: offset in @file to write
360 *
361 * This function allows user space to send out a data QMI encoded message to
362 * the associated remote test service and will return with the result of the
363 * transaction. It serves as an example of how to have the QMI helpers decode a
364 * transaction response into a provided object automatically.
365 *
366 * Return: @count, or negative errno on failure.
367 */
368static ssize_t data_write(struct file *file, const char __user *user_buf,
369 size_t count, loff_t *ppos)
370
371{
372 struct qmi_handle *qmi = file->private_data;
373 struct test_data_resp_msg_v01 *resp;
374 struct test_data_req_msg_v01 *req;
375 struct qmi_txn txn;
376 int ret;
377
378 req = kzalloc(sizeof(*req), GFP_KERNEL);
379 if (!req)
380 return -ENOMEM;
381
382 resp = kzalloc(sizeof(*resp), GFP_KERNEL);
383 if (!resp) {
384 kfree(req);
385 return -ENOMEM;
386 }
387
388 req->data_len = min_t(size_t, sizeof(req->data), count);
389 if (copy_from_user(req->data, user_buf, req->data_len)) {
390 ret = -EFAULT;
391 goto out;
392 }
393
394 ret = qmi_txn_init(qmi, &txn, test_data_resp_msg_v01_ei, resp);
395 if (ret < 0)
396 goto out;
397
398 ret = qmi_send_request(qmi, NULL, &txn,
399 TEST_DATA_REQ_MSG_ID_V01,
400 TEST_DATA_REQ_MAX_MSG_LEN_V01,
401 test_data_req_msg_v01_ei, req);
402 if (ret < 0) {
403 qmi_txn_cancel(&txn);
404 goto out;
405 }
406
407 ret = qmi_txn_wait(&txn, 5 * HZ);
408 if (ret < 0) {
409 goto out;
410 } else if (!resp->data_valid ||
411 resp->data_len != req->data_len ||
412 memcmp(resp->data, req->data, req->data_len)) {
413 pr_err("response data doesn't match expectation\n");
414 ret = -EINVAL;
415 goto out;
416 }
417
418 ret = count;
419
420out:
421 kfree(resp);
422 kfree(req);
423
424 return ret;
425}
426
427static const struct file_operations data_fops = {
428 .open = simple_open,
429 .write = data_write,
430};
431
432static struct qmi_msg_handler qmi_sample_handlers[] = {
433 {
434 .type = QMI_RESPONSE,
435 .msg_id = TEST_PING_REQ_MSG_ID_V01,
436 .ei = test_ping_resp_msg_v01_ei,
437 .decoded_size = sizeof(struct test_ping_resp_msg_v01),
438 .fn = ping_pong_cb
439 },
440 {}
441};
442
443struct qmi_sample {
444 struct qmi_handle qmi;
445
446 struct dentry *de_dir;
447 struct dentry *de_data;
448 struct dentry *de_ping;
449};
450
451static struct dentry *qmi_debug_dir;
452
453static int qmi_sample_probe(struct platform_device *pdev)
454{
455 struct sockaddr_qrtr *sq;
456 struct qmi_sample *sample;
457 char path[20];
458 int ret;
459
460 sample = devm_kzalloc(&pdev->dev, sizeof(*sample), GFP_KERNEL);
461 if (!sample)
462 return -ENOMEM;
463
464 ret = qmi_handle_init(&sample->qmi, TEST_DATA_REQ_MAX_MSG_LEN_V01,
465 NULL,
466 qmi_sample_handlers);
467 if (ret < 0)
468 return ret;
469
470 sq = dev_get_platdata(&pdev->dev);
471 ret = kernel_connect(sample->qmi.sock, (struct sockaddr *)sq,
472 sizeof(*sq), 0);
473 if (ret < 0) {
474 pr_err("failed to connect to remote service port\n");
475 goto err_release_qmi_handle;
476 }
477
478 snprintf(path, sizeof(path), "%d:%d", sq->sq_node, sq->sq_port);
479
480 sample->de_dir = debugfs_create_dir(path, qmi_debug_dir);
481 if (IS_ERR(sample->de_dir)) {
482 ret = PTR_ERR(sample->de_dir);
483 goto err_release_qmi_handle;
484 }
485
486 sample->de_data = debugfs_create_file("data", 0600, sample->de_dir,
487 sample, &data_fops);
488 if (IS_ERR(sample->de_data)) {
489 ret = PTR_ERR(sample->de_data);
490 goto err_remove_de_dir;
491 }
492
493 sample->de_ping = debugfs_create_file("ping", 0600, sample->de_dir,
494 sample, &ping_fops);
495 if (IS_ERR(sample->de_ping)) {
496 ret = PTR_ERR(sample->de_ping);
497 goto err_remove_de_data;
498 }
499
500 platform_set_drvdata(pdev, sample);
501
502 return 0;
503
504err_remove_de_data:
505 debugfs_remove(sample->de_data);
506err_remove_de_dir:
507 debugfs_remove(sample->de_dir);
508err_release_qmi_handle:
509 qmi_handle_release(&sample->qmi);
510
511 return ret;
512}
513
514static int qmi_sample_remove(struct platform_device *pdev)
515{
516 struct qmi_sample *sample = platform_get_drvdata(pdev);
517
518 debugfs_remove(sample->de_ping);
519 debugfs_remove(sample->de_data);
520 debugfs_remove(sample->de_dir);
521
522 qmi_handle_release(&sample->qmi);
523
524 return 0;
525}
526
527static struct platform_driver qmi_sample_driver = {
528 .probe = qmi_sample_probe,
529 .remove = qmi_sample_remove,
530 .driver = {
531 .name = "qmi_sample_client",
532 },
533};
534
535static int qmi_sample_new_server(struct qmi_handle *qmi,
536 struct qmi_service *service)
537{
538 struct platform_device *pdev;
539 struct sockaddr_qrtr sq = { AF_QIPCRTR, service->node, service->port };
540 int ret;
541
542 pdev = platform_device_alloc("qmi_sample_client", PLATFORM_DEVID_AUTO);
543 if (!pdev)
544 return -ENOMEM;
545
546 ret = platform_device_add_data(pdev, &sq, sizeof(sq));
547 if (ret)
548 goto err_put_device;
549
550 ret = platform_device_add(pdev);
551 if (ret)
552 goto err_put_device;
553
554 service->priv = pdev;
555
556 return 0;
557
558err_put_device:
559 platform_device_put(pdev);
560
561 return ret;
562}
563
564static void qmi_sample_del_server(struct qmi_handle *qmi,
565 struct qmi_service *service)
566{
567 struct platform_device *pdev = service->priv;
568
569 platform_device_unregister(pdev);
570}
571
572static struct qmi_handle lookup_client;
573
574static struct qmi_ops lookup_ops = {
575 .new_server = qmi_sample_new_server,
576 .del_server = qmi_sample_del_server,
577};
578
579static int qmi_sample_init(void)
580{
581 int ret;
582
583 qmi_debug_dir = debugfs_create_dir("qmi_sample", NULL);
584 if (IS_ERR(qmi_debug_dir)) {
585 pr_err("failed to create qmi_sample dir\n");
586 return PTR_ERR(qmi_debug_dir);
587 }
588
589 ret = platform_driver_register(&qmi_sample_driver);
590 if (ret)
591 goto err_remove_debug_dir;
592
593 ret = qmi_handle_init(&lookup_client, 0, &lookup_ops, NULL);
594 if (ret < 0)
595 goto err_unregister_driver;
596
597 qmi_add_lookup(&lookup_client, 15, 0, 0);
598
599 return 0;
600
601err_unregister_driver:
602 platform_driver_unregister(&qmi_sample_driver);
603err_remove_debug_dir:
604 debugfs_remove(qmi_debug_dir);
605
606 return ret;
607}
608
609static void qmi_sample_exit(void)
610{
611 qmi_handle_release(&lookup_client);
612
613 platform_driver_unregister(&qmi_sample_driver);
614
615 debugfs_remove(qmi_debug_dir);
616}
617
618module_init(qmi_sample_init);
619module_exit(qmi_sample_exit);
620
621MODULE_DESCRIPTION("Sample QMI client driver");
622MODULE_LICENSE("GPL v2");
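
Both ping_write() and data_write() above follow the same transaction lifecycle: qmi_txn_init(), qmi_send_request(), then either qmi_txn_wait() or qmi_txn_cancel() if the send fails. A condensed, hedged sketch of that pattern; the MY_*/my_* names are hypothetical placeholders, not part of the sample:

#define MY_MSG_ID	0x20	/* hypothetical message id */
#define MY_MAX_MSG_LEN	256	/* hypothetical encoded-size bound */

extern struct qmi_elem_info my_req_ei[], my_resp_ei[];	/* hypothetical */

static int my_transaction(struct qmi_handle *qmi, void *req, void *resp)
{
	struct qmi_txn txn;
	int ret;

	/* Arrange for the matching response to be decoded into 'resp';
	 * passing NULL, NULL instead means a custom callback (like
	 * ping_pong_cb above) completes the txn by hand. */
	ret = qmi_txn_init(qmi, &txn, my_resp_ei, resp);
	if (ret < 0)
		return ret;

	ret = qmi_send_request(qmi, NULL, &txn, MY_MSG_ID,
			       MY_MAX_MSG_LEN, my_req_ei, req);
	if (ret < 0) {
		qmi_txn_cancel(&txn);	/* undo qmi_txn_init() */
		return ret;
	}

	return qmi_txn_wait(&txn, 5 * HZ);	/* bounded wait, as above */
}
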
diff --git a/samples/rpmsg/Makefile b/samples/rpmsg/Makefile
new file mode 100644
index 000000000..ddf9a5d13
--- /dev/null
+++ b/samples/rpmsg/Makefile
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0-only
2obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg_client_sample.o
diff --git a/samples/rpmsg/rpmsg_client_sample.c b/samples/rpmsg/rpmsg_client_sample.c
new file mode 100644
index 000000000..ae5081662
--- /dev/null
+++ b/samples/rpmsg/rpmsg_client_sample.c
@@ -0,0 +1,96 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Remote processor messaging - sample client driver
4 *
5 * Copyright (C) 2011 Texas Instruments, Inc.
6 * Copyright (C) 2011 Google, Inc.
7 *
8 * Ohad Ben-Cohen <ohad@wizery.com>
9 * Brian Swetland <swetland@google.com>
10 */
11
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/rpmsg.h>
15
16#define MSG "hello world!"
17
18static int count = 100;
19module_param(count, int, 0644);
20
21struct instance_data {
22 int rx_count;
23};
24
25static int rpmsg_sample_cb(struct rpmsg_device *rpdev, void *data, int len,
26 void *priv, u32 src)
27{
28 int ret;
29 struct instance_data *idata = dev_get_drvdata(&rpdev->dev);
30
31 dev_info(&rpdev->dev, "incoming msg %d (src: 0x%x)\n",
32 ++idata->rx_count, src);
33
34 print_hex_dump_debug(__func__, DUMP_PREFIX_NONE, 16, 1, data, len,
35 true);
36
37 /* samples should not live forever */
38 if (idata->rx_count >= count) {
39 dev_info(&rpdev->dev, "goodbye!\n");
40 return 0;
41 }
42
43 /* send a new message now */
44 ret = rpmsg_send(rpdev->ept, MSG, strlen(MSG));
45 if (ret)
46 dev_err(&rpdev->dev, "rpmsg_send failed: %d\n", ret);
47
48 return 0;
49}
50
51static int rpmsg_sample_probe(struct rpmsg_device *rpdev)
52{
53 int ret;
54 struct instance_data *idata;
55
56 dev_info(&rpdev->dev, "new channel: 0x%x -> 0x%x!\n",
57 rpdev->src, rpdev->dst);
58
59 idata = devm_kzalloc(&rpdev->dev, sizeof(*idata), GFP_KERNEL);
60 if (!idata)
61 return -ENOMEM;
62
63 dev_set_drvdata(&rpdev->dev, idata);
64
65 /* send a message to our remote processor */
66 ret = rpmsg_send(rpdev->ept, MSG, strlen(MSG));
67 if (ret) {
68 dev_err(&rpdev->dev, "rpmsg_send failed: %d\n", ret);
69 return ret;
70 }
71
72 return 0;
73}
74
75static void rpmsg_sample_remove(struct rpmsg_device *rpdev)
76{
77 dev_info(&rpdev->dev, "rpmsg sample client driver is removed\n");
78}
79
80static struct rpmsg_device_id rpmsg_driver_sample_id_table[] = {
81 { .name = "rpmsg-client-sample" },
82 { },
83};
84MODULE_DEVICE_TABLE(rpmsg, rpmsg_driver_sample_id_table);
85
86static struct rpmsg_driver rpmsg_sample_client = {
87 .drv.name = KBUILD_MODNAME,
88 .id_table = rpmsg_driver_sample_id_table,
89 .probe = rpmsg_sample_probe,
90 .callback = rpmsg_sample_cb,
91 .remove = rpmsg_sample_remove,
92};
93module_rpmsg_driver(rpmsg_sample_client);
94
95MODULE_DESCRIPTION("Remote processor messaging sample client driver");
96MODULE_LICENSE("GPL v2");
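
The per-channel rx counter works because an rpmsg_device embeds an ordinary struct device, so state can hang off drvdata and be fetched again in the callback. A generic sketch of that idiom (my_state and my_probe are hypothetical names):

#include <linux/device.h>
#include <linux/slab.h>

struct my_state {		/* hypothetical per-device state */
	int rx_count;
};

static int my_probe(struct device *dev)
{
	struct my_state *s = devm_kzalloc(dev, sizeof(*s), GFP_KERNEL);

	if (!s)
		return -ENOMEM;
	/* Stored here, fetched in later callbacks with dev_get_drvdata(),
	 * exactly as rpmsg_sample_cb() does above; the devm_ allocation
	 * is freed automatically on device removal. */
	dev_set_drvdata(dev, s);
	return 0;
}
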
diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore
new file mode 100644
index 000000000..4a5a5b7db
--- /dev/null
+++ b/samples/seccomp/.gitignore
@@ -0,0 +1,5 @@
1# SPDX-License-Identifier: GPL-2.0-only
2bpf-direct
3bpf-fancy
4dropper
5user-trap
diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile
new file mode 100644
index 000000000..c85ae0ed8
--- /dev/null
+++ b/samples/seccomp/Makefile
@@ -0,0 +1,6 @@
1# SPDX-License-Identifier: GPL-2.0
2userprogs-always-y += bpf-fancy dropper bpf-direct user-trap
3
4bpf-fancy-objs := bpf-fancy.o bpf-helper.o
5
6userccflags += -I usr/include
diff --git a/samples/seccomp/bpf-direct.c b/samples/seccomp/bpf-direct.c
new file mode 100644
index 000000000..c09e4a17a
--- /dev/null
+++ b/samples/seccomp/bpf-direct.c
@@ -0,0 +1,191 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Seccomp filter example for x86 (32-bit and 64-bit) with BPF macros
4 *
5 * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
6 * Author: Will Drewry <wad@chromium.org>
7 *
8 * The code may be used by anyone for any purpose,
9 * and can serve as a starting point for developing
10 * applications using prctl(PR_SET_SECCOMP, 2, ...).
11 */
12#if defined(__i386__) || defined(__x86_64__)
13#define SUPPORTED_ARCH 1
14#endif
15
16#if defined(SUPPORTED_ARCH)
17#define __USE_GNU 1
18#define _GNU_SOURCE 1
19
20#include <linux/types.h>
21#include <linux/filter.h>
22#include <linux/seccomp.h>
23#include <linux/unistd.h>
24#include <signal.h>
25#include <stdio.h>
26#include <stddef.h>
27#include <string.h>
28#include <sys/prctl.h>
29#include <unistd.h>
30
31#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
32#define syscall_nr (offsetof(struct seccomp_data, nr))
33
34#if defined(__i386__)
35#define REG_RESULT REG_EAX
36#define REG_SYSCALL REG_EAX
37#define REG_ARG0 REG_EBX
38#define REG_ARG1 REG_ECX
39#define REG_ARG2 REG_EDX
40#define REG_ARG3 REG_ESI
41#define REG_ARG4 REG_EDI
42#define REG_ARG5 REG_EBP
43#elif defined(__x86_64__)
44#define REG_RESULT REG_RAX
45#define REG_SYSCALL REG_RAX
46#define REG_ARG0 REG_RDI
47#define REG_ARG1 REG_RSI
48#define REG_ARG2 REG_RDX
49#define REG_ARG3 REG_R10
50#define REG_ARG4 REG_R8
51#define REG_ARG5 REG_R9
52#endif
53
54#ifndef PR_SET_NO_NEW_PRIVS
55#define PR_SET_NO_NEW_PRIVS 38
56#endif
57
58#ifndef SYS_SECCOMP
59#define SYS_SECCOMP 1
60#endif
61
62static void emulator(int nr, siginfo_t *info, void *void_context)
63{
64 ucontext_t *ctx = (ucontext_t *)(void_context);
65 int syscall;
66 char *buf;
67 ssize_t bytes;
68 size_t len;
69 if (info->si_code != SYS_SECCOMP)
70 return;
71 if (!ctx)
72 return;
73 syscall = ctx->uc_mcontext.gregs[REG_SYSCALL];
74 buf = (char *) ctx->uc_mcontext.gregs[REG_ARG1];
75 len = (size_t) ctx->uc_mcontext.gregs[REG_ARG2];
76
77 if (syscall != __NR_write)
78 return;
79 if (ctx->uc_mcontext.gregs[REG_ARG0] != STDERR_FILENO)
80 return;
81 /* Redirect stderr messages to stdout. Doesn't handle EINTR, etc */
82 ctx->uc_mcontext.gregs[REG_RESULT] = -1;
83 if (write(STDOUT_FILENO, "[ERR] ", 6) > 0) {
84 bytes = write(STDOUT_FILENO, buf, len);
85 ctx->uc_mcontext.gregs[REG_RESULT] = bytes;
86 }
87 return;
88}
89
90static int install_emulator(void)
91{
92 struct sigaction act;
93 sigset_t mask;
94 memset(&act, 0, sizeof(act));
95 sigemptyset(&mask);
96 sigaddset(&mask, SIGSYS);
97
98 act.sa_sigaction = &emulator;
99 act.sa_flags = SA_SIGINFO;
100 if (sigaction(SIGSYS, &act, NULL) < 0) {
101 perror("sigaction");
102 return -1;
103 }
104 if (sigprocmask(SIG_UNBLOCK, &mask, NULL)) {
105 perror("sigprocmask");
106 return -1;
107 }
108 return 0;
109}
110
111static int install_filter(void)
112{
113 struct sock_filter filter[] = {
114 /* Grab the system call number */
115 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_nr),
116 /* Jump table for the allowed syscalls */
117 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_rt_sigreturn, 0, 1),
118 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
119#ifdef __NR_sigreturn
120 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_sigreturn, 0, 1),
121 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
122#endif
123 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit_group, 0, 1),
124 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
125 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit, 0, 1),
126 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
127 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_read, 1, 0),
128 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_write, 3, 2),
129
130 /* Check that read is only using stdin. */
131 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)),
132 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 4, 0),
133 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
134
135 /* Check that write is only using stdout */
136 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)),
137 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDOUT_FILENO, 1, 0),
138 /* Trap attempts to write to stderr */
139 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDERR_FILENO, 1, 2),
140
141 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
142 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRAP),
143 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
144 };
145 struct sock_fprog prog = {
146 .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
147 .filter = filter,
148 };
149
150 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
151 perror("prctl(NO_NEW_PRIVS)");
152 return 1;
153 }
154
155
156 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
157 perror("prctl");
158 return 1;
159 }
160 return 0;
161}
162
163#define payload(_c) (_c), sizeof((_c))
164int main(int argc, char **argv)
165{
166 char buf[4096];
167 ssize_t bytes = 0;
168 if (install_emulator())
169 return 1;
170 if (install_filter())
171 return 1;
172 syscall(__NR_write, STDOUT_FILENO,
173 payload("OHAI! WHAT IS YOUR NAME? "));
174 bytes = syscall(__NR_read, STDIN_FILENO, buf, sizeof(buf));
175 syscall(__NR_write, STDOUT_FILENO, payload("HELLO, "));
176 syscall(__NR_write, STDOUT_FILENO, buf, bytes);
177 syscall(__NR_write, STDERR_FILENO,
178 payload("Error message going to STDERR\n"));
179 return 0;
180}
181#else /* SUPPORTED_ARCH */
182/*
183 * This sample is x86-only. Since kernel samples are compiled with the
184 * host toolchain, a non-x86 host will result in using only the main()
185 * below.
186 */
187int main(void)
188{
189 return 1;
190}
191#endif /* SUPPORTED_ARCH */
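
In the filter above, the jt/jf fields of BPF_JUMP are forward skip counts, not absolute targets, which is why the allow/trap/kill returns have to be ordered so carefully. A minimal standalone sketch with the skips annotated by instruction index (deny_one and the choice of __NR_uname are hypothetical):

#include <stddef.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <linux/unistd.h>
#include <sys/prctl.h>

static int deny_one(int nr)
{
	struct sock_filter filter[] = {
		/* [0] A = syscall number */
		BPF_STMT(BPF_LD + BPF_W + BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* [1] if (A == nr) skip 0 -> [2]; else skip 1 -> [3] */
		BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, nr, 0, 1),
		/* [2] */ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_TRAP),
		/* [3] */ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return -1;
	return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}

int main(void)
{
	return deny_one(__NR_uname) ? 1 : 0;
}
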
diff --git a/samples/seccomp/bpf-fancy.c b/samples/seccomp/bpf-fancy.c
new file mode 100644
index 000000000..1ccb43502
--- /dev/null
+++ b/samples/seccomp/bpf-fancy.c
@@ -0,0 +1,105 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Seccomp BPF example using a macro-based generator.
4 *
5 * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
6 * Author: Will Drewry <wad@chromium.org>
7 *
8 * The code may be used by anyone for any purpose,
9 * and can serve as a starting point for developing
10 * applications using prctl(PR_ATTACH_SECCOMP_FILTER).
11 */
12
13#include <linux/filter.h>
14#include <linux/seccomp.h>
15#include <linux/unistd.h>
16#include <stdio.h>
17#include <string.h>
18#include <sys/prctl.h>
19#include <unistd.h>
20
21#include "bpf-helper.h"
22
23#ifndef PR_SET_NO_NEW_PRIVS
24#define PR_SET_NO_NEW_PRIVS 38
25#endif
26
27int main(int argc, char **argv)
28{
29 struct bpf_labels l = {
30 .count = 0,
31 };
32 static const char msg1[] = "Please type something: ";
33 static const char msg2[] = "You typed: ";
34 char buf[256];
35 struct sock_filter filter[] = {
36 /* TODO: LOAD_SYSCALL_NR(arch) and enforce an arch */
37 LOAD_SYSCALL_NR,
38 SYSCALL(__NR_exit, ALLOW),
39 SYSCALL(__NR_exit_group, ALLOW),
40 SYSCALL(__NR_write, JUMP(&l, write_fd)),
41 SYSCALL(__NR_read, JUMP(&l, read)),
42 DENY, /* Don't passthrough into a label */
43
44 LABEL(&l, read),
45 ARG(0),
46 JNE(STDIN_FILENO, DENY),
47 ARG(1),
48 JNE((unsigned long)buf, DENY),
49 ARG(2),
50 JGE(sizeof(buf), DENY),
51 ALLOW,
52
53 LABEL(&l, write_fd),
54 ARG(0),
55 JEQ(STDOUT_FILENO, JUMP(&l, write_buf)),
56 JEQ(STDERR_FILENO, JUMP(&l, write_buf)),
57 DENY,
58
59 LABEL(&l, write_buf),
60 ARG(1),
61 JEQ((unsigned long)msg1, JUMP(&l, msg1_len)),
62 JEQ((unsigned long)msg2, JUMP(&l, msg2_len)),
63 JEQ((unsigned long)buf, JUMP(&l, buf_len)),
64 DENY,
65
66 LABEL(&l, msg1_len),
67 ARG(2),
68 JLT(sizeof(msg1), ALLOW),
69 DENY,
70
71 LABEL(&l, msg2_len),
72 ARG(2),
73 JLT(sizeof(msg2), ALLOW),
74 DENY,
75
76 LABEL(&l, buf_len),
77 ARG(2),
78 JLT(sizeof(buf), ALLOW),
79 DENY,
80 };
81 struct sock_fprog prog = {
82 .filter = filter,
83 .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
84 };
85 ssize_t bytes;
86 bpf_resolve_jumps(&l, filter, sizeof(filter)/sizeof(*filter));
87
88 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
89 perror("prctl(NO_NEW_PRIVS)");
90 return 1;
91 }
92
93 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
94 perror("prctl(SECCOMP)");
95 return 1;
96 }
97 syscall(__NR_write, STDOUT_FILENO, msg1, strlen(msg1));
98 bytes = syscall(__NR_read, STDIN_FILENO, buf, sizeof(buf)-1);
99 bytes = (bytes > 0 ? bytes : 0);
100 syscall(__NR_write, STDERR_FILENO, msg2, strlen(msg2));
101 syscall(__NR_write, STDERR_FILENO, buf, bytes);
102 /* Now get killed */
103 syscall(__NR_write, STDERR_FILENO, msg2, strlen(msg2)+2);
104 return 0;
105}
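
What the helper macros expand to is easier to see by hand-expanding a fragment of the filter above. SYSCALL(nr, jt) is a JEQ followed by jt; JUMP() and LABEL() emit BPF_JA instructions whose jt/jf carry placeholder bytes (0xff and 0xfe) until bpf_resolve_jumps() rewrites k with the real forward distance. A sketch of the expansion, assuming label "wr" was assigned id 0:

#include <linux/filter.h>
#include <linux/seccomp.h>
#include <linux/unistd.h>

static struct sock_filter expanded[] = {
	/* SYSCALL(__NR_exit, ALLOW) */
	BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_exit, 0, 1),
	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW),
	/* SYSCALL(__NR_write, JUMP(&l, wr)) */
	BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, __NR_write, 0, 1),
	BPF_JUMP(BPF_JMP + BPF_JA, 0, 0xff, 0xff),	/* JUMP: k = label id */
	/* DENY */
	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_KILL),
	/* LABEL(&l, wr) */
	BPF_JUMP(BPF_JMP + BPF_JA, 0, 0xfe, 0xfe),	/* LABEL: same id */
	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW),
};

int main(void)
{
	/* 7 instructions, matching the expansion above */
	return sizeof(expanded) / sizeof(expanded[0]) == 7 ? 0 : 1;
}
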
diff --git a/samples/seccomp/bpf-helper.c b/samples/seccomp/bpf-helper.c
new file mode 100644
index 000000000..ae260d77a
--- /dev/null
+++ b/samples/seccomp/bpf-helper.c
@@ -0,0 +1,96 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Seccomp BPF helper functions
4 *
5 * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
6 * Author: Will Drewry <wad@chromium.org>
7 *
8 * The code may be used by anyone for any purpose,
9 * and can serve as a starting point for developing
10 * applications using prctl(PR_ATTACH_SECCOMP_FILTER).
11 */
12
13#include <stdio.h>
14#include <stdlib.h>
15#include <string.h>
16
17#include "bpf-helper.h"
18
19int bpf_resolve_jumps(struct bpf_labels *labels,
20 struct sock_filter *filter, size_t count)
21{
22 size_t i;
23
24 if (count < 1 || count > BPF_MAXINSNS)
25 return -1;
26 /*
27 * Walk it once, backwards, to build the label table and do fixups.
28 * Since backward jumps are disallowed by BPF, this is easy.
29 */
30 for (i = 0; i < count; ++i) {
31 size_t offset = count - i - 1;
32 struct sock_filter *instr = &filter[offset];
33 if (instr->code != (BPF_JMP+BPF_JA))
34 continue;
35 switch ((instr->jt<<8)|instr->jf) {
36 case (JUMP_JT<<8)|JUMP_JF:
37 if (labels->labels[instr->k].location == 0xffffffff) {
38 fprintf(stderr, "Unresolved label: '%s'\n",
39 labels->labels[instr->k].label);
40 return 1;
41 }
42 instr->k = labels->labels[instr->k].location -
43 (offset + 1);
44 instr->jt = 0;
45 instr->jf = 0;
46 continue;
47 case (LABEL_JT<<8)|LABEL_JF:
48 if (labels->labels[instr->k].location != 0xffffffff) {
49 fprintf(stderr, "Duplicate label use: '%s'\n",
50 labels->labels[instr->k].label);
51 return 1;
52 }
53 labels->labels[instr->k].location = offset;
54 instr->k = 0; /* fall through */
55 instr->jt = 0;
56 instr->jf = 0;
57 continue;
58 }
59 }
60 return 0;
61}
62
63/* Simple lookup table for labels. */
64__u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label)
65{
66 struct __bpf_label *begin = labels->labels, *end;
67 int id;
68
69 if (labels->count == BPF_LABELS_MAX) {
70 fprintf(stderr, "Too many labels\n");
71 exit(1);
72 }
73 if (labels->count == 0) {
74 begin->label = label;
75 begin->location = 0xffffffff;
76 labels->count++;
77 return 0;
78 }
79 end = begin + labels->count;
80 for (id = 0; begin < end; ++begin, ++id) {
81 if (!strcmp(label, begin->label))
82 return id;
83 }
84 begin->label = label;
85 begin->location = 0xffffffff;
86 labels->count++;
87 return id;
88}
89
90void seccomp_bpf_print(struct sock_filter *filter, size_t count)
91{
92 struct sock_filter *end = filter + count;
93 for ( ; filter < end; ++filter)
94 printf("{ code=%u,jt=%u,jf=%u,k=%u },\n",
95 filter->code, filter->jt, filter->jf, filter->k);
96}
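
The fixup arithmetic in bpf_resolve_jumps() converts a label's absolute instruction index into the relative skip a BPF_JA needs: k = location - (offset + 1), because the program counter already points one past the jump when the offset is applied. A tiny self-checking sketch of just that arithmetic (ja_offset is a hypothetical helper name):

#include <assert.h>

static unsigned int ja_offset(unsigned int jump_idx, unsigned int label_idx)
{
	/* Mirrors the k fixup in bpf_resolve_jumps(). */
	return label_idx - (jump_idx + 1);
}

int main(void)
{
	/* A JUMP at instruction 3 targeting a LABEL at instruction 7
	 * must skip the 3 instructions in between. */
	assert(ja_offset(3, 7) == 3);
	return 0;
}
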
diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h
new file mode 100644
index 000000000..0cc9816fe
--- /dev/null
+++ b/samples/seccomp/bpf-helper.h
@@ -0,0 +1,263 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Example wrapper around BPF macros.
4 *
5 * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
6 * Author: Will Drewry <wad@chromium.org>
7 *
8 * The code may be used by anyone for any purpose,
9 * and can serve as a starting point for developing
10 * applications using prctl(PR_SET_SECCOMP, 2, ...).
11 *
12 * No guarantees are provided with respect to the correctness
13 * or functionality of this code.
14 */
15#ifndef __BPF_HELPER_H__
16#define __BPF_HELPER_H__
17
18#include <asm/bitsperlong.h> /* for __BITS_PER_LONG */
19#include <endian.h>
20#include <linux/filter.h>
21#include <linux/seccomp.h> /* for seccomp_data */
22#include <linux/types.h>
23#include <linux/unistd.h>
24#include <stddef.h>
25
26#define BPF_LABELS_MAX 256
27struct bpf_labels {
28 int count;
29 struct __bpf_label {
30 const char *label;
31 __u32 location;
32 } labels[BPF_LABELS_MAX];
33};
34
35int bpf_resolve_jumps(struct bpf_labels *labels,
36 struct sock_filter *filter, size_t count);
37__u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label);
38void seccomp_bpf_print(struct sock_filter *filter, size_t count);
39
40#define JUMP_JT 0xff
41#define JUMP_JF 0xff
42#define LABEL_JT 0xfe
43#define LABEL_JF 0xfe
44
45#define ALLOW \
46 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
47#define DENY \
48 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
49#define JUMP(labels, label) \
50 BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \
51 JUMP_JT, JUMP_JF)
52#define LABEL(labels, label) \
53 BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \
54 LABEL_JT, LABEL_JF)
55#define SYSCALL(nr, jt) \
56 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (nr), 0, 1), \
57 jt
58
59/* Lame, but just an example */
60#define FIND_LABEL(labels, label) seccomp_bpf_label((labels), #label)
61
62#define EXPAND(...) __VA_ARGS__
63
64/* Ensure that we load the logically correct offset. */
65#if __BYTE_ORDER == __LITTLE_ENDIAN
66#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
67#elif __BYTE_ORDER == __BIG_ENDIAN
68#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32)
69#else
70#error "Unknown endianness"
71#endif
72
73/* Map all width-sensitive operations */
74#if __BITS_PER_LONG == 32
75
76#define JEQ(x, jt) JEQ32(x, EXPAND(jt))
77#define JNE(x, jt) JNE32(x, EXPAND(jt))
78#define JGT(x, jt) JGT32(x, EXPAND(jt))
79#define JLT(x, jt) JLT32(x, EXPAND(jt))
80#define JGE(x, jt) JGE32(x, EXPAND(jt))
81#define JLE(x, jt) JLE32(x, EXPAND(jt))
82#define JA(x, jt) JA32(x, EXPAND(jt))
83#define ARG(i) ARG_32(i)
84
85#elif __BITS_PER_LONG == 64
86
87/* Ensure that we load the logically correct offset. */
88#if __BYTE_ORDER == __LITTLE_ENDIAN
89#define ENDIAN(_lo, _hi) _lo, _hi
90#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32)
91#elif __BYTE_ORDER == __BIG_ENDIAN
92#define ENDIAN(_lo, _hi) _hi, _lo
93#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
94#endif
95
96union arg64 {
97 struct {
98 __u32 ENDIAN(lo32, hi32);
99 };
100 __u64 u64;
101};
102
103#define JEQ(x, jt) \
104 JEQ64(((union arg64){.u64 = (x)}).lo32, \
105 ((union arg64){.u64 = (x)}).hi32, \
106 EXPAND(jt))
107#define JGT(x, jt) \
108 JGT64(((union arg64){.u64 = (x)}).lo32, \
109 ((union arg64){.u64 = (x)}).hi32, \
110 EXPAND(jt))
111#define JGE(x, jt) \
112 JGE64(((union arg64){.u64 = (x)}).lo32, \
113 ((union arg64){.u64 = (x)}).hi32, \
114 EXPAND(jt))
115#define JNE(x, jt) \
116 JNE64(((union arg64){.u64 = (x)}).lo32, \
117 ((union arg64){.u64 = (x)}).hi32, \
118 EXPAND(jt))
119#define JLT(x, jt) \
120 JLT64(((union arg64){.u64 = (x)}).lo32, \
121 ((union arg64){.u64 = (x)}).hi32, \
122 EXPAND(jt))
123#define JLE(x, jt) \
124 JLE64(((union arg64){.u64 = (x)}).lo32, \
125 ((union arg64){.u64 = (x)}).hi32, \
126 EXPAND(jt))
127
128#define JA(x, jt) \
129 JA64(((union arg64){.u64 = (x)}).lo32, \
130 ((union arg64){.u64 = (x)}).hi32, \
131 EXPAND(jt))
132#define ARG(i) ARG_64(i)
133
134#else
135#error __BITS_PER_LONG value unusable.
136#endif
137
138/* Loads the arg into A */
139#define ARG_32(idx) \
140 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx))
141
142/* Loads lo into M[0] and hi into M[1] and A */
143#define ARG_64(idx) \
144 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx)), \
145 BPF_STMT(BPF_ST, 0), /* lo -> M[0] */ \
146 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, HI_ARG(idx)), \
147 BPF_STMT(BPF_ST, 1) /* hi -> M[1] */
148
149#define JEQ32(value, jt) \
150 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 0, 1), \
151 jt
152
153#define JNE32(value, jt) \
154 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 1, 0), \
155 jt
156
157#define JA32(value, jt) \
158 BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (value), 0, 1), \
159 jt
160
161#define JGE32(value, jt) \
162 BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 0, 1), \
163 jt
164
165#define JGT32(value, jt) \
166 BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 0, 1), \
167 jt
168
169#define JLE32(value, jt) \
170 BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 1, 0), \
171 jt
172
173#define JLT32(value, jt) \
174 BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 1, 0), \
175 jt
176
177/*
178 * All the JXX64 checks assume lo is saved in M[0] and hi is saved in both
179 * A and M[1]. This invariant is kept by restoring A if necessary.
180 */
181#define JEQ64(lo, hi, jt) \
182 /* if (hi != arg.hi) goto NOMATCH; */ \
183 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
184 BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
185 /* if (lo != arg.lo) goto NOMATCH; */ \
186 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 0, 2), \
187 BPF_STMT(BPF_LD+BPF_MEM, 1), \
188 jt, \
189 BPF_STMT(BPF_LD+BPF_MEM, 1)
190
191#define JNE64(lo, hi, jt) \
192 /* if (hi != arg.hi) goto MATCH; */ \
193 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 3), \
194 BPF_STMT(BPF_LD+BPF_MEM, 0), \
195 /* if (lo != arg.lo) goto MATCH; */ \
196 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 2, 0), \
197 BPF_STMT(BPF_LD+BPF_MEM, 1), \
198 jt, \
199 BPF_STMT(BPF_LD+BPF_MEM, 1)
200
201#define JA64(lo, hi, jt) \
202 /* if (hi & arg.hi) goto MATCH; */ \
203 BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (hi), 3, 0), \
204 BPF_STMT(BPF_LD+BPF_MEM, 0), \
205 /* if (lo & arg.lo) goto MATCH; */ \
206 BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (lo), 0, 2), \
207 BPF_STMT(BPF_LD+BPF_MEM, 1), \
208 jt, \
209 BPF_STMT(BPF_LD+BPF_MEM, 1)
210
211#define JGE64(lo, hi, jt) \
212 /* if (hi > arg.hi) goto MATCH; */ \
213 BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
214 /* if (hi != arg.hi) goto NOMATCH; */ \
215 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
216 BPF_STMT(BPF_LD+BPF_MEM, 0), \
217 /* if (lo >= arg.lo) goto MATCH; */ \
218 BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 0, 2), \
219 BPF_STMT(BPF_LD+BPF_MEM, 1), \
220 jt, \
221 BPF_STMT(BPF_LD+BPF_MEM, 1)
222
223#define JGT64(lo, hi, jt) \
224 /* if (hi > arg.hi) goto MATCH; */ \
225 BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
226 /* if (hi != arg.hi) goto NOMATCH; */ \
227 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
228 BPF_STMT(BPF_LD+BPF_MEM, 0), \
229 /* if (lo > arg.lo) goto MATCH; */ \
230 BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 0, 2), \
231 BPF_STMT(BPF_LD+BPF_MEM, 1), \
232 jt, \
233 BPF_STMT(BPF_LD+BPF_MEM, 1)
234
235#define JLE64(lo, hi, jt) \
236 /* if (hi < arg.hi) goto MATCH; */ \
237 BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \
238 /* if (hi != arg.hi) goto NOMATCH; */ \
239 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
240 BPF_STMT(BPF_LD+BPF_MEM, 0), \
241 /* if (lo <= arg.lo) goto MATCH; */ \
242 BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 2, 0), \
243 BPF_STMT(BPF_LD+BPF_MEM, 1), \
244 jt, \
245 BPF_STMT(BPF_LD+BPF_MEM, 1)
246
247#define JLT64(lo, hi, jt) \
248 /* if (hi < arg.hi) goto MATCH; */ \
249 BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \
250 /* if (hi != arg.hi) goto NOMATCH; */ \
251 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
252 BPF_STMT(BPF_LD+BPF_MEM, 0), \
253 /* if (lo < arg.lo) goto MATCH; */ \
254 BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 2, 0), \
255 BPF_STMT(BPF_LD+BPF_MEM, 1), \
256 jt, \
257 BPF_STMT(BPF_LD+BPF_MEM, 1)
258
259#define LOAD_SYSCALL_NR \
260 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
261 offsetof(struct seccomp_data, nr))
262
263#endif /* __BPF_HELPER_H__ */
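
On 64-bit builds the JXX64 macros compare a 64-bit syscall argument as two 32-bit halves, hi word first, and union arg64 is what splits the constant at compile time. A userspace sketch showing the split (output assumes a little-endian host):

#include <stdio.h>

union arg64 {
	struct { unsigned int lo32, hi32; };	/* little-endian layout */
	unsigned long long u64;
};

int main(void)
{
	union arg64 v = { .u64 = 0x123456789abcdef0ULL };

	/* Prints lo=0x9abcdef0 hi=0x12345678 on little-endian hosts;
	 * these are the (lo, hi) immediates the JXX64 macros emit. */
	printf("lo=0x%x hi=0x%x\n", v.lo32, v.hi32);
	return 0;
}
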
diff --git a/samples/seccomp/dropper.c b/samples/seccomp/dropper.c
new file mode 100644
index 000000000..cc0648eb3
--- /dev/null
+++ b/samples/seccomp/dropper.c
@@ -0,0 +1,72 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Naive system call dropper built on seccomp_filter.
4 *
5 * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
6 * Author: Will Drewry <wad@chromium.org>
7 *
8 * The code may be used by anyone for any purpose,
9 * and can serve as a starting point for developing
10 * applications using prctl(PR_SET_SECCOMP, 2, ...).
11 *
12 * When run, returns the specified errno for the specified
13 * system call number against the given architecture.
14 *
15 */
16
17#include <errno.h>
18#include <linux/audit.h>
19#include <linux/filter.h>
20#include <linux/seccomp.h>
21#include <linux/unistd.h>
22#include <stdio.h>
23#include <stddef.h>
24#include <stdlib.h>
25#include <sys/prctl.h>
26#include <unistd.h>
27
28static int install_filter(int nr, int arch, int error)
29{
30 struct sock_filter filter[] = {
31 BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
32 (offsetof(struct seccomp_data, arch))),
33 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, arch, 0, 3),
34 BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
35 (offsetof(struct seccomp_data, nr))),
36 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
37 BPF_STMT(BPF_RET+BPF_K,
38 SECCOMP_RET_ERRNO|(error & SECCOMP_RET_DATA)),
39 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
40 };
41 struct sock_fprog prog = {
42 .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
43 .filter = filter,
44 };
45 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
46 perror("prctl(NO_NEW_PRIVS)");
47 return 1;
48 }
49 if (prctl(PR_SET_SECCOMP, 2, &prog)) {
50 perror("prctl(PR_SET_SECCOMP)");
51 return 1;
52 }
53 return 0;
54}
55
56int main(int argc, char **argv)
57{
58 if (argc < 5) {
59 fprintf(stderr, "Usage:\n"
60 "dropper <syscall_nr> <arch> <errno> <prog> [<args>]\n"
61 "Hint: AUDIT_ARCH_I386: 0x%X\n"
62 " AUDIT_ARCH_X86_64: 0x%X\n"
63 "\n", AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
64 return 1;
65 }
66 if (install_filter(strtol(argv[1], NULL, 0), strtol(argv[2], NULL, 0),
67 strtol(argv[3], NULL, 0)))
68 return 1;
69 execv(argv[4], &argv[4]);
70 printf("Failed to execv\n");
71 return 255;
72}
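
The dropper masks its errno with SECCOMP_RET_DATA because a filter's 32-bit return value packs both an action and 16 bits of data: the action bits select SECCOMP_RET_ERRNO, the data bits carry the errno handed back to the caller. A small sketch decomposing such a value (errno 1, i.e. EPERM, chosen arbitrarily):

#include <stdio.h>
#include <linux/seccomp.h>

int main(void)
{
	unsigned int ret = SECCOMP_RET_ERRNO | (1 & SECCOMP_RET_DATA);

	printf("action=0x%x errno=%u\n",
	       ret & SECCOMP_RET_ACTION, ret & SECCOMP_RET_DATA);
	return 0;
}
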
diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c
new file mode 100644
index 000000000..20291ec64
--- /dev/null
+++ b/samples/seccomp/user-trap.c
@@ -0,0 +1,375 @@
1#include <signal.h>
2#include <stdio.h>
3#include <stdlib.h>
4#include <unistd.h>
5#include <errno.h>
6#include <fcntl.h>
7#include <string.h>
8#include <stddef.h>
9#include <sys/sysmacros.h>
10#include <sys/types.h>
11#include <sys/wait.h>
12#include <sys/socket.h>
13#include <sys/stat.h>
14#include <sys/mman.h>
15#include <sys/syscall.h>
16#include <sys/user.h>
17#include <sys/ioctl.h>
18#include <sys/ptrace.h>
19#include <sys/mount.h>
20#include <linux/limits.h>
21#include <linux/filter.h>
22#include <linux/seccomp.h>
23
24#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
25
26static int seccomp(unsigned int op, unsigned int flags, void *args)
27{
28 errno = 0;
29 return syscall(__NR_seccomp, op, flags, args);
30}
31
32static int send_fd(int sock, int fd)
33{
34 struct msghdr msg = {};
35 struct cmsghdr *cmsg;
36 char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
37 struct iovec io = {
38 .iov_base = &c,
39 .iov_len = 1,
40 };
41
42 msg.msg_iov = &io;
43 msg.msg_iovlen = 1;
44 msg.msg_control = buf;
45 msg.msg_controllen = sizeof(buf);
46 cmsg = CMSG_FIRSTHDR(&msg);
47 cmsg->cmsg_level = SOL_SOCKET;
48 cmsg->cmsg_type = SCM_RIGHTS;
49 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
50 *((int *)CMSG_DATA(cmsg)) = fd;
51 msg.msg_controllen = cmsg->cmsg_len;
52
53 if (sendmsg(sock, &msg, 0) < 0) {
54 perror("sendmsg");
55 return -1;
56 }
57
58 return 0;
59}
60
61static int recv_fd(int sock)
62{
63 struct msghdr msg = {};
64 struct cmsghdr *cmsg;
65 char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
66 struct iovec io = {
67 .iov_base = &c,
68 .iov_len = 1,
69 };
70
71 msg.msg_iov = &io;
72 msg.msg_iovlen = 1;
73 msg.msg_control = buf;
74 msg.msg_controllen = sizeof(buf);
75
76 if (recvmsg(sock, &msg, 0) < 0) {
77 perror("recvmsg");
78 return -1;
79 }
80
81 cmsg = CMSG_FIRSTHDR(&msg);
82
83 return *((int *)CMSG_DATA(cmsg));
84}
85
86static int user_trap_syscall(int nr, unsigned int flags)
87{
88 struct sock_filter filter[] = {
89 BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
90 offsetof(struct seccomp_data, nr)),
91 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
92 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
93 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
94 };
95
96 struct sock_fprog prog = {
97 .len = (unsigned short)ARRAY_SIZE(filter),
98 .filter = filter,
99 };
100
101 return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
102}
103
104static int handle_req(struct seccomp_notif *req,
105 struct seccomp_notif_resp *resp, int listener)
106{
107 char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX];
108 int ret = -1, mem;
109
110 resp->id = req->id;
111 resp->error = -EPERM;
112 resp->val = 0;
113
114 if (req->data.nr != __NR_mount) {
115 fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr);
116 return -1;
117 }
118
119 /* Only allow bind mounts. */
120 if (!(req->data.args[3] & MS_BIND))
121 return 0;
122
123 /*
124 * Ok, let's read the task's memory to see where they wanted their
125 * mount to go.
126 */
127 snprintf(path, sizeof(path), "/proc/%d/mem", req->pid);
128 mem = open(path, O_RDONLY);
129 if (mem < 0) {
130 perror("open mem");
131 return -1;
132 }
133
134 /*
135 * Now we avoid a TOCTOU: we referred to a task by its pid, but the
136 * task that made the syscall may have died, so we need to confirm that
137 * the pid is still valid after we open its /proc/pid/mem file. We can
138 * ask the listener fd this as follows.
139 *
140 * Note that this check should occur *after* any task-specific
141 * resources are opened, to make sure that the task has not died and
142 * we're not wrongly reading someone else's state in order to make
143 * decisions.
144 */
145 if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
146 fprintf(stderr, "task died before we could map its memory\n");
147 goto out;
148 }
149
150 /*
151 * Phew, we've got the right /proc/pid/mem. Now we can read it. Note
152 * that to avoid another TOCTOU, we should read all of the pointer args
153 * before we decide to allow the syscall.
154 */
155 if (lseek(mem, req->data.args[0], SEEK_SET) < 0) {
156 perror("seek");
157 goto out;
158 }
159
160 ret = read(mem, source, sizeof(source));
161 if (ret < 0) {
162 perror("read");
163 goto out;
164 }
165
166 if (lseek(mem, req->data.args[1], SEEK_SET) < 0) {
167 perror("seek");
168 goto out;
169 }
170
171 ret = read(mem, target, sizeof(target));
172 if (ret < 0) {
173 perror("read");
174 goto out;
175 }
176
177 /*
178 * Our policy is to only allow bind mounts inside /tmp. This isn't very
179 * interesting, because we could do unprivileged bind mounts with user
180 * namespaces already, but you get the idea.
181 */
182 if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) {
183 if (mount(source, target, NULL, req->data.args[3], NULL) < 0) {
184 ret = -1;
185 perror("actual mount");
186 goto out;
187 }
188 resp->error = 0;
189 }
190
191 /* Even if we didn't allow it because of policy, generating the
192 * response is still a success, because we want to tell the worker EPERM.
193 */
194 ret = 0;
195
196out:
197 close(mem);
198 return ret;
199}
200
201int main(void)
202{
203 int sk_pair[2], ret = 1, status, listener;
204 pid_t worker = 0, tracer = 0;
205
206 if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
207 perror("socketpair");
208 return 1;
209 }
210
211 worker = fork();
212 if (worker < 0) {
213 perror("fork");
214 goto close_pair;
215 }
216
217 if (worker == 0) {
218 listener = user_trap_syscall(__NR_mount,
219 SECCOMP_FILTER_FLAG_NEW_LISTENER);
220 if (listener < 0) {
221 perror("seccomp");
222 exit(1);
223 }
224
225 /*
226 * Drop privileges. We definitely can't mount as uid 1000.
227 */
228 if (setuid(1000) < 0) {
229 perror("setuid");
230 exit(1);
231 }
232
233 /*
234 * Send the listener to the parent; also serves as
235 * synchronization.
236 */
237 if (send_fd(sk_pair[1], listener) < 0)
238 exit(1);
239 close(listener);
240
241 if (mkdir("/tmp/foo", 0755) < 0) {
242 perror("mkdir");
243 exit(1);
244 }
245
246 /*
247 * Try a bad mount just for grins.
248 */
249 if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) {
250 fprintf(stderr, "huh? mounted /dev/sda?\n");
251 exit(1);
252 }
253
254 if (errno != EPERM) {
255 perror("bad error from mount");
256 exit(1);
257 }
258
259 /*
260 * Ok, we expect this one to succeed.
261 */
262 if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) {
263 perror("mount");
264 exit(1);
265 }
266
267 exit(0);
268 }
269
270 /*
271 * Get the listener from the child.
272 */
273 listener = recv_fd(sk_pair[0]);
274 if (listener < 0)
275 goto out_kill;
276
277 /*
278 * Fork a task to handle the requests. This isn't strictly necessary,
279 * but it makes this sample easier to write, since we can just
280 * wait for the tracee to exit and then kill the tracer.
281 */
282 tracer = fork();
283 if (tracer < 0) {
284 perror("fork");
285 goto out_kill;
286 }
287
288 if (tracer == 0) {
289 struct seccomp_notif *req;
290 struct seccomp_notif_resp *resp;
291 struct seccomp_notif_sizes sizes;
292
293 if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
294 perror("seccomp(GET_NOTIF_SIZES)");
295 goto out_close;
296 }
297
298 req = malloc(sizes.seccomp_notif);
299 if (!req)
300 goto out_close;
301
302 resp = malloc(sizes.seccomp_notif_resp);
303 if (!resp)
304 goto out_req;
305 memset(resp, 0, sizes.seccomp_notif_resp);
306
307 while (1) {
308 memset(req, 0, sizes.seccomp_notif);
309 if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
310 perror("ioctl recv");
311 goto out_resp;
312 }
313
314 if (handle_req(req, resp, listener) < 0)
315 goto out_resp;
316
317 /*
318 * ENOENT here means that the task may have gotten a
319 * signal and restarted the syscall. It's up to the
320 * handler to decide what to do in this case, but for
321 * the sample code, we just ignore it. A real handler
322 * would probably do something better, like undoing the
323 * mount, or keeping track of the args to make sure we
324 * don't do it again.
325 */
326 if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
327 errno != ENOENT) {
328 perror("ioctl send");
329 goto out_resp;
330 }
331 }
332out_resp:
333 free(resp);
334out_req:
335 free(req);
336out_close:
337 close(listener);
338 exit(1);
339 }
340
341 close(listener);
342
343 if (waitpid(worker, &status, 0) != worker) {
344 perror("waitpid");
345 goto out_kill;
346 }
347
348 if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
349 perror("umount2");
350 goto out_kill;
351 }
352
353 if (remove("/tmp/foo") < 0 && errno != ENOENT) {
354 perror("remove");
355 exit(1);
356 }
357
358 if (!WIFEXITED(status) || WEXITSTATUS(status)) {
359 fprintf(stderr, "worker exited nonzero\n");
360 goto out_kill;
361 }
362
363 ret = 0;
364
365out_kill:
366 if (tracer > 0)
367 kill(tracer, SIGKILL);
368 if (worker > 0)
369 kill(worker, SIGKILL);
370
371close_pair:
372 close(sk_pair[0]);
373 close(sk_pair[1]);
374 return ret;
375}
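
The ordering in handle_req() above — open the per-task resource first, validate the notification id second — is the generic recipe for avoiding the pid-reuse TOCTOU. A condensed sketch of that check as a standalone helper (open_task_mem_checked is a hypothetical name):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/seccomp.h>

/* Open a per-task resource, then confirm the notification is still
 * alive; only then can the fd be trusted to belong to the blocked
 * task. Returns the fd, or -1 if the task went away. */
static int open_task_mem_checked(int listener, __u64 id, const char *path)
{
	int mem = open(path, O_RDONLY);

	if (mem < 0)
		return -1;
	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &id) < 0) {
		close(mem);	/* task died; the pid may be recycled */
		return -1;
	}
	return mem;
}
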
diff --git a/samples/timers/.gitignore b/samples/timers/.gitignore
new file mode 100644
index 000000000..40510c33c
--- /dev/null
+++ b/samples/timers/.gitignore
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0-only
2hpet_example
diff --git a/samples/timers/Makefile b/samples/timers/Makefile
new file mode 100644
index 000000000..e6836cdea
--- /dev/null
+++ b/samples/timers/Makefile
@@ -0,0 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0
2userprogs-always-y += hpet_example
3
4userccflags += -I usr/include
diff --git a/samples/timers/hpet_example.c b/samples/timers/hpet_example.c
new file mode 100644
index 000000000..f1cb622f6
--- /dev/null
+++ b/samples/timers/hpet_example.c
@@ -0,0 +1,295 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <stdio.h>
3#include <stdlib.h>
4#include <unistd.h>
5#include <fcntl.h>
6#include <string.h>
7#include <memory.h>
8#include <malloc.h>
9#include <time.h>
10#include <ctype.h>
11#include <sys/types.h>
12#include <sys/wait.h>
13#include <signal.h>
14#include <errno.h>
15#include <sys/time.h>
16#include <linux/hpet.h>
17
18
19extern void hpet_open_close(int, const char **);
20extern void hpet_info(int, const char **);
21extern void hpet_poll(int, const char **);
22extern void hpet_fasync(int, const char **);
23extern void hpet_read(int, const char **);
24
25#include <sys/poll.h>
26#include <sys/ioctl.h>
27
28struct hpet_command {
29 char *command;
30 void (*func)(int argc, const char ** argv);
31} hpet_command[] = {
32 {
33 "open-close",
34 hpet_open_close
35 },
36 {
37 "info",
38 hpet_info
39 },
40 {
41 "poll",
42 hpet_poll
43 },
44 {
45 "fasync",
46 hpet_fasync
47 },
48};
49
50int
51main(int argc, const char ** argv)
52{
53 unsigned int i;
54
55 argc--;
56 argv++;
57
58 if (!argc) {
59 fprintf(stderr, "-hpet: requires command\n");
60 return -1;
61 }
62
63
64 for (i = 0; i < (sizeof (hpet_command) / sizeof (hpet_command[0])); i++)
65 if (!strcmp(argv[0], hpet_command[i].command)) {
66 argc--;
67 argv++;
68 fprintf(stderr, "-hpet: executing %s\n",
69 hpet_command[i].command);
70 hpet_command[i].func(argc, argv);
71 return 0;
72 }
73
74 fprintf(stderr, "do_hpet: command %s not implemented\n", argv[0]);
75
76 return -1;
77}
78
79void
80hpet_open_close(int argc, const char **argv)
81{
82 int fd;
83
84 if (argc != 1) {
85 fprintf(stderr, "hpet_open_close: device-name\n");
86 return;
87 }
88
89 fd = open(argv[0], O_RDONLY);
90 if (fd < 0)
91 fprintf(stderr, "hpet_open_close: open failed\n");
92 else
93 close(fd);
94
95 return;
96}
97
98void
99hpet_info(int argc, const char **argv)
100{
101 struct hpet_info info;
102 int fd;
103
104 if (argc != 1) {
105 fprintf(stderr, "hpet_info: device-name\n");
106 return;
107 }
108
109 fd = open(argv[0], O_RDONLY);
110 if (fd < 0) {
111 fprintf(stderr, "hpet_info: open of %s failed\n", argv[0]);
112 return;
113 }
114
115 if (ioctl(fd, HPET_INFO, &info) < 0) {
116 fprintf(stderr, "hpet_info: failed to get info\n");
117 goto out;
118 }
119
120 fprintf(stderr, "hpet_info: hi_ireqfreq 0x%lx hi_flags 0x%lx ",
121 info.hi_ireqfreq, info.hi_flags);
122 fprintf(stderr, "hi_hpet %d hi_timer %d\n",
123 info.hi_hpet, info.hi_timer);
124
125out:
126 close(fd);
127 return;
128}
129
130void
131hpet_poll(int argc, const char **argv)
132{
133 unsigned long freq;
134 int iterations, i, fd;
135 struct pollfd pfd;
136 struct hpet_info info;
137 struct timeval stv, etv;
138 struct timezone tz;
139 long usec;
140
141 if (argc != 3) {
142 fprintf(stderr, "hpet_poll: device-name freq iterations\n");
143 return;
144 }
145
146 freq = atoi(argv[1]);
147 iterations = atoi(argv[2]);
148
149 fd = open(argv[0], O_RDONLY);
150
151 if (fd < 0) {
152 fprintf(stderr, "hpet_poll: open of %s failed\n", argv[0]);
153 return;
154 }
155
156 if (ioctl(fd, HPET_IRQFREQ, freq) < 0) {
157 fprintf(stderr, "hpet_poll: HPET_IRQFREQ failed\n");
158 goto out;
159 }
160
161 if (ioctl(fd, HPET_INFO, &info) < 0) {
162 fprintf(stderr, "hpet_poll: failed to get info\n");
163 goto out;
164 }
165
166 fprintf(stderr, "hpet_poll: info.hi_flags 0x%lx\n", info.hi_flags);
167
168 if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) {
169 fprintf(stderr, "hpet_poll: HPET_EPI failed\n");
170 goto out;
171 }
172
173 if (ioctl(fd, HPET_IE_ON, 0) < 0) {
174 fprintf(stderr, "hpet_poll, HPET_IE_ON failed\n");
175 goto out;
176 }
177
178 pfd.fd = fd;
179 pfd.events = POLLIN;
180
181 for (i = 0; i < iterations; i++) {
182 pfd.revents = 0;
183 gettimeofday(&stv, &tz);
184 if (poll(&pfd, 1, -1) < 0)
185 fprintf(stderr, "hpet_poll: poll failed\n");
186 else {
187 long data;
188
189 gettimeofday(&etv, &tz);
190 usec = stv.tv_sec * 1000000 + stv.tv_usec;
191 usec = (etv.tv_sec * 1000000 + etv.tv_usec) - usec;
192
193 fprintf(stderr,
194 "hpet_poll: expired time = 0x%lx\n", usec);
195
196 fprintf(stderr, "hpet_poll: revents = 0x%x\n",
197 pfd.revents);
198
199 if (read(fd, &data, sizeof(data)) != sizeof(data)) {
200 fprintf(stderr, "hpet_poll: read failed\n");
201 }
202 else
203 fprintf(stderr, "hpet_poll: data 0x%lx\n",
204 data);
205 }
206 }
207
208out:
209 close(fd);
210 return;
211}
212
213static int hpet_sigio_count;
214
215static void
216hpet_sigio(int val)
217{
218 fprintf(stderr, "hpet_sigio: called\n");
219 hpet_sigio_count++;
220}
221
222void
223hpet_fasync(int argc, const char **argv)
224{
225 unsigned long freq;
226 int iterations, i, fd, value;
227 sig_t oldsig;
228 struct hpet_info info;
229
230 hpet_sigio_count = 0;
231 fd = -1;
232
233 if ((oldsig = signal(SIGIO, hpet_sigio)) == SIG_ERR) {
234 fprintf(stderr, "hpet_fasync: failed to set signal handler\n");
235 return;
236 }
237
238 if (argc != 3) {
239 fprintf(stderr, "hpet_fasync: device-name freq iterations\n");
240 goto out;
241 }
242
243 fd = open(argv[0], O_RDONLY);
244
245 if (fd < 0) {
246 fprintf(stderr, "hpet_fasync: failed to open %s\n", argv[0]);
247		goto out;
248 }
249
250
251	if ((fcntl(fd, F_SETOWN, getpid()) == -1) ||
252	    ((value = fcntl(fd, F_GETFL)) == -1) ||
253	    (fcntl(fd, F_SETFL, value | O_ASYNC) == -1)) {
254 fprintf(stderr, "hpet_fasync: fcntl failed\n");
255 goto out;
256 }
257
258 freq = atoi(argv[1]);
259 iterations = atoi(argv[2]);
260
261 if (ioctl(fd, HPET_IRQFREQ, freq) < 0) {
262 fprintf(stderr, "hpet_fasync: HPET_IRQFREQ failed\n");
263 goto out;
264 }
265
266 if (ioctl(fd, HPET_INFO, &info) < 0) {
267 fprintf(stderr, "hpet_fasync: failed to get info\n");
268 goto out;
269 }
270
271 fprintf(stderr, "hpet_fasync: info.hi_flags 0x%lx\n", info.hi_flags);
272
273 if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) {
274 fprintf(stderr, "hpet_fasync: HPET_EPI failed\n");
275 goto out;
276 }
277
278 if (ioctl(fd, HPET_IE_ON, 0) < 0) {
279 fprintf(stderr, "hpet_fasync, HPET_IE_ON failed\n");
280 goto out;
281 }
282
283 for (i = 0; i < iterations; i++) {
284 (void) pause();
285 fprintf(stderr, "hpet_fasync: count = %d\n", hpet_sigio_count);
286 }
287
288out:
289 signal(SIGIO, oldsig);
290
291 if (fd >= 0)
292 close(fd);
293
294 return;
295}
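hpet_example.c above declares hpet_read() alongside the other commands, but never defines it or lists it in hpet_command[]. A minimal sketch of what such a command might look like, modeled on hpet_poll() (hypothetical, not part of the patch; the HPET_EPI step for periodic-capable timers is omitted, and whether a plain read() returns the interrupt count depends on the hpet driver):

void
hpet_read(int argc, const char **argv)
{
	unsigned long freq;
	int iterations, i, fd;
	long data;

	if (argc != 3) {
		fprintf(stderr, "hpet_read: device-name freq iterations\n");
		return;
	}

	freq = atoi(argv[1]);
	iterations = atoi(argv[2]);

	fd = open(argv[0], O_RDONLY);
	if (fd < 0) {
		fprintf(stderr, "hpet_read: open of %s failed\n", argv[0]);
		return;
	}

	if (ioctl(fd, HPET_IRQFREQ, freq) < 0 ||
	    ioctl(fd, HPET_IE_ON, 0) < 0) {
		fprintf(stderr, "hpet_read: ioctl failed\n");
		goto out;
	}

	for (i = 0; i < iterations; i++) {
		/* blocks until the next timer interrupt fires */
		if (read(fd, &data, sizeof(data)) != sizeof(data)) {
			fprintf(stderr, "hpet_read: read failed\n");
			break;
		}
		fprintf(stderr, "hpet_read: data 0x%lx\n", data);
	}

out:
	close(fd);
}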
diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile
new file mode 100644
index 000000000..b78344e7b
--- /dev/null
+++ b/samples/trace_events/Makefile
@@ -0,0 +1,15 @@
1# SPDX-License-Identifier: GPL-2.0-only
2# builds the trace events example kernel modules;
3# then to use one (as root): insmod <module_name.ko>
4
5# If you include a trace header outside of include/trace/events
6# then the file that does the #define CREATE_TRACE_POINTS must
7# have that tracer file in its main search path. This is because
8# define_trace.h will include it, and must be able to find it from
9# the include/trace directory.
10#
11# Here trace-events-sample.c does the CREATE_TRACE_POINTS.
12#
13CFLAGS_trace-events-sample.o := -I$(src)
14
15obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
new file mode 100644
index 000000000..1a72b7d95
--- /dev/null
+++ b/samples/trace_events/trace-events-sample.c
@@ -0,0 +1,140 @@
1// SPDX-License-Identifier: GPL-2.0-only
2#include <linux/module.h>
3#include <linux/kthread.h>
4
5/*
6 * Any file that uses trace points must include the header.
7 * But exactly one file must define CREATE_TRACE_POINTS before
8 * including the header. This generates the C code that
9 * creates the handles for the trace points.
10 */
11#define CREATE_TRACE_POINTS
12#include "trace-events-sample.h"
13
14static const char *random_strings[] = {
15 "Mother Goose",
16 "Snoopy",
17 "Gandalf",
18 "Frodo",
19 "One ring to rule them all"
20};
21
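/*
 * Note: simple_thread_func() below terminates array[] with a 0 entry;
 * the __length_of() helper in trace-events-sample.h counts entries up
 * to that terminator to size the event's __dynamic_array field.
 */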
22static void simple_thread_func(int cnt)
23{
24 int array[6];
25 int len = cnt % 5;
26 int i;
27
28 set_current_state(TASK_INTERRUPTIBLE);
29 schedule_timeout(HZ);
30
31 for (i = 0; i < len; i++)
32 array[i] = i + 1;
33 array[i] = 0;
34
35 /* Silly tracepoints */
36 trace_foo_bar("hello", cnt, array, random_strings[len],
37 current->cpus_ptr);
38
39 trace_foo_with_template_simple("HELLO", cnt);
40
41 trace_foo_bar_with_cond("Some times print", cnt);
42
43 trace_foo_with_template_cond("prints other times", cnt);
44
45 trace_foo_with_template_print("I have to be different", cnt);
46}
47
48static int simple_thread(void *arg)
49{
50 int cnt = 0;
51
52 while (!kthread_should_stop())
53 simple_thread_func(cnt++);
54
55 return 0;
56}
57
58static struct task_struct *simple_tsk;
59static struct task_struct *simple_tsk_fn;
60
61static void simple_thread_func_fn(int cnt)
62{
63 set_current_state(TASK_INTERRUPTIBLE);
64 schedule_timeout(HZ);
65
66 /* More silly tracepoints */
67 trace_foo_bar_with_fn("Look at me", cnt);
68 trace_foo_with_template_fn("Look at me too", cnt);
69}
70
71static int simple_thread_fn(void *arg)
72{
73 int cnt = 0;
74
75 while (!kthread_should_stop())
76 simple_thread_func_fn(cnt++);
77
78 return 0;
79}
80
81static DEFINE_MUTEX(thread_mutex);
82static int simple_thread_cnt;
83
84int foo_bar_reg(void)
85{
86 mutex_lock(&thread_mutex);
87 if (simple_thread_cnt++)
88 goto out;
89
90 pr_info("Starting thread for foo_bar_fn\n");
91 /*
92 * We shouldn't be able to start a trace when the module is
93	 * unloading (there are other locks to prevent that). But
94	 * for consistency's sake, we still take the thread_mutex.
95 */
96 simple_tsk_fn = kthread_run(simple_thread_fn, NULL, "event-sample-fn");
97 out:
98 mutex_unlock(&thread_mutex);
99 return 0;
100}
101
102void foo_bar_unreg(void)
103{
104 mutex_lock(&thread_mutex);
105 if (--simple_thread_cnt)
106 goto out;
107
108 pr_info("Killing thread for foo_bar_fn\n");
109 if (simple_tsk_fn)
110 kthread_stop(simple_tsk_fn);
111 simple_tsk_fn = NULL;
112 out:
113 mutex_unlock(&thread_mutex);
114}
115
116static int __init trace_event_init(void)
117{
118 simple_tsk = kthread_run(simple_thread, NULL, "event-sample");
119 if (IS_ERR(simple_tsk))
120 return -1;
121
122 return 0;
123}
124
125static void __exit trace_event_exit(void)
126{
127 kthread_stop(simple_tsk);
128 mutex_lock(&thread_mutex);
129 if (simple_tsk_fn)
130 kthread_stop(simple_tsk_fn);
131 simple_tsk_fn = NULL;
132 mutex_unlock(&thread_mutex);
133}
134
135module_init(trace_event_init);
136module_exit(trace_event_exit);
137
138MODULE_AUTHOR("Steven Rostedt");
139MODULE_DESCRIPTION("trace-events-sample");
140MODULE_LICENSE("GPL");
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
new file mode 100644
index 000000000..13a35f7cb
--- /dev/null
+++ b/samples/trace_events/trace-events-sample.h
@@ -0,0 +1,524 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * If TRACE_SYSTEM is defined, that will be the directory created
4 * in the ftrace directory under /sys/kernel/tracing/events/<system>
5 *
6 * The define_trace.h below will also look for a file name of
7 * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here.
8 * In this case, it would look for sample-trace.h
9 *
10 * If the header name is different from the system name
11 * (as in this case), then you can override the header name that
12 * define_trace.h will look up by defining TRACE_INCLUDE_FILE
13 *
14 * This file is called trace-events-sample.h but we want the system
15 * to be called "sample-trace". Therefore we must define the name of this
16 * file:
17 *
18 * #define TRACE_INCLUDE_FILE trace-events-sample
19 *
20 * As we do at the bottom of this file.
21 *
22 * Notice that TRACE_SYSTEM should be defined outside of #if
23 * protection, just like TRACE_INCLUDE_FILE.
24 */
25#undef TRACE_SYSTEM
26#define TRACE_SYSTEM sample-trace
27
28/*
29 * TRACE_SYSTEM is expected to be a valid C variable name (alpha-numeric
30 * and underscore, although it may start with numbers). If for some
31 * reason it is not (as here, with its dash), you need these lines:
32 */
33#undef TRACE_SYSTEM_VAR
34#define TRACE_SYSTEM_VAR sample_trace
35/*
36 * But the above is only needed if TRACE_SYSTEM is not alpha-numeric
37 * and underscored. By default, TRACE_SYSTEM_VAR will be equal to
38 * TRACE_SYSTEM. As TRACE_SYSTEM_VAR must be alpha-numeric, if
39 * TRACE_SYSTEM is not, then TRACE_SYSTEM_VAR must be defined with
40 * only alpha-numeric and underscores.
41 *
42 * The TRACE_SYSTEM_VAR is only used internally and not visible to
43 * user space.
44 */
45
46/*
47 * Notice that this file is not protected like a normal header.
48 * We also must allow for rereading of this file. The
49 *
50 * || defined(TRACE_HEADER_MULTI_READ)
51 *
52 * serves this purpose.
53 */
54#if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ)
55#define _TRACE_EVENT_SAMPLE_H
56
57/*
58 * All trace headers should include tracepoint.h, until we finally
59 * make it into a standard header.
60 */
61#include <linux/tracepoint.h>
62
63/*
64 * The TRACE_EVENT macro is broken up into 5 parts.
65 *
66 * name: name of the trace point. This is also how to enable the tracepoint.
67 * A function called trace_foo_bar() will be created.
68 *
69 * proto: the prototype of the function trace_foo_bar()
70 * Here it is trace_foo_bar(char *foo, int bar).
71 *
72 * args: must match the arguments in the prototype.
73 * Here it is simply "foo, bar".
74 *
75 * struct: This defines the way the data will be stored in the ring buffer.
76 * The items declared here become part of a special structure
77 * called "__entry", which can be used in the fast_assign part of the
78 * TRACE_EVENT macro.
79 *
80 * Here are the currently defined types you can use:
81 *
82 * __field : Is broken up into type and name, where type can be any
83 * primitive type (integer, long or pointer).
84 *
85 * __field(int, foo)
86 *
87 * __entry->foo = 5;
88 *
89 * __field_struct : This can be any static complex data type (struct, union
90 * but not an array). Be careful using complex types, as each
91 * event is limited in size, and copying large amounts of data
92 * into the ring buffer can slow things down.
93 *
94 * __field_struct(struct bar, foo)
95 *
96 * __entry->bar.x = y;
97 *
98 * __array: There are three fields (type, name, size). The type is the
99 * type of elements in the array, the name is the name of the array.
100 * size is the number of items in the array (not the total size).
101 *
102 * __array( char, foo, 10) is the same as saying: char foo[10];
103 *
104 * Assigning arrays can be done like any array:
105 *
106 * __entry->foo[0] = 'a';
107 *
108 * memcpy(__entry->foo, bar, 10);
109 *
110 * __dynamic_array: This is similar to array, but can vary its size from
111 * instance to instance of the tracepoint being called.
112 * Like __array, this too has three elements (type, name, size);
113 * type is the type of the element, name is the name of the array.
114 * The size is different than __array. It is not a static number,
115 * but the algorithm to figure out the length of the array for the
116 * specific instance of tracepoint. Again, size is the number of
117 * items in the array, not the total length in bytes.
118 *
119 * __dynamic_array( int, foo, bar) is similar to: int foo[bar];
120 *
121 * Note, unlike arrays, you must use the __get_dynamic_array() macro
122 * to access the array.
123 *
124 * memcpy(__get_dynamic_array(foo), bar, 10);
125 *
126 * Notice that "__entry" is not needed here.
127 *
128 * __string: This is a special kind of __dynamic_array. It expects to
129 * have a null terminated character array passed to it (it allows
130 * for NULL too, which would be converted into "(null)"). __string
131 * takes two parameters (name, src), where name is the name of
132 * the string saved, and src is the string to copy into the
133 * ring buffer.
134 *
135 * __string(foo, bar) is similar to: strcpy(foo, bar)
136 *
137 * To assign a string, use the helper macro __assign_str().
138 *
139 * __assign_str(foo, bar);
140 *
141 * In most cases, the __assign_str() macro will take the same
142 * parameters as the __string() macro had to declare the string.
143 *
144 * __bitmask: This is another kind of __dynamic_array, but it expects
145 * an array of longs, and the number of bits to parse. It takes
146 * two parameters (name, nr_bits), where name is the name of the
147 * bitmask to save, and the nr_bits is the number of bits to record.
148 *
149 * __bitmask(target_cpu, nr_cpumask_bits)
150 *
151 * To assign a bitmask, use the __assign_bitmask() helper macro.
152 *
153 * __assign_bitmask(target_cpus, cpumask_bits(bar), nr_cpumask_bits);
154 *
155 *
156 * fast_assign: This is a C-like function that is used to store the items
157 * into the ring buffer. A special variable called "__entry" will be the
158 * structure that points into the ring buffer and has the same fields as
159 * described by the struct part of TRACE_EVENT above.
160 *
161 * printk: This is a way to print out the data in pretty print. This is
162 * useful if the system crashes while you are logging via a serial line;
163 * the data can then be printed to the console using this "printk" method.
164 * This is also used to print out the data from the trace files.
165 * Again, the __entry macro is used to access the data from the ring buffer.
166 *
167 * Note, __dynamic_array, __string, and __bitmask require special helpers
168 * to access the data.
169 *
170 * For __dynamic_array(int, foo, bar) use __get_dynamic_array(foo)
171 * Use __get_dynamic_array_len(foo) to get the length of the array
172 * saved. Note, __get_dynamic_array_len() returns the total allocated
173 * length of the dynamic array; __print_array() expects the second
174 * parameter to be the number of elements. To get that, the array length
175 * needs to be divided by the element size.
176 *
177 * For __string(foo, bar) use __get_str(foo)
178 *
179 * For __bitmask(target_cpus, nr_cpumask_bits) use __get_bitmask(target_cpus)
180 *
181 *
182 * Note that for both the assign and the printk, __entry is the handle
183 * to the data structure in the ring buffer, and is defined by the
184 * TP_STRUCT__entry.
185 */
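
/*
 * Putting the five parts together, a minimal event using only __field
 * and __string might look like the sketch below (illustration only;
 * the real, full-featured events of this sample follow further down):
 *
 *	TRACE_EVENT(foo_minimal,
 *		TP_PROTO(const char *msg, int val),
 *		TP_ARGS(msg, val),
 *		TP_STRUCT__entry(
 *			__string(	msg,	msg	)
 *			__field(	int,	val	)
 *		),
 *		TP_fast_assign(
 *			__assign_str(msg, msg);
 *			__entry->val = val;
 *		),
 *		TP_printk("msg %s val %d", __get_str(msg), __entry->val)
 *	);
 */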
186
187/*
188 * It is OK to have helper functions in the file, but they need to be protected
189 * from being defined more than once. Remember, this file gets included more
190 * than once.
191 */
192#ifndef __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS
193#define __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS
194static inline int __length_of(const int *list)
195{
196 int i;
197
198 if (!list)
199 return 0;
200
201 for (i = 0; list[i]; i++)
202 ;
203 return i;
204}
205
206enum {
207 TRACE_SAMPLE_FOO = 2,
208 TRACE_SAMPLE_BAR = 4,
209 TRACE_SAMPLE_ZOO = 8,
210};
211#endif
212
213/*
214 * If enums are used in the TP_printk(), their names will be shown in
215 * format files and not their values. This can cause problems with user
216 * space programs that parse the format files to know how to translate
217 * the raw binary trace output into human readable text.
218 *
219 * To help out user space programs, any enum that is used in the TP_printk()
220 * should be defined by the TRACE_DEFINE_ENUM() macro. All that needs to
221 * be done is to add this macro with the enum within it in the trace
222 * header file, and it will be converted in the output.
223 */
224
225TRACE_DEFINE_ENUM(TRACE_SAMPLE_FOO);
226TRACE_DEFINE_ENUM(TRACE_SAMPLE_BAR);
227TRACE_DEFINE_ENUM(TRACE_SAMPLE_ZOO);
228
229TRACE_EVENT(foo_bar,
230
231 TP_PROTO(const char *foo, int bar, const int *lst,
232 const char *string, const struct cpumask *mask),
233
234 TP_ARGS(foo, bar, lst, string, mask),
235
236 TP_STRUCT__entry(
237 __array( char, foo, 10 )
238 __field( int, bar )
239 __dynamic_array(int, list, __length_of(lst))
240 __string( str, string )
241 __bitmask( cpus, num_possible_cpus() )
242 ),
243
244 TP_fast_assign(
245 strlcpy(__entry->foo, foo, 10);
246 __entry->bar = bar;
247 memcpy(__get_dynamic_array(list), lst,
248 __length_of(lst) * sizeof(int));
249 __assign_str(str, string);
250 __assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus());
251 ),
252
253 TP_printk("foo %s %d %s %s %s %s (%s)", __entry->foo, __entry->bar,
254
255/*
256 * Notice here the use of some helper functions. This includes:
257 *
258 * __print_symbolic( variable, { value, "string" }, ... ),
259 *
260 * The variable is tested against each value of the { } pair. If
261 * the variable matches one of the values, then it will print the
263 * string in that pair. If none are matched, it returns a string
263 * version of the number (if __entry->bar == 7 then "7" is returned).
264 */
265 __print_symbolic(__entry->bar,
266 { 0, "zero" },
267 { TRACE_SAMPLE_FOO, "TWO" },
268 { TRACE_SAMPLE_BAR, "FOUR" },
269 { TRACE_SAMPLE_ZOO, "EIGHT" },
270 { 10, "TEN" }
271 ),
272
273/*
274 * __print_flags( variable, "delim", { value, "flag" }, ... ),
275 *
276 * This is similar to __print_symbolic, except that it tests the bits
277 * of the value. If ((FLAG & variable) == FLAG) then the string is
278 * printed. If more than one flag matches, then each one that does is
279 * also printed with delim in between them.
280 * If not all bits are accounted for, then the unmatched bits will be
281 * added in hex format: with the flags below, 0x506 will show BIT2|BIT3|0x500
282 */
283 __print_flags(__entry->bar, "|",
284 { 1, "BIT1" },
285 { 2, "BIT2" },
286 { 4, "BIT3" },
287 { 8, "BIT4" }
288 ),
289/*
290 * __print_array( array, len, element_size )
291 *
292 * This prints out the array that is defined by __array in a nice format.
293 */
294 __print_array(__get_dynamic_array(list),
295 __get_dynamic_array_len(list) / sizeof(int),
296 sizeof(int)),
297 __get_str(str), __get_bitmask(cpus))
298);
299
300/*
301 * There may be a case where a tracepoint should only be called if
302 * some condition is set. Otherwise the tracepoint should not be called.
303 * But to do something like:
304 *
305 * if (cond)
306 * trace_foo();
307 *
308 * Would cause a little overhead when tracing is not enabled, and that
309 * overhead, even if small, is not something we want. As tracepoints
310 * use static branch (aka jump_labels), where no branch is taken to
311 * skip the tracepoint when not enabled, and a jmp is placed to jump
312 * to the tracepoint code when it is enabled, having an if statement
313 * nullifies that optimization. It would be nice to place that
314 * condition within the static branch. This is where TRACE_EVENT_CONDITION
315 * comes in.
316 *
317 * TRACE_EVENT_CONDITION() is just like TRACE_EVENT, except it adds another
318 * parameter just after args. Where TRACE_EVENT has:
319 *
320 * TRACE_EVENT(name, proto, args, struct, assign, printk)
321 *
322 * the CONDITION version has:
323 *
324 * TRACE_EVENT_CONDITION(name, proto, args, cond, struct, assign, printk)
325 *
326 * Everything is the same as TRACE_EVENT except for the new cond. Think
327 * of the cond variable as:
328 *
329 * if (cond)
330 * trace_foo_bar_with_cond();
331 *
332 * Except that the logic for the if branch is placed after the static branch.
333 * That is, the if statement that processes the condition will not be
334 * executed unless that tracepoint is enabled. Otherwise it still remains
335 * a nop.
336 */
337TRACE_EVENT_CONDITION(foo_bar_with_cond,
338
339 TP_PROTO(const char *foo, int bar),
340
341 TP_ARGS(foo, bar),
342
343 TP_CONDITION(!(bar % 10)),
344
345 TP_STRUCT__entry(
346 __string( foo, foo )
347 __field( int, bar )
348 ),
349
350 TP_fast_assign(
351 __assign_str(foo, foo);
352 __entry->bar = bar;
353 ),
354
355 TP_printk("foo %s %d", __get_str(foo), __entry->bar)
356);
357
358int foo_bar_reg(void);
359void foo_bar_unreg(void);
360
361/*
362 * Now in the case that some function needs to be called when the
363 * tracepoint is enabled and/or when it is disabled, the
364 * TRACE_EVENT_FN() serves this purpose. This is just like TRACE_EVENT()
365 * but adds two more parameters at the end:
366 *
367 * TRACE_EVENT_FN( name, proto, args, struct, assign, printk, reg, unreg)
368 *
369 * reg and unreg are functions with the prototype of:
370 *
371 * void reg(void)
372 *
373 * The reg function gets called before the tracepoint is enabled, and
374 * the unreg function gets called after the tracepoint is disabled.
375 *
376 * Note, reg and unreg are allowed to be NULL. If you only need to
377 * call a function before enabling, or after disabling, just set one
378 * function and pass in NULL for the other parameter.
379 */
380TRACE_EVENT_FN(foo_bar_with_fn,
381
382 TP_PROTO(const char *foo, int bar),
383
384 TP_ARGS(foo, bar),
385
386 TP_STRUCT__entry(
387 __string( foo, foo )
388 __field( int, bar )
389 ),
390
391 TP_fast_assign(
392 __assign_str(foo, foo);
393 __entry->bar = bar;
394 ),
395
396 TP_printk("foo %s %d", __get_str(foo), __entry->bar),
397
398 foo_bar_reg, foo_bar_unreg
399);
400
401/*
402 * Each TRACE_EVENT macro creates several helper functions to produce
403 * the code to add the tracepoint, create the files in the trace
404 * directory, hook it to perf, assign the values and to print out
405 * the raw data from the ring buffer. To prevent too much bloat,
406 * if there is more than one tracepoint that uses the same format
407 * for the proto, args, struct, assign and printk, and only the name
408 * is different, it is highly recommended to use DECLARE_EVENT_CLASS.
409 *
410 * The DECLARE_EVENT_CLASS() macro creates most of the functions for the
411 * tracepoint. Then DEFINE_EVENT() is used to hook a tracepoint to those
412 * functions. This DEFINE_EVENT() is an instance of the class and can
413 * be enabled and disabled separately from other events (either TRACE_EVENT
414 * or other DEFINE_EVENT()s).
415 *
416 * Note, TRACE_EVENT() itself is simply defined as:
417 *
418 * #define TRACE_EVENT(name, proto, args, tstruct, assign, printk) \
419 * DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, printk); \
420 * DEFINE_EVENT(name, name, proto, args)
421 *
422 * The DEFINE_EVENT() also can be declared with conditions and reg functions:
423 *
424 * DEFINE_EVENT_CONDITION(template, name, proto, args, cond);
425 * DEFINE_EVENT_FN(template, name, proto, args, reg, unreg);
426 */
427DECLARE_EVENT_CLASS(foo_template,
428
429 TP_PROTO(const char *foo, int bar),
430
431 TP_ARGS(foo, bar),
432
433 TP_STRUCT__entry(
434 __string( foo, foo )
435 __field( int, bar )
436 ),
437
438 TP_fast_assign(
439 __assign_str(foo, foo);
440 __entry->bar = bar;
441 ),
442
443 TP_printk("foo %s %d", __get_str(foo), __entry->bar)
444);
445
446/*
447 * Here's a better way for the previous samples (except, the first
448 * example had more fields and could not be used here).
449 */
450DEFINE_EVENT(foo_template, foo_with_template_simple,
451 TP_PROTO(const char *foo, int bar),
452 TP_ARGS(foo, bar));
453
454DEFINE_EVENT_CONDITION(foo_template, foo_with_template_cond,
455 TP_PROTO(const char *foo, int bar),
456 TP_ARGS(foo, bar),
457 TP_CONDITION(!(bar % 8)));
458
459
460DEFINE_EVENT_FN(foo_template, foo_with_template_fn,
461 TP_PROTO(const char *foo, int bar),
462 TP_ARGS(foo, bar),
463 foo_bar_reg, foo_bar_unreg);
464
465/*
466 * Anytime two events share basically the same values and have
467 * the same output, use the DECLARE_EVENT_CLASS() and DEFINE_EVENT()
468 * whenever possible.
469 */
470
471/*
472 * If the event is similar to the DECLARE_EVENT_CLASS, but you need
473 * to have a different output, then use DEFINE_EVENT_PRINT() which
474 * lets you override the TP_printk() of the class.
475 */
476
477DEFINE_EVENT_PRINT(foo_template, foo_with_template_print,
478 TP_PROTO(const char *foo, int bar),
479 TP_ARGS(foo, bar),
480 TP_printk("bar %s %d", __get_str(foo), __entry->bar));
481
482#endif
483
484/***** NOTICE! The #if protection ends here. *****/
485
486
487/*
488 * There are several ways I could have done this. If I left out the
489 * TRACE_INCLUDE_PATH, then it would default to the kernel source
490 * include/trace/events directory.
491 *
492 * I could specify a path from the define_trace.h file back to this
493 * file.
494 *
495 * #define TRACE_INCLUDE_PATH ../../samples/trace_events
496 *
497 * But the safest and easiest way is to simply make it use the directory
498 * that the file is in is to add in the Makefile:
499 *
500 * CFLAGS_trace-events-sample.o := -I$(src)
501 *
502 * This will make sure the current path is part of the include
503 * structure for our file so that define_trace.h can find it.
504 *
505 * I could have made only the top level directory the include:
506 *
507 * CFLAGS_trace-events-sample.o := -I$(PWD)
508 *
509 * And then let the path to this directory be the TRACE_INCLUDE_PATH:
510 *
511 * #define TRACE_INCLUDE_PATH samples/trace_events
512 *
513 * But then if something defines "samples" or "trace_events" as a macro
514 * then we could risk that being converted too, and give us an unexpected
515 * result.
516 */
517#undef TRACE_INCLUDE_PATH
518#undef TRACE_INCLUDE_FILE
519#define TRACE_INCLUDE_PATH .
520/*
521 * TRACE_INCLUDE_FILE is not needed if the filename and TRACE_SYSTEM are equal
522 */
523#define TRACE_INCLUDE_FILE trace-events-sample
524#include <trace/define_trace.h>
diff --git a/samples/trace_printk/Makefile b/samples/trace_printk/Makefile
new file mode 100644
index 000000000..c0df36167
--- /dev/null
+++ b/samples/trace_printk/Makefile
@@ -0,0 +1,7 @@
1# SPDX-License-Identifier: GPL-2.0-only
2# builds a module that calls various trace_printk routines
3# then to use it (as root): insmod <module_name.ko>
4
5# This module can also be used to test the trace_printk code.
6
7obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace-printk.o
diff --git a/samples/trace_printk/trace-printk.c b/samples/trace_printk/trace-printk.c
new file mode 100644
index 000000000..cfc159580
--- /dev/null
+++ b/samples/trace_printk/trace-printk.c
@@ -0,0 +1,58 @@
1// SPDX-License-Identifier: GPL-2.0-only
2#include <linux/module.h>
3#include <linux/kthread.h>
4#include <linux/irq_work.h>
5
6/* Must not be static to force gcc to consider these non-constant */
7char *trace_printk_test_global_str =
8 "This is a dynamic string that will use trace_puts\n";
9
10char *trace_printk_test_global_str_irq =
11 "(irq) This is a dynamic string that will use trace_puts\n";
12
13char *trace_printk_test_global_str_fmt =
14 "%sThis is a %s that will use trace_printk\n";
15
16static struct irq_work irqwork;
17
18static void trace_printk_irq_work(struct irq_work *work)
19{
20 trace_printk("(irq) This is a static string that will use trace_bputs\n");
21 trace_printk(trace_printk_test_global_str_irq);
22
23 trace_printk("(irq) This is a %s that will use trace_bprintk()\n",
24 "static string");
25
26 trace_printk(trace_printk_test_global_str_fmt,
27 "(irq) ", "dynamic string");
28}
29
30static int __init trace_printk_init(void)
31{
32 init_irq_work(&irqwork, trace_printk_irq_work);
33
34 trace_printk("This is a static string that will use trace_bputs\n");
35 trace_printk(trace_printk_test_global_str);
36
37 /* Kick off printing in irq context */
38 irq_work_queue(&irqwork);
39 irq_work_sync(&irqwork);
40
41 trace_printk("This is a %s that will use trace_bprintk()\n",
42 "static string");
43
44 trace_printk(trace_printk_test_global_str_fmt, "", "dynamic string");
45
46 return 0;
47}
48
49static void __exit trace_printk_exit(void)
50{
51}
52
53module_init(trace_printk_init);
54module_exit(trace_printk_exit);
55
56MODULE_AUTHOR("Steven Rostedt");
57MODULE_DESCRIPTION("trace-printk");
58MODULE_LICENSE("GPL");
diff --git a/samples/uhid/.gitignore b/samples/uhid/.gitignore
new file mode 100644
index 000000000..0e0a5a929
--- /dev/null
+++ b/samples/uhid/.gitignore
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0-only
2/uhid-example
diff --git a/samples/uhid/Makefile b/samples/uhid/Makefile
new file mode 100644
index 000000000..0aa424ec4
--- /dev/null
+++ b/samples/uhid/Makefile
@@ -0,0 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
2userprogs-always-y += uhid-example
3
4userccflags += -I usr/include
diff --git a/samples/uhid/uhid-example.c b/samples/uhid/uhid-example.c
new file mode 100644
index 000000000..015cb06a2
--- /dev/null
+++ b/samples/uhid/uhid-example.c
@@ -0,0 +1,465 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * UHID Example
4 *
5 * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
6 *
7 * The code may be used by anyone for any purpose,
8 * and can serve as a starting point for developing
9 * applications using uhid.
10 */
11
12/*
13 * UHID Example
14 * This example emulates a basic 3-button mouse with wheel over UHID. Run this
15 * program as root and then use the following keys to control the mouse:
16 * q: Quit the application
17 * 1: Toggle left button (down, up, ...)
18 * 2: Toggle right button
19 * 3: Toggle middle button
20 * a: Move mouse left
21 * d: Move mouse right
22 * w: Move mouse up
23 * s: Move mouse down
24 * r: Move wheel up
25 * f: Move wheel down
26 *
27 * In addition to the 3-button mouse, 3 keyboard LEDs are supported (LED_NUML,
28 * LED_CAPSL and LED_SCROLLL). The device doesn't generate any related keyboard
29 * events, though. You need to manually write the EV_LED/LED_XY/1 activation
30 * input event to the evdev device to see it being sent to this device.
31 *
32 * If uhid is not available as /dev/uhid, then you can pass a different path as
33 * first argument.
34 * If <linux/uhid.h> is not installed in /usr, then compile this with:
35 * gcc -o ./uhid_test -Wall -I./include ./samples/uhid/uhid-example.c
36 * And ignore the warning about kernel headers. However, it is recommended to
37 * use the installed uhid.h if available.
38 */
39
40#include <errno.h>
41#include <fcntl.h>
42#include <poll.h>
43#include <stdbool.h>
44#include <stdio.h>
45#include <stdlib.h>
46#include <string.h>
47#include <termios.h>
48#include <unistd.h>
49#include <linux/uhid.h>
50
51/*
52 * HID Report Descriptor
53 * We emulate a basic 3-button mouse with wheel and 3 keyboard LEDs. This is
54 * the report-descriptor as the kernel will parse it:
55 *
56 * INPUT(1)[INPUT]
57 * Field(0)
58 * Physical(GenericDesktop.Pointer)
59 * Application(GenericDesktop.Mouse)
60 * Usage(3)
61 * Button.0001
62 * Button.0002
63 * Button.0003
64 * Logical Minimum(0)
65 * Logical Maximum(1)
66 * Report Size(1)
67 * Report Count(3)
68 * Report Offset(0)
69 * Flags( Variable Absolute )
70 * Field(1)
71 * Physical(GenericDesktop.Pointer)
72 * Application(GenericDesktop.Mouse)
73 * Usage(3)
74 * GenericDesktop.X
75 * GenericDesktop.Y
76 * GenericDesktop.Wheel
77 * Logical Minimum(-128)
78 * Logical Maximum(127)
79 * Report Size(8)
80 * Report Count(3)
81 * Report Offset(8)
82 * Flags( Variable Relative )
83 * OUTPUT(2)[OUTPUT]
84 * Field(0)
85 * Application(GenericDesktop.Keyboard)
86 * Usage(3)
87 * LED.NumLock
88 * LED.CapsLock
89 * LED.ScrollLock
90 * Logical Minimum(0)
91 * Logical Maximum(1)
92 * Report Size(1)
93 * Report Count(3)
94 * Report Offset(0)
95 * Flags( Variable Absolute )
96 *
97 * This is the mapping that we expect:
98 * Button.0001 ---> Key.LeftBtn
99 * Button.0002 ---> Key.RightBtn
100 * Button.0003 ---> Key.MiddleBtn
101 * GenericDesktop.X ---> Relative.X
102 * GenericDesktop.Y ---> Relative.Y
103 * GenericDesktop.Wheel ---> Relative.Wheel
104 * LED.NumLock ---> LED.NumLock
105 * LED.CapsLock ---> LED.CapsLock
106 * LED.ScrollLock ---> LED.ScrollLock
107 *
108 * This information can be verified by reading /sys/kernel/debug/hid/<dev>/rdesc
109 * This file should print the same information as shown above.
110 */
111
112static unsigned char rdesc[] = {
113 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */
114 0x09, 0x02, /* USAGE (Mouse) */
115 0xa1, 0x01, /* COLLECTION (Application) */
116 0x09, 0x01, /* USAGE (Pointer) */
117 0xa1, 0x00, /* COLLECTION (Physical) */
118 0x85, 0x01, /* REPORT_ID (1) */
119 0x05, 0x09, /* USAGE_PAGE (Button) */
120 0x19, 0x01, /* USAGE_MINIMUM (Button 1) */
121 0x29, 0x03, /* USAGE_MAXIMUM (Button 3) */
122 0x15, 0x00, /* LOGICAL_MINIMUM (0) */
123 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */
124 0x95, 0x03, /* REPORT_COUNT (3) */
125 0x75, 0x01, /* REPORT_SIZE (1) */
126 0x81, 0x02, /* INPUT (Data,Var,Abs) */
127 0x95, 0x01, /* REPORT_COUNT (1) */
128 0x75, 0x05, /* REPORT_SIZE (5) */
129 0x81, 0x01, /* INPUT (Cnst,Var,Abs) */
130 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */
131 0x09, 0x30, /* USAGE (X) */
132 0x09, 0x31, /* USAGE (Y) */
133 0x09, 0x38, /* USAGE (WHEEL) */
134 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */
135 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */
136 0x75, 0x08, /* REPORT_SIZE (8) */
137 0x95, 0x03, /* REPORT_COUNT (3) */
138 0x81, 0x06, /* INPUT (Data,Var,Rel) */
139 0xc0, /* END_COLLECTION */
140 0xc0, /* END_COLLECTION */
141 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */
142 0x09, 0x06, /* USAGE (Keyboard) */
143 0xa1, 0x01, /* COLLECTION (Application) */
144 0x85, 0x02, /* REPORT_ID (2) */
145 0x05, 0x08, /* USAGE_PAGE (Led) */
146 0x19, 0x01, /* USAGE_MINIMUM (1) */
147 0x29, 0x03, /* USAGE_MAXIMUM (3) */
148 0x15, 0x00, /* LOGICAL_MINIMUM (0) */
149 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */
150 0x95, 0x03, /* REPORT_COUNT (3) */
151 0x75, 0x01, /* REPORT_SIZE (1) */
152 0x91, 0x02, /* Output (Data,Var,Abs) */
153 0x95, 0x01, /* REPORT_COUNT (1) */
154 0x75, 0x05, /* REPORT_SIZE (5) */
155 0x91, 0x01, /* Output (Cnst,Var,Abs) */
156 0xc0, /* END_COLLECTION */
157};
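
/*
 * With the descriptor above, an INPUT report (as assembled by
 * send_event() below) is 5 bytes:
 *
 *	data[0]: report id (0x01)
 *	data[1]: bit 0/1/2 = left/right/middle button, upper 5 bits padding
 *	data[2]: relative X movement
 *	data[3]: relative Y movement
 *	data[4]: wheel movement
 */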
158
159static int uhid_write(int fd, const struct uhid_event *ev)
160{
161 ssize_t ret;
162
163 ret = write(fd, ev, sizeof(*ev));
164 if (ret < 0) {
165 fprintf(stderr, "Cannot write to uhid: %m\n");
166 return -errno;
167 } else if (ret != sizeof(*ev)) {
168 fprintf(stderr, "Wrong size written to uhid: %zd != %zu\n",
169			ret, sizeof(*ev));
170 return -EFAULT;
171 } else {
172 return 0;
173 }
174}
175
176static int create(int fd)
177{
178 struct uhid_event ev;
179
180 memset(&ev, 0, sizeof(ev));
181 ev.type = UHID_CREATE;
182 strcpy((char*)ev.u.create.name, "test-uhid-device");
183 ev.u.create.rd_data = rdesc;
184 ev.u.create.rd_size = sizeof(rdesc);
185 ev.u.create.bus = BUS_USB;
186 ev.u.create.vendor = 0x15d9;
187 ev.u.create.product = 0x0a37;
188 ev.u.create.version = 0;
189 ev.u.create.country = 0;
190
191 return uhid_write(fd, &ev);
192}
193
194static void destroy(int fd)
195{
196 struct uhid_event ev;
197
198 memset(&ev, 0, sizeof(ev));
199 ev.type = UHID_DESTROY;
200
201 uhid_write(fd, &ev);
202}
203
204/* This parses raw output reports sent by the kernel to the device. A normal
205 * uhid program shouldn't do this but instead just forward the raw report.
206 * However, for documentation purposes, we try to detect LED events here and
207 * print debug messages for it. */
208static void handle_output(struct uhid_event *ev)
209{
210	/* LED messages are advertised via OUTPUT reports; ignore the rest */
211 if (ev->u.output.rtype != UHID_OUTPUT_REPORT)
212 return;
213 /* LED reports have length 2 bytes */
214 if (ev->u.output.size != 2)
215 return;
216 /* first byte is report-id which is 0x02 for LEDs in our rdesc */
217 if (ev->u.output.data[0] != 0x2)
218 return;
219
220 /* print flags payload */
221 fprintf(stderr, "LED output report received with flags %x\n",
222 ev->u.output.data[1]);
223}
224
225static int event(int fd)
226{
227 struct uhid_event ev;
228 ssize_t ret;
229
230 memset(&ev, 0, sizeof(ev));
231 ret = read(fd, &ev, sizeof(ev));
232 if (ret == 0) {
233 fprintf(stderr, "Read HUP on uhid-cdev\n");
234 return -EFAULT;
235 } else if (ret < 0) {
236 fprintf(stderr, "Cannot read uhid-cdev: %m\n");
237 return -errno;
238 } else if (ret != sizeof(ev)) {
239 fprintf(stderr, "Invalid size read from uhid-dev: %zd != %zu\n",
240 ret, sizeof(ev));
241 return -EFAULT;
242 }
243
244 switch (ev.type) {
245 case UHID_START:
246 fprintf(stderr, "UHID_START from uhid-dev\n");
247 break;
248 case UHID_STOP:
249 fprintf(stderr, "UHID_STOP from uhid-dev\n");
250 break;
251 case UHID_OPEN:
252 fprintf(stderr, "UHID_OPEN from uhid-dev\n");
253 break;
254 case UHID_CLOSE:
255 fprintf(stderr, "UHID_CLOSE from uhid-dev\n");
256 break;
257 case UHID_OUTPUT:
258 fprintf(stderr, "UHID_OUTPUT from uhid-dev\n");
259 handle_output(&ev);
260 break;
261 case UHID_OUTPUT_EV:
262 fprintf(stderr, "UHID_OUTPUT_EV from uhid-dev\n");
263 break;
264 default:
265 fprintf(stderr, "Invalid event from uhid-dev: %u\n", ev.type);
266 }
267
268 return 0;
269}
270
271static bool btn1_down;
272static bool btn2_down;
273static bool btn3_down;
274static signed char abs_hor;
275static signed char abs_ver;
276static signed char wheel;
277
278static int send_event(int fd)
279{
280 struct uhid_event ev;
281
282 memset(&ev, 0, sizeof(ev));
283 ev.type = UHID_INPUT;
284 ev.u.input.size = 5;
285
286 ev.u.input.data[0] = 0x1;
287 if (btn1_down)
288 ev.u.input.data[1] |= 0x1;
289 if (btn2_down)
290 ev.u.input.data[1] |= 0x2;
291 if (btn3_down)
292 ev.u.input.data[1] |= 0x4;
293
294 ev.u.input.data[2] = abs_hor;
295 ev.u.input.data[3] = abs_ver;
296 ev.u.input.data[4] = wheel;
297
298 return uhid_write(fd, &ev);
299}
300
301static int keyboard(int fd)
302{
303 char buf[128];
304 ssize_t ret, i;
305
306 ret = read(STDIN_FILENO, buf, sizeof(buf));
307 if (ret == 0) {
308 fprintf(stderr, "Read HUP on stdin\n");
309 return -EFAULT;
310 } else if (ret < 0) {
311 fprintf(stderr, "Cannot read stdin: %m\n");
312 return -errno;
313 }
314
315 for (i = 0; i < ret; ++i) {
316 switch (buf[i]) {
317 case '1':
318 btn1_down = !btn1_down;
319 ret = send_event(fd);
320 if (ret)
321 return ret;
322 break;
323 case '2':
324 btn2_down = !btn2_down;
325 ret = send_event(fd);
326 if (ret)
327 return ret;
328 break;
329 case '3':
330 btn3_down = !btn3_down;
331 ret = send_event(fd);
332 if (ret)
333 return ret;
334 break;
335 case 'a':
336 abs_hor = -20;
337 ret = send_event(fd);
338 abs_hor = 0;
339 if (ret)
340 return ret;
341 break;
342 case 'd':
343 abs_hor = 20;
344 ret = send_event(fd);
345 abs_hor = 0;
346 if (ret)
347 return ret;
348 break;
349 case 'w':
350 abs_ver = -20;
351 ret = send_event(fd);
352 abs_ver = 0;
353 if (ret)
354 return ret;
355 break;
356 case 's':
357 abs_ver = 20;
358 ret = send_event(fd);
359 abs_ver = 0;
360 if (ret)
361 return ret;
362 break;
363 case 'r':
364 wheel = 1;
365 ret = send_event(fd);
366 wheel = 0;
367 if (ret)
368 return ret;
369 break;
370 case 'f':
371 wheel = -1;
372 ret = send_event(fd);
373 wheel = 0;
374 if (ret)
375 return ret;
376 break;
377 case 'q':
378 return -ECANCELED;
379 default:
380 fprintf(stderr, "Invalid input: %c\n", buf[i]);
381 }
382 }
383
384 return 0;
385}
386
387int main(int argc, char **argv)
388{
389 int fd;
390 const char *path = "/dev/uhid";
391 struct pollfd pfds[2];
392 int ret;
393 struct termios state;
394
395 ret = tcgetattr(STDIN_FILENO, &state);
396 if (ret) {
397 fprintf(stderr, "Cannot get tty state\n");
398 } else {
399 state.c_lflag &= ~ICANON;
400 state.c_cc[VMIN] = 1;
401 ret = tcsetattr(STDIN_FILENO, TCSANOW, &state);
402 if (ret)
403 fprintf(stderr, "Cannot set tty state\n");
404 }
405
406 if (argc >= 2) {
407 if (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
408 fprintf(stderr, "Usage: %s [%s]\n", argv[0], path);
409 return EXIT_SUCCESS;
410 } else {
411 path = argv[1];
412 }
413 }
414
415 fprintf(stderr, "Open uhid-cdev %s\n", path);
416 fd = open(path, O_RDWR | O_CLOEXEC);
417 if (fd < 0) {
418 fprintf(stderr, "Cannot open uhid-cdev %s: %m\n", path);
419 return EXIT_FAILURE;
420 }
421
422 fprintf(stderr, "Create uhid device\n");
423 ret = create(fd);
424 if (ret) {
425 close(fd);
426 return EXIT_FAILURE;
427 }
428
429 pfds[0].fd = STDIN_FILENO;
430 pfds[0].events = POLLIN;
431 pfds[1].fd = fd;
432 pfds[1].events = POLLIN;
433
434 fprintf(stderr, "Press 'q' to quit...\n");
435 while (1) {
436 ret = poll(pfds, 2, -1);
437 if (ret < 0) {
438 fprintf(stderr, "Cannot poll for fds: %m\n");
439 break;
440 }
441 if (pfds[0].revents & POLLHUP) {
442 fprintf(stderr, "Received HUP on stdin\n");
443 break;
444 }
445 if (pfds[1].revents & POLLHUP) {
446 fprintf(stderr, "Received HUP on uhid-cdev\n");
447 break;
448 }
449
450 if (pfds[0].revents & POLLIN) {
451 ret = keyboard(fd);
452 if (ret)
453 break;
454 }
455 if (pfds[1].revents & POLLIN) {
456 ret = event(fd);
457 if (ret)
458 break;
459 }
460 }
461
462 fprintf(stderr, "Destroy uhid device\n");
463 destroy(fd);
464 return EXIT_SUCCESS;
465}
diff --git a/samples/v4l/Makefile b/samples/v4l/Makefile
new file mode 100644
index 000000000..f86ab1245
--- /dev/null
+++ b/samples/v4l/Makefile
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0-only
2obj-$(CONFIG_VIDEO_PCI_SKELETON) := v4l2-pci-skeleton.o
diff --git a/samples/v4l/v4l2-pci-skeleton.c b/samples/v4l/v4l2-pci-skeleton.c
new file mode 100644
index 000000000..3fa6582b4
--- /dev/null
+++ b/samples/v4l/v4l2-pci-skeleton.c
@@ -0,0 +1,915 @@
1/*
2 * This is a V4L2 PCI Skeleton Driver. It gives an initial skeleton source
3 * for use with other PCI drivers.
4 *
5 * This skeleton PCI driver assumes that the card has an S-Video connector as
6 * input 0 and an HDMI connector as input 1.
7 *
8 * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
9 *
10 * This program is free software; you may redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2 of the License.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24#include <linux/types.h>
25#include <linux/kernel.h>
26#include <linux/module.h>
27#include <linux/init.h>
28#include <linux/kmod.h>
29#include <linux/mutex.h>
30#include <linux/pci.h>
31#include <linux/interrupt.h>
32#include <linux/videodev2.h>
33#include <linux/v4l2-dv-timings.h>
34#include <media/v4l2-device.h>
35#include <media/v4l2-dev.h>
36#include <media/v4l2-ioctl.h>
37#include <media/v4l2-dv-timings.h>
38#include <media/v4l2-ctrls.h>
39#include <media/v4l2-event.h>
40#include <media/videobuf2-v4l2.h>
41#include <media/videobuf2-dma-contig.h>
42
43MODULE_DESCRIPTION("V4L2 PCI Skeleton Driver");
44MODULE_AUTHOR("Hans Verkuil");
45MODULE_LICENSE("GPL v2");
46
47/**
48 * struct skeleton - All internal data for one instance of device
49 * @pdev: PCI device
50 * @v4l2_dev: top-level v4l2 device struct
51 * @vdev: video node structure
52 * @ctrl_handler: control handler structure
53 * @lock: ioctl serialization mutex
54 * @std: current SDTV standard
55 * @timings: current HDTV timings
56 * @format: current pix format
57 * @input: current video input (0 = SDTV, 1 = HDTV)
58 * @queue: vb2 video capture queue
59 * @qlock: spinlock controlling access to buf_list and sequence
60 * @buf_list: list of buffers queued for DMA
61 * @field: the field (TOP/BOTTOM/other) of the current buffer
62 * @sequence: frame sequence counter
63 */
64struct skeleton {
65 struct pci_dev *pdev;
66 struct v4l2_device v4l2_dev;
67 struct video_device vdev;
68 struct v4l2_ctrl_handler ctrl_handler;
69 struct mutex lock;
70 v4l2_std_id std;
71 struct v4l2_dv_timings timings;
72 struct v4l2_pix_format format;
73 unsigned input;
74
75 struct vb2_queue queue;
76
77 spinlock_t qlock;
78 struct list_head buf_list;
79 unsigned field;
80 unsigned sequence;
81};
82
83struct skel_buffer {
84 struct vb2_v4l2_buffer vb;
85 struct list_head list;
86};
87
88static inline struct skel_buffer *to_skel_buffer(struct vb2_v4l2_buffer *vbuf)
89{
90 return container_of(vbuf, struct skel_buffer, vb);
91}
92
93static const struct pci_device_id skeleton_pci_tbl[] = {
94 /* { PCI_DEVICE(PCI_VENDOR_ID_, PCI_DEVICE_ID_) }, */
95 { 0, }
96};
97MODULE_DEVICE_TABLE(pci, skeleton_pci_tbl);
98
99/*
100 * HDTV: this structure has the capabilities of the HDTV receiver.
101 * It is used to constrain the huge list of possible formats based
102 * upon the hardware capabilities.
103 */
104static const struct v4l2_dv_timings_cap skel_timings_cap = {
105 .type = V4L2_DV_BT_656_1120,
106 /* keep this initialization for compatibility with GCC < 4.4.6 */
107 .reserved = { 0 },
108 V4L2_INIT_BT_TIMINGS(
109 720, 1920, /* min/max width */
110 480, 1080, /* min/max height */
111		27000000, 74250000,	/* min/max pixelclock */
112 V4L2_DV_BT_STD_CEA861, /* Supported standards */
113 /* capabilities */
114 V4L2_DV_BT_CAP_INTERLACED | V4L2_DV_BT_CAP_PROGRESSIVE
115 )
116};
117
118/*
119 * Supported SDTV standards. This does the same job as skel_timings_cap, but
120 * for standard TV formats.
121 */
122#define SKEL_TVNORMS V4L2_STD_ALL
123
124/*
125 * Interrupt handler: typically interrupts happen after a new frame has been
126 * captured. It is the job of the handler to remove the new frame from the
127 * internal list and give it back to the vb2 framework, updating the sequence
128 * counter, field and timestamp at the same time.
129 */
130static irqreturn_t skeleton_irq(int irq, void *dev_id)
131{
132#ifdef TODO
133 struct skeleton *skel = dev_id;
134
135 /* handle interrupt */
136
137 /* Once a new frame has been captured, mark it as done like this: */
138 if (captured_new_frame) {
139 ...
140 spin_lock(&skel->qlock);
141 list_del(&new_buf->list);
142 spin_unlock(&skel->qlock);
143 new_buf->vb.vb2_buf.timestamp = ktime_get_ns();
144 new_buf->vb.sequence = skel->sequence++;
145 new_buf->vb.field = skel->field;
146 if (skel->format.field == V4L2_FIELD_ALTERNATE) {
147 if (skel->field == V4L2_FIELD_BOTTOM)
148 skel->field = V4L2_FIELD_TOP;
149 else if (skel->field == V4L2_FIELD_TOP)
150 skel->field = V4L2_FIELD_BOTTOM;
151 }
152 vb2_buffer_done(&new_buf->vb.vb2_buf, VB2_BUF_STATE_DONE);
153 }
154#endif
155 return IRQ_HANDLED;
156}
157
158/*
159 * Setup the constraints of the queue: besides setting the number of planes
160 * per buffer and the size and allocation context of each plane, it also
161 * checks if sufficient buffers have been allocated. Usually 3 is a good
162 * minimum number: many DMA engines need a minimum of 2 buffers in the
163 * queue and you need to have another available for userspace processing.
164 */
165static int queue_setup(struct vb2_queue *vq,
166 unsigned int *nbuffers, unsigned int *nplanes,
167 unsigned int sizes[], struct device *alloc_devs[])
168{
169 struct skeleton *skel = vb2_get_drv_priv(vq);
170
171 skel->field = skel->format.field;
172 if (skel->field == V4L2_FIELD_ALTERNATE) {
173 /*
174 * You cannot use read() with FIELD_ALTERNATE since the field
175 * information (TOP/BOTTOM) cannot be passed back to the user.
176 */
177 if (vb2_fileio_is_active(vq))
178 return -EINVAL;
179 skel->field = V4L2_FIELD_TOP;
180 }
181
182 if (vq->num_buffers + *nbuffers < 3)
183 *nbuffers = 3 - vq->num_buffers;
184
185 if (*nplanes)
186 return sizes[0] < skel->format.sizeimage ? -EINVAL : 0;
187 *nplanes = 1;
188 sizes[0] = skel->format.sizeimage;
189 return 0;
190}
191
192/*
193 * Prepare the buffer for queueing to the DMA engine: check and set the
194 * payload size.
195 */
196static int buffer_prepare(struct vb2_buffer *vb)
197{
198 struct skeleton *skel = vb2_get_drv_priv(vb->vb2_queue);
199 unsigned long size = skel->format.sizeimage;
200
201 if (vb2_plane_size(vb, 0) < size) {
202 dev_err(&skel->pdev->dev, "buffer too small (%lu < %lu)\n",
203 vb2_plane_size(vb, 0), size);
204 return -EINVAL;
205 }
206
207 vb2_set_plane_payload(vb, 0, size);
208 return 0;
209}
210
211/*
212 * Queue this buffer to the DMA engine.
213 */
214static void buffer_queue(struct vb2_buffer *vb)
215{
216 struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
217 struct skeleton *skel = vb2_get_drv_priv(vb->vb2_queue);
218 struct skel_buffer *buf = to_skel_buffer(vbuf);
219 unsigned long flags;
220
221 spin_lock_irqsave(&skel->qlock, flags);
222 list_add_tail(&buf->list, &skel->buf_list);
223
224 /* TODO: Update any DMA pointers if necessary */
225
226 spin_unlock_irqrestore(&skel->qlock, flags);
227}
228
229static void return_all_buffers(struct skeleton *skel,
230 enum vb2_buffer_state state)
231{
232 struct skel_buffer *buf, *node;
233 unsigned long flags;
234
235 spin_lock_irqsave(&skel->qlock, flags);
236 list_for_each_entry_safe(buf, node, &skel->buf_list, list) {
237 vb2_buffer_done(&buf->vb.vb2_buf, state);
238 list_del(&buf->list);
239 }
240 spin_unlock_irqrestore(&skel->qlock, flags);
241}
242
243/*
243 * Start streaming. First check if the minimum number of buffers has been
245 * queued. If not, then return -ENOBUFS and the vb2 framework will call
246 * this function again the next time a buffer has been queued until enough
247 * buffers are available to actually start the DMA engine.
248 */
249static int start_streaming(struct vb2_queue *vq, unsigned int count)
250{
251 struct skeleton *skel = vb2_get_drv_priv(vq);
252 int ret = 0;
253
254 skel->sequence = 0;
255
256 /* TODO: start DMA */
257
258 if (ret) {
259 /*
260 * In case of an error, return all active buffers to the
261 * QUEUED state
262 */
263 return_all_buffers(skel, VB2_BUF_STATE_QUEUED);
264 }
265 return ret;
266}
267
268/*
269 * Stop the DMA engine. Any remaining buffers in the DMA queue are dequeued
270 * and passed on to the vb2 framework marked as STATE_ERROR.
271 */
272static void stop_streaming(struct vb2_queue *vq)
273{
274 struct skeleton *skel = vb2_get_drv_priv(vq);
275
276 /* TODO: stop DMA */
277
278 /* Release all active buffers */
279 return_all_buffers(skel, VB2_BUF_STATE_ERROR);
280}
281
282/*
283 * The vb2 queue ops. Note that since q->lock is set we can use the standard
284 * vb2_ops_wait_prepare/finish helper functions. If q->lock were NULL,
285 * then this driver would have to provide these ops.
286 */
287static const struct vb2_ops skel_qops = {
288 .queue_setup = queue_setup,
289 .buf_prepare = buffer_prepare,
290 .buf_queue = buffer_queue,
291 .start_streaming = start_streaming,
292 .stop_streaming = stop_streaming,
293 .wait_prepare = vb2_ops_wait_prepare,
294 .wait_finish = vb2_ops_wait_finish,
295};
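
/*
 * (Sketch for context only: the probe code that creates the queue is
 * expected to wire this up along the lines of
 *
 *	skel->queue.ops = &skel_qops;
 *	skel->queue.lock = &skel->lock;
 *
 * which gives vb2_ops_wait_prepare/finish a mutex to drop and retake.)
 */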
296
297/*
298 * Required ioctl querycap. Note that the version field is prefilled with
299 * the version of the kernel.
300 */
301static int skeleton_querycap(struct file *file, void *priv,
302 struct v4l2_capability *cap)
303{
304 struct skeleton *skel = video_drvdata(file);
305
306 strlcpy(cap->driver, KBUILD_MODNAME, sizeof(cap->driver));
307 strlcpy(cap->card, "V4L2 PCI Skeleton", sizeof(cap->card));
308 snprintf(cap->bus_info, sizeof(cap->bus_info), "PCI:%s",
309 pci_name(skel->pdev));
310 return 0;
311}
312
313/*
314 * Helper function to check and correct struct v4l2_pix_format. It's used
315 * not only in VIDIOC_TRY/S_FMT, but also elsewhere if changes to the SDTV
316 * standard, HDTV timings or the video input would require updating the
317 * current format.
318 */
319static void skeleton_fill_pix_format(struct skeleton *skel,
320 struct v4l2_pix_format *pix)
321{
322 pix->pixelformat = V4L2_PIX_FMT_YUYV;
323 if (skel->input == 0) {
324 /* S-Video input */
325 pix->width = 720;
326 pix->height = (skel->std & V4L2_STD_525_60) ? 480 : 576;
327 pix->field = V4L2_FIELD_INTERLACED;
328 pix->colorspace = V4L2_COLORSPACE_SMPTE170M;
329 } else {
330 /* HDMI input */
331 pix->width = skel->timings.bt.width;
332 pix->height = skel->timings.bt.height;
333 if (skel->timings.bt.interlaced) {
334 pix->field = V4L2_FIELD_ALTERNATE;
335 pix->height /= 2;
336 } else {
337 pix->field = V4L2_FIELD_NONE;
338 }
339 pix->colorspace = V4L2_COLORSPACE_REC709;
340 }
341
342 /*
343 * The YUYV format is four bytes for every two pixels, so bytesperline
344 * is width * 2.
345 */
346 pix->bytesperline = pix->width * 2;
347 pix->sizeimage = pix->bytesperline * pix->height;
348 pix->priv = 0;
349}
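
/*
 * For example, the 625-line SDTV case above yields bytesperline =
 * 720 * 2 = 1440 and sizeimage = 1440 * 576 = 829440 bytes.
 */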
350
351static int skeleton_try_fmt_vid_cap(struct file *file, void *priv,
352 struct v4l2_format *f)
353{
354 struct skeleton *skel = video_drvdata(file);
355 struct v4l2_pix_format *pix = &f->fmt.pix;
356
357 /*
358 * Due to historical reasons providing try_fmt with an unsupported
359 * pixelformat will return -EINVAL for video receivers. Webcam drivers,
360 * however, will silently correct the pixelformat. Some video capture
361 * applications rely on this behavior...
362 */
363 if (pix->pixelformat != V4L2_PIX_FMT_YUYV)
364 return -EINVAL;
365 skeleton_fill_pix_format(skel, pix);
366 return 0;
367}
368
369static int skeleton_s_fmt_vid_cap(struct file *file, void *priv,
370 struct v4l2_format *f)
371{
372 struct skeleton *skel = video_drvdata(file);
373 int ret;
374
375 ret = skeleton_try_fmt_vid_cap(file, priv, f);
376 if (ret)
377 return ret;
378
379 /*
380 * It is not allowed to change the format while buffers for use with
381 * streaming have already been allocated.
382 */
383 if (vb2_is_busy(&skel->queue))
384 return -EBUSY;
385
386 /* TODO: change format */
387 skel->format = f->fmt.pix;
388 return 0;
389}
390
391static int skeleton_g_fmt_vid_cap(struct file *file, void *priv,
392 struct v4l2_format *f)
393{
394 struct skeleton *skel = video_drvdata(file);
395
396 f->fmt.pix = skel->format;
397 return 0;
398}
399
400static int skeleton_enum_fmt_vid_cap(struct file *file, void *priv,
401 struct v4l2_fmtdesc *f)
402{
403 if (f->index != 0)
404 return -EINVAL;
405
406 f->pixelformat = V4L2_PIX_FMT_YUYV;
407 return 0;
408}
409
410static int skeleton_s_std(struct file *file, void *priv, v4l2_std_id std)
411{
412 struct skeleton *skel = video_drvdata(file);
413
414 /* S_STD is not supported on the HDMI input */
415 if (skel->input)
416 return -ENODATA;
417
418 /*
419 * No change, so just return. Some applications call S_STD again after
420 * the buffers for streaming have been set up, so we have to allow for
421 * this behavior.
422 */
423 if (std == skel->std)
424 return 0;
425
426 /*
427 * Changing the standard implies a format change, which is not allowed
428 * while buffers for use with streaming have already been allocated.
429 */
430 if (vb2_is_busy(&skel->queue))
431 return -EBUSY;
432
433 /* TODO: handle changing std */
434
435 skel->std = std;
436
437 /* Update the internal format */
438 skeleton_fill_pix_format(skel, &skel->format);
439 return 0;
440}
441
442static int skeleton_g_std(struct file *file, void *priv, v4l2_std_id *std)
443{
444 struct skeleton *skel = video_drvdata(file);
445
446 /* G_STD is not supported on the HDMI input */
447 if (skel->input)
448 return -ENODATA;
449
450 *std = skel->std;
451 return 0;
452}
453
454/*
455 * Query the current standard as seen by the hardware. This function shall
456 * never actually change the standard, it just detects and reports.
457 * The framework will initially set *std to tvnorms (i.e. the set of
458 * standards supported by this input), and this function should just AND
459 * this value. If there is no signal, then *std should be set to 0.
460 */
461static int skeleton_querystd(struct file *file, void *priv, v4l2_std_id *std)
462{
463 struct skeleton *skel = video_drvdata(file);
464
465 /* QUERY_STD is not supported on the HDMI input */
466 if (skel->input)
467 return -ENODATA;
468
469#ifdef TODO
470 /*
471 * Query currently seen standard. Initial value of *std is
472 * V4L2_STD_ALL. This function should look something like this:
473 */
474 get_signal_info();
475 if (no_signal) {
476 *std = 0;
477 return 0;
478 }
479 /* Use signal information to reduce the number of possible standards */
480 if (signal_has_525_lines)
481 *std &= V4L2_STD_525_60;
482 else
483 *std &= V4L2_STD_625_50;
484#endif
485 return 0;
486}
487
488static int skeleton_s_dv_timings(struct file *file, void *_fh,
489 struct v4l2_dv_timings *timings)
490{
491 struct skeleton *skel = video_drvdata(file);
492
493 /* S_DV_TIMINGS is not supported on the S-Video input */
494 if (skel->input == 0)
495 return -ENODATA;
496
497 /* Quick sanity check */
498 if (!v4l2_valid_dv_timings(timings, &skel_timings_cap, NULL, NULL))
499 return -EINVAL;
500
501 /* Check if the timings are part of the CEA-861 timings. */
502 if (!v4l2_find_dv_timings_cap(timings, &skel_timings_cap,
503 0, NULL, NULL))
504 return -EINVAL;
505
506 /* Return 0 if the new timings are the same as the current timings. */
507 if (v4l2_match_dv_timings(timings, &skel->timings, 0, false))
508 return 0;
509
510 /*
511 * Changing the timings implies a format change, which is not allowed
512 * while buffers for use with streaming have already been allocated.
513 */
514 if (vb2_is_busy(&skel->queue))
515 return -EBUSY;
516
517 /* TODO: Configure new timings */
518
519 /* Save timings */
520 skel->timings = *timings;
521
522 /* Update the internal format */
523 skeleton_fill_pix_format(skel, &skel->format);
524 return 0;
525}
526
527static int skeleton_g_dv_timings(struct file *file, void *_fh,
528 struct v4l2_dv_timings *timings)
529{
530 struct skeleton *skel = video_drvdata(file);
531
532 /* G_DV_TIMINGS is not supported on the S-Video input */
533 if (skel->input == 0)
534 return -ENODATA;
535
536 *timings = skel->timings;
537 return 0;
538}
539
540static int skeleton_enum_dv_timings(struct file *file, void *_fh,
541 struct v4l2_enum_dv_timings *timings)
542{
543 struct skeleton *skel = video_drvdata(file);
544
545 /* ENUM_DV_TIMINGS is not supported on the S-Video input */
546 if (skel->input == 0)
547 return -ENODATA;
548
549 return v4l2_enum_dv_timings_cap(timings, &skel_timings_cap,
550 NULL, NULL);
551}
552
553/*
554 * Query the current timings as seen by the hardware. This function shall
555 * never actually change the timings; it just detects and reports.
556 * If no signal is detected, then return -ENOLINK. If the hardware cannot
557 * lock to the signal, then return -ENOLCK. If the signal is out of range
558 * of the capabilities of the system (e.g., it is possible that the receiver
559 * can lock but that the DMA engine it is connected to cannot handle
560 * pixelclocks above a certain frequency), then -ERANGE is returned.
561 */
562static int skeleton_query_dv_timings(struct file *file, void *_fh,
563 struct v4l2_dv_timings *timings)
564{
565 struct skeleton *skel = video_drvdata(file);
566
567 /* QUERY_DV_TIMINGS is not supported on the S-Video input */
568 if (skel->input == 0)
569 return -ENODATA;
570
571#ifdef TODO
572 /*
573 * Query currently seen timings. This function should look
574 * something like this:
575 */
576 detect_timings();
577 if (no_signal)
578 return -ENOLINK;
579 if (cannot_lock_to_signal)
580 return -ENOLCK;
581 if (signal_out_of_range_of_capabilities)
582 return -ERANGE;
583
584 /* Useful for debugging */
585 v4l2_print_dv_timings(skel->v4l2_dev.name, "query_dv_timings:",
586 timings, true);
587#endif
588 return 0;
589}
590
591static int skeleton_dv_timings_cap(struct file *file, void *fh,
592 struct v4l2_dv_timings_cap *cap)
593{
594 struct skeleton *skel = video_drvdata(file);
595
596 /* DV_TIMINGS_CAP is not supported on the S-Video input */
597 if (skel->input == 0)
598 return -ENODATA;
599 *cap = skel_timings_cap;
600 return 0;
601}
602
603static int skeleton_enum_input(struct file *file, void *priv,
604 struct v4l2_input *i)
605{
606 if (i->index > 1)
607 return -EINVAL;
608
609 i->type = V4L2_INPUT_TYPE_CAMERA;
610 if (i->index == 0) {
611 i->std = SKEL_TVNORMS;
612 strlcpy(i->name, "S-Video", sizeof(i->name));
613 i->capabilities = V4L2_IN_CAP_STD;
614 } else {
615 i->std = 0;
616 strlcpy(i->name, "HDMI", sizeof(i->name));
617 i->capabilities = V4L2_IN_CAP_DV_TIMINGS;
618 }
619 return 0;
620}
621
622static int skeleton_s_input(struct file *file, void *priv, unsigned int i)
623{
624 struct skeleton *skel = video_drvdata(file);
625
626 if (i > 1)
627 return -EINVAL;
628
629 /*
630 * Changing the input implies a format change, which is not allowed
631 * while buffers for use with streaming have already been allocated.
632 */
633 if (vb2_is_busy(&skel->queue))
634 return -EBUSY;
635
636 skel->input = i;
637 /*
638 * Update tvnorms. The tvnorms value is used by the core to implement
639 * VIDIOC_ENUMSTD so it has to be correct. If tvnorms == 0, then
640 * ENUMSTD will return -ENODATA.
641 */
642 skel->vdev.tvnorms = i ? 0 : SKEL_TVNORMS;
643
644 /* Update the internal format */
645 skeleton_fill_pix_format(skel, &skel->format);
646 return 0;
647}
648
649static int skeleton_g_input(struct file *file, void *priv, unsigned int *i)
650{
651 struct skeleton *skel = video_drvdata(file);
652
653 *i = skel->input;
654 return 0;
655}
656
657/* The control handler. */
658static int skeleton_s_ctrl(struct v4l2_ctrl *ctrl)
659{
660 /*struct skeleton *skel =
661 container_of(ctrl->handler, struct skeleton, ctrl_handler);*/
662
663 switch (ctrl->id) {
664 case V4L2_CID_BRIGHTNESS:
665 /* TODO: set brightness to ctrl->val */
666 break;
667 case V4L2_CID_CONTRAST:
668 /* TODO: set contrast to ctrl->val */
669 break;
670 case V4L2_CID_SATURATION:
671 /* TODO: set saturation to ctrl->val */
672 break;
673 case V4L2_CID_HUE:
674 /* TODO: set hue to ctrl->val */
675 break;
676 default:
677 return -EINVAL;
678 }
679 return 0;
680}
681
682/* ------------------------------------------------------------------
683 File operations for the device
684 ------------------------------------------------------------------*/
685
686static const struct v4l2_ctrl_ops skel_ctrl_ops = {
687 .s_ctrl = skeleton_s_ctrl,
688};
689
690/*
691 * The set of all supported ioctls. Note that all the streaming ioctls
692 * use the vb2 helper functions that take care of all the locking and
693 * that also do ownership tracking (i.e. only the filehandle that requested
694 * the buffers can call the streaming ioctls, all other filehandles will
695 * receive -EBUSY if they attempt to call the same streaming ioctls).
696 *
697 * The last three ioctls also use standard helper functions: these implement
698 * standard behavior for drivers with controls.
699 */
700static const struct v4l2_ioctl_ops skel_ioctl_ops = {
701 .vidioc_querycap = skeleton_querycap,
702 .vidioc_try_fmt_vid_cap = skeleton_try_fmt_vid_cap,
703 .vidioc_s_fmt_vid_cap = skeleton_s_fmt_vid_cap,
704 .vidioc_g_fmt_vid_cap = skeleton_g_fmt_vid_cap,
705 .vidioc_enum_fmt_vid_cap = skeleton_enum_fmt_vid_cap,
706
707 .vidioc_g_std = skeleton_g_std,
708 .vidioc_s_std = skeleton_s_std,
709 .vidioc_querystd = skeleton_querystd,
710
711 .vidioc_s_dv_timings = skeleton_s_dv_timings,
712 .vidioc_g_dv_timings = skeleton_g_dv_timings,
713 .vidioc_enum_dv_timings = skeleton_enum_dv_timings,
714 .vidioc_query_dv_timings = skeleton_query_dv_timings,
715 .vidioc_dv_timings_cap = skeleton_dv_timings_cap,
716
717 .vidioc_enum_input = skeleton_enum_input,
718 .vidioc_g_input = skeleton_g_input,
719 .vidioc_s_input = skeleton_s_input,
720
721 .vidioc_reqbufs = vb2_ioctl_reqbufs,
722 .vidioc_create_bufs = vb2_ioctl_create_bufs,
723 .vidioc_querybuf = vb2_ioctl_querybuf,
724 .vidioc_qbuf = vb2_ioctl_qbuf,
725 .vidioc_dqbuf = vb2_ioctl_dqbuf,
726 .vidioc_expbuf = vb2_ioctl_expbuf,
727 .vidioc_streamon = vb2_ioctl_streamon,
728 .vidioc_streamoff = vb2_ioctl_streamoff,
729
730 .vidioc_log_status = v4l2_ctrl_log_status,
731 .vidioc_subscribe_event = v4l2_ctrl_subscribe_event,
732 .vidioc_unsubscribe_event = v4l2_event_unsubscribe,
733};
734
735/*
736 * The set of file operations. Note that all these ops are standard core
737 * helper functions.
738 */
739static const struct v4l2_file_operations skel_fops = {
740 .owner = THIS_MODULE,
741 .open = v4l2_fh_open,
742 .release = vb2_fop_release,
743 .unlocked_ioctl = video_ioctl2,
744 .read = vb2_fop_read,
745 .mmap = vb2_fop_mmap,
746 .poll = vb2_fop_poll,
747};
748
749/*
750 * The initial setup of this device instance. Note that the initial state of
751 * the driver must be complete, so the initial format, standard, timings
752 * and video input should all be initialized to reasonable values.
753 */
754static int skeleton_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
755{
756 /* The initial timings are chosen to be 720p60. */
757 static const struct v4l2_dv_timings timings_def =
758 V4L2_DV_BT_CEA_1280X720P60;
759 struct skeleton *skel;
760 struct video_device *vdev;
761 struct v4l2_ctrl_handler *hdl;
762 struct vb2_queue *q;
763 int ret;
764
765 /* Enable PCI */
766 ret = pci_enable_device(pdev);
767 if (ret)
768 return ret;
769 ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
770 if (ret) {
771 dev_err(&pdev->dev, "no suitable DMA available.\n");
772 goto disable_pci;
773 }
774
775 /* Allocate a new instance */
776 skel = devm_kzalloc(&pdev->dev, sizeof(struct skeleton), GFP_KERNEL);
777 if (!skel) {
778 ret = -ENOMEM;
779 goto disable_pci;
780 }
781
782 /* Allocate the interrupt */
783 ret = devm_request_irq(&pdev->dev, pdev->irq,
784 skeleton_irq, 0, KBUILD_MODNAME, skel);
785 if (ret) {
786 dev_err(&pdev->dev, "request_irq failed\n");
787 goto disable_pci;
788 }
789 skel->pdev = pdev;
790
791 /* Fill in the initial format-related settings */
792 skel->timings = timings_def;
793 skel->std = V4L2_STD_625_50;
794 skeleton_fill_pix_format(skel, &skel->format);
795
796 /* Initialize the top-level structure */
797 ret = v4l2_device_register(&pdev->dev, &skel->v4l2_dev);
798 if (ret)
799 goto disable_pci;
800
801 mutex_init(&skel->lock);
802
803 /* Add the controls */
804 hdl = &skel->ctrl_handler;
805 v4l2_ctrl_handler_init(hdl, 4);
806 v4l2_ctrl_new_std(hdl, &skel_ctrl_ops,
807 V4L2_CID_BRIGHTNESS, 0, 255, 1, 127);
808 v4l2_ctrl_new_std(hdl, &skel_ctrl_ops,
809 V4L2_CID_CONTRAST, 0, 255, 1, 16);
810 v4l2_ctrl_new_std(hdl, &skel_ctrl_ops,
811 V4L2_CID_SATURATION, 0, 255, 1, 127);
812 v4l2_ctrl_new_std(hdl, &skel_ctrl_ops,
813 V4L2_CID_HUE, -128, 127, 1, 0);
814 if (hdl->error) {
815 ret = hdl->error;
816 goto free_hdl;
817 }
818 skel->v4l2_dev.ctrl_handler = hdl;
819
820 /* Initialize the vb2 queue */
821 q = &skel->queue;
822 q->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
823 q->io_modes = VB2_MMAP | VB2_DMABUF | VB2_READ;
824 q->dev = &pdev->dev;
825 q->drv_priv = skel;
826 q->buf_struct_size = sizeof(struct skel_buffer);
827 q->ops = &skel_qops;
828 q->mem_ops = &vb2_dma_contig_memops;
829 q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
830 /*
831 * Assume that this DMA engine needs to have at least two buffers
832 * available before it can be started. The start_streaming() op
833 * won't be called until at least this many buffers are queued up.
834 */
835 q->min_buffers_needed = 2;
836 /*
837 * The serialization lock for the streaming ioctls. This is the same
838 * as the main serialization lock, but if some of the non-streaming
839 * ioctls could take a long time to execute, then you might want to
840 * have a different lock here to prevent VIDIOC_DQBUF from being
841 * blocked while waiting for another action to finish. This is
842 * generally not needed for PCI devices, but USB devices usually do
843 * want a separate lock here.
844 */
845 q->lock = &skel->lock;
846 /*
847 * Since this driver can only do 32-bit DMA we must make sure that
848 * the vb2 core will allocate the buffers in 32-bit DMA memory.
849 */
850 q->gfp_flags = GFP_DMA32;
851 ret = vb2_queue_init(q);
852 if (ret)
853 goto free_hdl;
854
855 INIT_LIST_HEAD(&skel->buf_list);
856 spin_lock_init(&skel->qlock);
857
858 /* Initialize the video_device structure */
859 vdev = &skel->vdev;
860 strlcpy(vdev->name, KBUILD_MODNAME, sizeof(vdev->name));
861 /*
862 * There is nothing to clean up, so release is set to an empty release
863 * function. The release callback must be non-NULL.
864 */
865 vdev->release = video_device_release_empty;
866 vdev->fops = &skel_fops;
867 vdev->ioctl_ops = &skel_ioctl_ops;
868 vdev->device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_READWRITE |
869 V4L2_CAP_STREAMING;
870 /*
871 * The main serialization lock. All ioctls are serialized by this
872 * lock. Exception: if q->lock is set, then the streaming ioctls
873 * are serialized by that separate lock.
874 */
875 vdev->lock = &skel->lock;
876 vdev->queue = q;
877 vdev->v4l2_dev = &skel->v4l2_dev;
878 /* Supported SDTV standards, if any */
879 vdev->tvnorms = SKEL_TVNORMS;
880 video_set_drvdata(vdev, skel);
881
882 ret = video_register_device(vdev, VFL_TYPE_VIDEO, -1);
883 if (ret)
884 goto free_hdl;
885
886 dev_info(&pdev->dev, "V4L2 PCI Skeleton Driver loaded\n");
887 return 0;
888
889free_hdl:
890 v4l2_ctrl_handler_free(&skel->ctrl_handler);
891 v4l2_device_unregister(&skel->v4l2_dev);
892disable_pci:
893 pci_disable_device(pdev);
894 return ret;
895}
896
897static void skeleton_remove(struct pci_dev *pdev)
898{
899 struct v4l2_device *v4l2_dev = pci_get_drvdata(pdev);
900 struct skeleton *skel = container_of(v4l2_dev, struct skeleton, v4l2_dev);
901
902 video_unregister_device(&skel->vdev);
903 v4l2_ctrl_handler_free(&skel->ctrl_handler);
904 v4l2_device_unregister(&skel->v4l2_dev);
905 pci_disable_device(skel->pdev);
906}
907
908static struct pci_driver skeleton_driver = {
909 .name = KBUILD_MODNAME,
910 .probe = skeleton_probe,
911 .remove = skeleton_remove,
912 .id_table = skeleton_pci_tbl,
913};
914
915module_pci_driver(skeleton_driver);
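
/*
 * Illustrative smoke test once the driver is bound; the device node
 * number is hypothetical and depends on what else is registered:
 *
 *   v4l2-ctl -d /dev/video0 --list-formats
 *   v4l2-ctl -d /dev/video0 --set-input=1
 *   v4l2-ctl -d /dev/video0 --stream-mmap --stream-count=100
 */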
diff --git a/samples/vfio-mdev/Makefile b/samples/vfio-mdev/Makefile
new file mode 100644
index 000000000..10d179c4f
--- /dev/null
+++ b/samples/vfio-mdev/Makefile
@@ -0,0 +1,5 @@
1# SPDX-License-Identifier: GPL-2.0-only
2obj-$(CONFIG_SAMPLE_VFIO_MDEV_MTTY) += mtty.o
3obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY) += mdpy.o
4obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY_FB) += mdpy-fb.o
5obj-$(CONFIG_SAMPLE_VFIO_MDEV_MBOCHS) += mbochs.o
diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
new file mode 100644
index 000000000..e03068917
--- /dev/null
+++ b/samples/vfio-mdev/mbochs.c
@@ -0,0 +1,1485 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Mediated virtual PCI display host device driver
4 *
5 * Emulate enough of qemu stdvga to make bochs-drm.ko happy. That is
6 * basically the vram memory bar and the bochs dispi interface vbe
7 * registers in the mmio register bar. Specifically it does *not*
8 * include any legacy vga stuff. Device looks a lot like "qemu -device
9 * secondary-vga".
10 *
11 * (c) Gerd Hoffmann <kraxel@redhat.com>
12 *
13 * based on mtty driver which is:
14 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
15 * Author: Neo Jia <cjia@nvidia.com>
16 * Kirti Wankhede <kwankhede@nvidia.com>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License version 2 as
20 * published by the Free Software Foundation.
21 */
22#include <linux/init.h>
23#include <linux/module.h>
24#include <linux/device.h>
25#include <linux/kernel.h>
26#include <linux/slab.h>
27#include <linux/vmalloc.h>
28#include <linux/cdev.h>
29#include <linux/vfio.h>
30#include <linux/iommu.h>
31#include <linux/sysfs.h>
32#include <linux/mdev.h>
33#include <linux/pci.h>
34#include <linux/dma-buf.h>
35#include <linux/highmem.h>
36#include <drm/drm_fourcc.h>
37#include <drm/drm_rect.h>
38#include <drm/drm_modeset_lock.h>
39#include <drm/drm_property.h>
40#include <drm/drm_plane.h>
41
42
43#define VBE_DISPI_INDEX_ID 0x0
44#define VBE_DISPI_INDEX_XRES 0x1
45#define VBE_DISPI_INDEX_YRES 0x2
46#define VBE_DISPI_INDEX_BPP 0x3
47#define VBE_DISPI_INDEX_ENABLE 0x4
48#define VBE_DISPI_INDEX_BANK 0x5
49#define VBE_DISPI_INDEX_VIRT_WIDTH 0x6
50#define VBE_DISPI_INDEX_VIRT_HEIGHT 0x7
51#define VBE_DISPI_INDEX_X_OFFSET 0x8
52#define VBE_DISPI_INDEX_Y_OFFSET 0x9
53#define VBE_DISPI_INDEX_VIDEO_MEMORY_64K 0xa
54#define VBE_DISPI_INDEX_COUNT 0xb
55
56#define VBE_DISPI_ID0 0xB0C0
57#define VBE_DISPI_ID1 0xB0C1
58#define VBE_DISPI_ID2 0xB0C2
59#define VBE_DISPI_ID3 0xB0C3
60#define VBE_DISPI_ID4 0xB0C4
61#define VBE_DISPI_ID5 0xB0C5
62
63#define VBE_DISPI_DISABLED 0x00
64#define VBE_DISPI_ENABLED 0x01
65#define VBE_DISPI_GETCAPS 0x02
66#define VBE_DISPI_8BIT_DAC 0x20
67#define VBE_DISPI_LFB_ENABLED 0x40
68#define VBE_DISPI_NOCLEARMEM 0x80
69
70
71#define MBOCHS_NAME "mbochs"
72#define MBOCHS_CLASS_NAME "mbochs"
73
74#define MBOCHS_EDID_REGION_INDEX VFIO_PCI_NUM_REGIONS
75#define MBOCHS_NUM_REGIONS (MBOCHS_EDID_REGION_INDEX+1)
76
77#define MBOCHS_CONFIG_SPACE_SIZE 0xff
78#define MBOCHS_MMIO_BAR_OFFSET PAGE_SIZE
79#define MBOCHS_MMIO_BAR_SIZE PAGE_SIZE
80#define MBOCHS_EDID_OFFSET (MBOCHS_MMIO_BAR_OFFSET + \
81 MBOCHS_MMIO_BAR_SIZE)
82#define MBOCHS_EDID_SIZE PAGE_SIZE
83#define MBOCHS_MEMORY_BAR_OFFSET (MBOCHS_EDID_OFFSET + \
84 MBOCHS_EDID_SIZE)
85
86#define MBOCHS_EDID_BLOB_OFFSET (MBOCHS_EDID_SIZE/2)
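
/*
 * Resulting offset layout dispatched by mdev_access():
 *
 *   0x00 .. 0xfe                                virtual pci config space
 *   MBOCHS_MMIO_BAR_OFFSET     (one page)       bochs dispi mmio bar
 *   MBOCHS_EDID_OFFSET         (half a page)    edid registers
 *    + MBOCHS_EDID_BLOB_OFFSET (half a page)    edid blob
 *   MBOCHS_MEMORY_BAR_OFFSET   (memsize bytes)  vram
 */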
87
88#define STORE_LE16(addr, val) (*(u16 *)addr = val)
89#define STORE_LE32(addr, val) (*(u32 *)addr = val)
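
/*
 * Note: these stores write in host byte order; they produce the
 * little-endian layout that pci config space requires only on a
 * little-endian host. A portable version would use cpu_to_le16() and
 * cpu_to_le32() instead.
 */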
90
91
92MODULE_LICENSE("GPL v2");
93
94static int max_mbytes = 256;
95module_param_named(count, max_mbytes, int, 0444);
96MODULE_PARM_DESC(count, "megabytes available to " MBOCHS_NAME " devices");
97
98
99#define MBOCHS_TYPE_1 "small"
100#define MBOCHS_TYPE_2 "medium"
101#define MBOCHS_TYPE_3 "large"
102
103static const struct mbochs_type {
104 const char *name;
105 u32 mbytes;
106 u32 max_x;
107 u32 max_y;
108} mbochs_types[] = {
109 {
110 .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1,
111 .mbytes = 4,
112 .max_x = 800,
113 .max_y = 600,
114 }, {
115 .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2,
116 .mbytes = 16,
117 .max_x = 1920,
118 .max_y = 1440,
119 }, {
120 .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3,
121 .mbytes = 64,
122 .max_x = 0,
123 .max_y = 0,
124 },
125};
126
127
128static dev_t mbochs_devt;
129static struct class *mbochs_class;
130static struct cdev mbochs_cdev;
131static struct device mbochs_dev;
132static int mbochs_used_mbytes;
133
134struct vfio_region_info_ext {
135 struct vfio_region_info base;
136 struct vfio_region_info_cap_type type;
137};
138
139struct mbochs_mode {
140 u32 drm_format;
141 u32 bytepp;
142 u32 width;
143 u32 height;
144 u32 stride;
145 u32 __pad;
146 u64 offset;
147 u64 size;
148};
149
150struct mbochs_dmabuf {
151 struct mbochs_mode mode;
152 u32 id;
153 struct page **pages;
154 pgoff_t pagecount;
155 struct dma_buf *buf;
156 struct mdev_state *mdev_state;
157 struct list_head next;
158 bool unlinked;
159};
160
161/* State of each mdev device */
162struct mdev_state {
163 u8 *vconfig;
164 u64 bar_mask[3];
165 u32 memory_bar_mask;
166 struct mutex ops_lock;
167 struct mdev_device *mdev;
168
169 const struct mbochs_type *type;
170 u16 vbe[VBE_DISPI_INDEX_COUNT];
171 u64 memsize;
172 struct page **pages;
173 pgoff_t pagecount;
174 struct vfio_region_gfx_edid edid_regs;
175 u8 edid_blob[0x400];
176
177 struct list_head dmabufs;
178 u32 active_id;
179 u32 next_id;
180};
181
182static const char *vbe_name_list[VBE_DISPI_INDEX_COUNT] = {
183 [VBE_DISPI_INDEX_ID] = "id",
184 [VBE_DISPI_INDEX_XRES] = "xres",
185 [VBE_DISPI_INDEX_YRES] = "yres",
186 [VBE_DISPI_INDEX_BPP] = "bpp",
187 [VBE_DISPI_INDEX_ENABLE] = "enable",
188 [VBE_DISPI_INDEX_BANK] = "bank",
189 [VBE_DISPI_INDEX_VIRT_WIDTH] = "virt-width",
190 [VBE_DISPI_INDEX_VIRT_HEIGHT] = "virt-height",
191 [VBE_DISPI_INDEX_X_OFFSET] = "x-offset",
192 [VBE_DISPI_INDEX_Y_OFFSET] = "y-offset",
193 [VBE_DISPI_INDEX_VIDEO_MEMORY_64K] = "video-mem",
194};
195
196static const char *vbe_name(u32 index)
197{
198 if (index < ARRAY_SIZE(vbe_name_list))
199 return vbe_name_list[index];
200 return "(invalid)";
201}
202
203static struct page *__mbochs_get_page(struct mdev_state *mdev_state,
204 pgoff_t pgoff);
205static struct page *mbochs_get_page(struct mdev_state *mdev_state,
206 pgoff_t pgoff);
207
208static const struct mbochs_type *mbochs_find_type(struct kobject *kobj)
209{
210 int i;
211
212 for (i = 0; i < ARRAY_SIZE(mbochs_types); i++)
213 if (strcmp(mbochs_types[i].name, kobj->name) == 0)
214 return mbochs_types + i;
215 return NULL;
216}
217
218static void mbochs_create_config_space(struct mdev_state *mdev_state)
219{
220 STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID],
221 0x1234);
222 STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID],
223 0x1111);
224 STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID],
225 PCI_SUBVENDOR_ID_REDHAT_QUMRANET);
226 STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID],
227 PCI_SUBDEVICE_ID_QEMU);
228
229 STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND],
230 PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
231 STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE],
232 PCI_CLASS_DISPLAY_OTHER);
233 mdev_state->vconfig[PCI_CLASS_REVISION] = 0x01;
234
235 STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0],
236 PCI_BASE_ADDRESS_SPACE_MEMORY |
237 PCI_BASE_ADDRESS_MEM_TYPE_32 |
238 PCI_BASE_ADDRESS_MEM_PREFETCH);
239 mdev_state->bar_mask[0] = ~(mdev_state->memsize) + 1;
240
241 STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_2],
242 PCI_BASE_ADDRESS_SPACE_MEMORY |
243 PCI_BASE_ADDRESS_MEM_TYPE_32);
244 mdev_state->bar_mask[2] = ~(MBOCHS_MMIO_BAR_SIZE) + 1;
245}
246
247static int mbochs_check_framebuffer(struct mdev_state *mdev_state,
248 struct mbochs_mode *mode)
249{
250 struct device *dev = mdev_dev(mdev_state->mdev);
251 u16 *vbe = mdev_state->vbe;
252 u32 virt_width;
253
254 WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
255
256 if (!(vbe[VBE_DISPI_INDEX_ENABLE] & VBE_DISPI_ENABLED))
257 goto nofb;
258
259 memset(mode, 0, sizeof(*mode));
260 switch (vbe[VBE_DISPI_INDEX_BPP]) {
261 case 32:
262 mode->drm_format = DRM_FORMAT_XRGB8888;
263 mode->bytepp = 4;
264 break;
265 default:
266 dev_info_ratelimited(dev, "%s: bpp %d not supported\n",
267 __func__, vbe[VBE_DISPI_INDEX_BPP]);
268 goto nofb;
269 }
270
271 mode->width = vbe[VBE_DISPI_INDEX_XRES];
272 mode->height = vbe[VBE_DISPI_INDEX_YRES];
273 virt_width = vbe[VBE_DISPI_INDEX_VIRT_WIDTH];
274 if (virt_width < mode->width)
275 virt_width = mode->width;
276 mode->stride = virt_width * mode->bytepp;
277 mode->size = (u64)mode->stride * mode->height;
278 mode->offset = ((u64)vbe[VBE_DISPI_INDEX_X_OFFSET] * mode->bytepp +
279 (u64)vbe[VBE_DISPI_INDEX_Y_OFFSET] * mode->stride);
280
281 if (mode->width < 64 || mode->height < 64) {
282 dev_info_ratelimited(dev, "%s: invalid resolution %dx%d\n",
283 __func__, mode->width, mode->height);
284 goto nofb;
285 }
286 if (mode->offset + mode->size > mdev_state->memsize) {
287 dev_info_ratelimited(dev, "%s: framebuffer memory overflow\n",
288 __func__);
289 goto nofb;
290 }
291
292 return 0;
293
294nofb:
295 memset(mode, 0, sizeof(*mode));
296 return -EINVAL;
297}
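
/*
 * Worked example with hypothetical register values: xres=1024, yres=768,
 * bpp=32, virt-width=1024, x-offset=0, y-offset=768 gives
 *
 *   stride = 1024 * 4    = 4096 bytes
 *   size   = 4096 * 768  = 3 MB
 *   offset = 768 * 4096  = 3 MB
 *
 * i.e. the second half of a 6 MB double-buffered vram layout (which
 * needs at least the 16 MB "medium" type to fit).
 */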
298
299static bool mbochs_modes_equal(struct mbochs_mode *mode1,
300 struct mbochs_mode *mode2)
301{
302 return memcmp(mode1, mode2, sizeof(struct mbochs_mode)) == 0;
303}
304
305static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset,
306 char *buf, u32 count)
307{
308 struct device *dev = mdev_dev(mdev_state->mdev);
309 int index = (offset - PCI_BASE_ADDRESS_0) / 0x04;
310 u32 cfg_addr;
311
312 switch (offset) {
313 case PCI_BASE_ADDRESS_0:
314 case PCI_BASE_ADDRESS_2:
315 cfg_addr = *(u32 *)buf;
316
317 if (cfg_addr == 0xffffffff) {
318 cfg_addr = (cfg_addr & mdev_state->bar_mask[index]);
319 } else {
320 cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK;
321 if (cfg_addr)
322 dev_info(dev, "BAR #%d @ 0x%x\n",
323 index, cfg_addr);
324 }
325
326 cfg_addr |= (mdev_state->vconfig[offset] &
327 ~PCI_BASE_ADDRESS_MEM_MASK);
328 STORE_LE32(&mdev_state->vconfig[offset], cfg_addr);
329 break;
330 }
331}
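
/*
 * The 0xffffffff case implements the standard pci bar sizing handshake:
 * the guest writes all-ones to the bar, reads the value back and derives
 * the bar size from the bits that remained zero. bar_mask[] holds
 * ~(size) + 1 as set up in mbochs_create_config_space().
 */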
332
333static void handle_mmio_write(struct mdev_state *mdev_state, u16 offset,
334 char *buf, u32 count)
335{
336 struct device *dev = mdev_dev(mdev_state->mdev);
337 int index;
338 u16 reg16;
339
340 switch (offset) {
341 case 0x400 ... 0x41f: /* vga ioports remapped */
342 goto unhandled;
343 case 0x500 ... 0x515: /* bochs dispi interface */
344 if (count != 2)
345 goto unhandled;
346 index = (offset - 0x500) / 2;
347 reg16 = *(u16 *)buf;
348 if (index < ARRAY_SIZE(mdev_state->vbe))
349 mdev_state->vbe[index] = reg16;
350 dev_dbg(dev, "%s: vbe write %d = %d (%s)\n",
351 __func__, index, reg16, vbe_name(index));
352 break;
353 case 0x600 ... 0x607: /* qemu extended regs */
354 goto unhandled;
355 default:
356unhandled:
357 dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n",
358 __func__, offset, count);
359 break;
360 }
361}
362
363static void handle_mmio_read(struct mdev_state *mdev_state, u16 offset,
364 char *buf, u32 count)
365{
366 struct device *dev = mdev_dev(mdev_state->mdev);
367 struct vfio_region_gfx_edid *edid;
368 u16 reg16 = 0;
369 int index;
370
371 switch (offset) {
372 case 0x000 ... 0x3ff: /* edid block */
373 edid = &mdev_state->edid_regs;
374 if (edid->link_state != VFIO_DEVICE_GFX_LINK_STATE_UP ||
375 offset >= edid->edid_size) {
376 memset(buf, 0, count);
377 break;
378 }
379 memcpy(buf, mdev_state->edid_blob + offset, count);
380 break;
381 case 0x500 ... 0x515: /* bochs dispi interface */
382 if (count != 2)
383 goto unhandled;
384 index = (offset - 0x500) / 2;
385 if (index < ARRAY_SIZE(mdev_state->vbe))
386 reg16 = mdev_state->vbe[index];
387 dev_dbg(dev, "%s: vbe read %d = %d (%s)\n",
388 __func__, index, reg16, vbe_name(index));
389 *(u16 *)buf = reg16;
390 break;
391 default:
392unhandled:
393 dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n",
394 __func__, offset, count);
395 memset(buf, 0, count);
396 break;
397 }
398}
399
400static void handle_edid_regs(struct mdev_state *mdev_state, u16 offset,
401 char *buf, u32 count, bool is_write)
402{
403 char *regs = (void *)&mdev_state->edid_regs;
404
405 if (offset + count > sizeof(mdev_state->edid_regs))
406 return;
407 if (count != 4)
408 return;
409 if (offset % 4)
410 return;
411
412 if (is_write) {
413 switch (offset) {
414 case offsetof(struct vfio_region_gfx_edid, link_state):
415 case offsetof(struct vfio_region_gfx_edid, edid_size):
416 memcpy(regs + offset, buf, count);
417 break;
418 default:
419 /* read-only regs */
420 break;
421 }
422 } else {
423 memcpy(buf, regs + offset, count);
424 }
425}
426
427static void handle_edid_blob(struct mdev_state *mdev_state, u16 offset,
428 char *buf, u32 count, bool is_write)
429{
430 if (offset + count > mdev_state->edid_regs.edid_max_size)
431 return;
432 if (is_write)
433 memcpy(mdev_state->edid_blob + offset, buf, count);
434 else
435 memcpy(buf, mdev_state->edid_blob + offset, count);
436}
437
438static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count,
439 loff_t pos, bool is_write)
440{
441 struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
442 struct device *dev = mdev_dev(mdev);
443 struct page *pg;
444 loff_t poff;
445 char *map;
446 int ret = 0;
447
448 mutex_lock(&mdev_state->ops_lock);
449
450 if (pos < MBOCHS_CONFIG_SPACE_SIZE) {
451 if (is_write)
452 handle_pci_cfg_write(mdev_state, pos, buf, count);
453 else
454 memcpy(buf, (mdev_state->vconfig + pos), count);
455
456 } else if (pos >= MBOCHS_MMIO_BAR_OFFSET &&
457 pos + count <= (MBOCHS_MMIO_BAR_OFFSET +
458 MBOCHS_MMIO_BAR_SIZE)) {
459 pos -= MBOCHS_MMIO_BAR_OFFSET;
460 if (is_write)
461 handle_mmio_write(mdev_state, pos, buf, count);
462 else
463 handle_mmio_read(mdev_state, pos, buf, count);
464
465 } else if (pos >= MBOCHS_EDID_OFFSET &&
466 pos + count <= (MBOCHS_EDID_OFFSET +
467 MBOCHS_EDID_SIZE)) {
468 pos -= MBOCHS_EDID_OFFSET;
469 if (pos < MBOCHS_EDID_BLOB_OFFSET) {
470 handle_edid_regs(mdev_state, pos, buf, count, is_write);
471 } else {
472 pos -= MBOCHS_EDID_BLOB_OFFSET;
473 handle_edid_blob(mdev_state, pos, buf, count, is_write);
474 }
475
476 } else if (pos >= MBOCHS_MEMORY_BAR_OFFSET &&
477 pos + count <=
478 MBOCHS_MEMORY_BAR_OFFSET + mdev_state->memsize) {
479 pos -= MBOCHS_MEMORY_BAR_OFFSET;
480 poff = pos & ~PAGE_MASK;
481 pg = __mbochs_get_page(mdev_state, pos >> PAGE_SHIFT);
482 map = kmap(pg);
483 if (is_write)
484 memcpy(map + poff, buf, count);
485 else
486 memcpy(buf, map + poff, count);
487 kunmap(pg);
488 put_page(pg);
489
490 } else {
491 dev_dbg(dev, "%s: %s @0x%llx (unhandled)\n",
492 __func__, is_write ? "WR" : "RD", pos);
493 ret = -1;
494 goto accessfailed;
495 }
496
497 ret = count;
498
499
500accessfailed:
501 mutex_unlock(&mdev_state->ops_lock);
502
503 return ret;
504}
505
506static int mbochs_reset(struct mdev_device *mdev)
507{
508 struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
509 u32 size64k = mdev_state->memsize / (64 * 1024);
510 int i;
511
512 for (i = 0; i < ARRAY_SIZE(mdev_state->vbe); i++)
513 mdev_state->vbe[i] = 0;
514 mdev_state->vbe[VBE_DISPI_INDEX_ID] = VBE_DISPI_ID5;
515 mdev_state->vbe[VBE_DISPI_INDEX_VIDEO_MEMORY_64K] = size64k;
516 return 0;
517}
518
519static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev)
520{
521 const struct mbochs_type *type = mbochs_find_type(kobj);
522 struct device *dev = mdev_dev(mdev);
523 struct mdev_state *mdev_state;
524
525 if (!type)
526 type = &mbochs_types[0];
527 if (type->mbytes + mbochs_used_mbytes > max_mbytes)
528 return -ENOMEM;
529
530 mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL);
531 if (mdev_state == NULL)
532 return -ENOMEM;
533
534 mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL);
535 if (mdev_state->vconfig == NULL)
536 goto err_mem;
537
538 mdev_state->memsize = type->mbytes * 1024 * 1024;
539 mdev_state->pagecount = mdev_state->memsize >> PAGE_SHIFT;
540 mdev_state->pages = kcalloc(mdev_state->pagecount,
541 sizeof(struct page *),
542 GFP_KERNEL);
543 if (!mdev_state->pages)
544 goto err_mem;
545
546 dev_info(dev, "%s: %s, %d MB, %ld pages\n", __func__,
547 kobj->name, type->mbytes, mdev_state->pagecount);
548
549 mutex_init(&mdev_state->ops_lock);
550 mdev_state->mdev = mdev;
551 mdev_set_drvdata(mdev, mdev_state);
552 INIT_LIST_HEAD(&mdev_state->dmabufs);
553 mdev_state->next_id = 1;
554
555 mdev_state->type = type;
556 mdev_state->edid_regs.max_xres = type->max_x;
557 mdev_state->edid_regs.max_yres = type->max_y;
558 mdev_state->edid_regs.edid_offset = MBOCHS_EDID_BLOB_OFFSET;
559 mdev_state->edid_regs.edid_max_size = sizeof(mdev_state->edid_blob);
560 mbochs_create_config_space(mdev_state);
561 mbochs_reset(mdev);
562
563 mbochs_used_mbytes += type->mbytes;
564 return 0;
565
566err_mem:
567 kfree(mdev_state->vconfig);
568 kfree(mdev_state);
569 return -ENOMEM;
570}
571
572static int mbochs_remove(struct mdev_device *mdev)
573{
574 struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
575
576 mbochs_used_mbytes -= mdev_state->type->mbytes;
577 mdev_set_drvdata(mdev, NULL);
578 kfree(mdev_state->pages);
579 kfree(mdev_state->vconfig);
580 kfree(mdev_state);
581 return 0;
582}
583
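/*
 * Userspace reads and writes are split into naturally aligned 4-, 2-
 * and 1-byte chunks, so e.g. a 16-bit dispi register access issued by
 * qemu reaches handle_mmio_read()/handle_mmio_write() with count == 2.
 */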
584static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf,
585 size_t count, loff_t *ppos)
586{
587 unsigned int done = 0;
588 int ret;
589
590 while (count) {
591 size_t filled;
592
593 if (count >= 4 && !(*ppos % 4)) {
594 u32 val;
595
596 ret = mdev_access(mdev, (char *)&val, sizeof(val),
597 *ppos, false);
598 if (ret <= 0)
599 goto read_err;
600
601 if (copy_to_user(buf, &val, sizeof(val)))
602 goto read_err;
603
604 filled = 4;
605 } else if (count >= 2 && !(*ppos % 2)) {
606 u16 val;
607
608 ret = mdev_access(mdev, (char *)&val, sizeof(val),
609 *ppos, false);
610 if (ret <= 0)
611 goto read_err;
612
613 if (copy_to_user(buf, &val, sizeof(val)))
614 goto read_err;
615
616 filled = 2;
617 } else {
618 u8 val;
619
620 ret = mdev_access(mdev, (char *)&val, sizeof(val),
621 *ppos, false);
622 if (ret <= 0)
623 goto read_err;
624
625 if (copy_to_user(buf, &val, sizeof(val)))
626 goto read_err;
627
628 filled = 1;
629 }
630
631 count -= filled;
632 done += filled;
633 *ppos += filled;
634 buf += filled;
635 }
636
637 return done;
638
639read_err:
640 return -EFAULT;
641}
642
643static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf,
644 size_t count, loff_t *ppos)
645{
646 unsigned int done = 0;
647 int ret;
648
649 while (count) {
650 size_t filled;
651
652 if (count >= 4 && !(*ppos % 4)) {
653 u32 val;
654
655 if (copy_from_user(&val, buf, sizeof(val)))
656 goto write_err;
657
658 ret = mdev_access(mdev, (char *)&val, sizeof(val),
659 *ppos, true);
660 if (ret <= 0)
661 goto write_err;
662
663 filled = 4;
664 } else if (count >= 2 && !(*ppos % 2)) {
665 u16 val;
666
667 if (copy_from_user(&val, buf, sizeof(val)))
668 goto write_err;
669
670 ret = mdev_access(mdev, (char *)&val, sizeof(val),
671 *ppos, true);
672 if (ret <= 0)
673 goto write_err;
674
675 filled = 2;
676 } else {
677 u8 val;
678
679 if (copy_from_user(&val, buf, sizeof(val)))
680 goto write_err;
681
682 ret = mdev_access(mdev, (char *)&val, sizeof(val),
683 *ppos, true);
684 if (ret <= 0)
685 goto write_err;
686
687 filled = 1;
688 }
689 count -= filled;
690 done += filled;
691 *ppos += filled;
692 buf += filled;
693 }
694
695 return done;
696write_err:
697 return -EFAULT;
698}
699
700static struct page *__mbochs_get_page(struct mdev_state *mdev_state,
701 pgoff_t pgoff)
702{
703 WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
704
705 if (!mdev_state->pages[pgoff]) {
706 mdev_state->pages[pgoff] =
707 alloc_pages(GFP_HIGHUSER | __GFP_ZERO, 0);
708 if (!mdev_state->pages[pgoff])
709 return NULL;
710 }
711
712 get_page(mdev_state->pages[pgoff]);
713 return mdev_state->pages[pgoff];
714}
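
/*
 * Vram pages are allocated lazily on first access; the reference held in
 * mdev_state->pages[] keeps them alive until mbochs_put_pages(). The
 * extra reference taken above belongs to the caller, which must drop it
 * with put_page() when done (see mdev_access() and
 * mbochs_release_dmabuf()).
 */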
715
716static struct page *mbochs_get_page(struct mdev_state *mdev_state,
717 pgoff_t pgoff)
718{
719 struct page *page;
720
721 if (WARN_ON(pgoff >= mdev_state->pagecount))
722 return NULL;
723
724 mutex_lock(&mdev_state->ops_lock);
725 page = __mbochs_get_page(mdev_state, pgoff);
726 mutex_unlock(&mdev_state->ops_lock);
727
728 return page;
729}
730
731static void mbochs_put_pages(struct mdev_state *mdev_state)
732{
733 struct device *dev = mdev_dev(mdev_state->mdev);
734 int i, count = 0;
735
736 WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
737
738 for (i = 0; i < mdev_state->pagecount; i++) {
739 if (!mdev_state->pages[i])
740 continue;
741 put_page(mdev_state->pages[i]);
742 mdev_state->pages[i] = NULL;
743 count++;
744 }
745 dev_dbg(dev, "%s: %d pages released\n", __func__, count);
746}
747
748static vm_fault_t mbochs_region_vm_fault(struct vm_fault *vmf)
749{
750 struct vm_area_struct *vma = vmf->vma;
751 struct mdev_state *mdev_state = vma->vm_private_data;
752 pgoff_t page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
753
754 if (page_offset >= mdev_state->pagecount)
755 return VM_FAULT_SIGBUS;
756
757 vmf->page = mbochs_get_page(mdev_state, page_offset);
758 if (!vmf->page)
759 return VM_FAULT_SIGBUS;
760
761 return 0;
762}
763
764static const struct vm_operations_struct mbochs_region_vm_ops = {
765 .fault = mbochs_region_vm_fault,
766};
767
768static int mbochs_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
769{
770 struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
771
772 if (vma->vm_pgoff != MBOCHS_MEMORY_BAR_OFFSET >> PAGE_SHIFT)
773 return -EINVAL;
774 if (vma->vm_end < vma->vm_start)
775 return -EINVAL;
776 if (vma->vm_end - vma->vm_start > mdev_state->memsize)
777 return -EINVAL;
778 if ((vma->vm_flags & VM_SHARED) == 0)
779 return -EINVAL;
780
781 vma->vm_ops = &mbochs_region_vm_ops;
782 vma->vm_private_data = mdev_state;
783 return 0;
784}
785
786static vm_fault_t mbochs_dmabuf_vm_fault(struct vm_fault *vmf)
787{
788 struct vm_area_struct *vma = vmf->vma;
789 struct mbochs_dmabuf *dmabuf = vma->vm_private_data;
790
791 if (WARN_ON(vmf->pgoff >= dmabuf->pagecount))
792 return VM_FAULT_SIGBUS;
793
794 vmf->page = dmabuf->pages[vmf->pgoff];
795 get_page(vmf->page);
796 return 0;
797}
798
799static const struct vm_operations_struct mbochs_dmabuf_vm_ops = {
800 .fault = mbochs_dmabuf_vm_fault,
801};
802
803static int mbochs_mmap_dmabuf(struct dma_buf *buf, struct vm_area_struct *vma)
804{
805 struct mbochs_dmabuf *dmabuf = buf->priv;
806 struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
807
808 dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
809
810 if ((vma->vm_flags & VM_SHARED) == 0)
811 return -EINVAL;
812
813 vma->vm_ops = &mbochs_dmabuf_vm_ops;
814 vma->vm_private_data = dmabuf;
815 return 0;
816}
817
818static void mbochs_print_dmabuf(struct mbochs_dmabuf *dmabuf,
819 const char *prefix)
820{
821 struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
822 u32 fourcc = dmabuf->mode.drm_format;
823
824 dev_dbg(dev, "%s/%d: %c%c%c%c, %dx%d, stride %d, off 0x%llx, size 0x%llx, pages %ld\n",
825 prefix, dmabuf->id,
826 fourcc ? ((fourcc >> 0) & 0xff) : '-',
827 fourcc ? ((fourcc >> 8) & 0xff) : '-',
828 fourcc ? ((fourcc >> 16) & 0xff) : '-',
829 fourcc ? ((fourcc >> 24) & 0xff) : '-',
830 dmabuf->mode.width, dmabuf->mode.height, dmabuf->mode.stride,
831 dmabuf->mode.offset, dmabuf->mode.size, dmabuf->pagecount);
832}
833
834static struct sg_table *mbochs_map_dmabuf(struct dma_buf_attachment *at,
835 enum dma_data_direction direction)
836{
837 struct mbochs_dmabuf *dmabuf = at->dmabuf->priv;
838 struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
839 struct sg_table *sg;
840
841 dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
842
843 sg = kzalloc(sizeof(*sg), GFP_KERNEL);
844 if (!sg)
845 goto err1;
846 if (sg_alloc_table_from_pages(sg, dmabuf->pages, dmabuf->pagecount,
847 0, dmabuf->mode.size, GFP_KERNEL) < 0)
848 goto err2;
849 if (dma_map_sgtable(at->dev, sg, direction, 0))
850 goto err3;
851
852 return sg;
853
854err3:
855 sg_free_table(sg);
856err2:
857 kfree(sg);
858err1:
859 return ERR_PTR(-ENOMEM);
860}
861
862static void mbochs_unmap_dmabuf(struct dma_buf_attachment *at,
863 struct sg_table *sg,
864 enum dma_data_direction direction)
865{
866 struct mbochs_dmabuf *dmabuf = at->dmabuf->priv;
867 struct device *dev = mdev_dev(dmabuf->mdev_state->mdev);
868
869 dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
870
871 dma_unmap_sgtable(at->dev, sg, direction, 0);
872 sg_free_table(sg);
873 kfree(sg);
874}
875
876static void mbochs_release_dmabuf(struct dma_buf *buf)
877{
878 struct mbochs_dmabuf *dmabuf = buf->priv;
879 struct mdev_state *mdev_state = dmabuf->mdev_state;
880 struct device *dev = mdev_dev(mdev_state->mdev);
881 pgoff_t pg;
882
883 dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
884
885 for (pg = 0; pg < dmabuf->pagecount; pg++)
886 put_page(dmabuf->pages[pg]);
887
888 mutex_lock(&mdev_state->ops_lock);
889 dmabuf->buf = NULL;
890 if (dmabuf->unlinked)
891 kfree(dmabuf);
892 mutex_unlock(&mdev_state->ops_lock);
893}
894
895static struct dma_buf_ops mbochs_dmabuf_ops = {
896 .map_dma_buf = mbochs_map_dmabuf,
897 .unmap_dma_buf = mbochs_unmap_dmabuf,
898 .release = mbochs_release_dmabuf,
899 .mmap = mbochs_mmap_dmabuf,
900};
901
902static struct mbochs_dmabuf *mbochs_dmabuf_alloc(struct mdev_state *mdev_state,
903 struct mbochs_mode *mode)
904{
905 struct mbochs_dmabuf *dmabuf;
906 pgoff_t page_offset, pg;
907
908 WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
909
910 dmabuf = kzalloc(sizeof(struct mbochs_dmabuf), GFP_KERNEL);
911 if (!dmabuf)
912 return NULL;
913
914 dmabuf->mode = *mode;
915 dmabuf->id = mdev_state->next_id++;
916 dmabuf->pagecount = DIV_ROUND_UP(mode->size, PAGE_SIZE);
917 dmabuf->pages = kcalloc(dmabuf->pagecount, sizeof(struct page *),
918 GFP_KERNEL);
919 if (!dmabuf->pages)
920 goto err_free_dmabuf;
921
922 page_offset = dmabuf->mode.offset >> PAGE_SHIFT;
923 for (pg = 0; pg < dmabuf->pagecount; pg++) {
924 dmabuf->pages[pg] = __mbochs_get_page(mdev_state,
925 page_offset + pg);
926 if (!dmabuf->pages[pg])
927 goto err_free_pages;
928 }
929
930 dmabuf->mdev_state = mdev_state;
931 list_add(&dmabuf->next, &mdev_state->dmabufs);
932
933 mbochs_print_dmabuf(dmabuf, __func__);
934 return dmabuf;
935
936err_free_pages:
937 while (pg > 0)
938 put_page(dmabuf->pages[--pg]);
939 kfree(dmabuf->pages);
940err_free_dmabuf:
941 kfree(dmabuf);
942 return NULL;
943}
944
945static struct mbochs_dmabuf *
946mbochs_dmabuf_find_by_mode(struct mdev_state *mdev_state,
947 struct mbochs_mode *mode)
948{
949 struct mbochs_dmabuf *dmabuf;
950
951 WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
952
953 list_for_each_entry(dmabuf, &mdev_state->dmabufs, next)
954 if (mbochs_modes_equal(&dmabuf->mode, mode))
955 return dmabuf;
956
957 return NULL;
958}
959
960static struct mbochs_dmabuf *
961mbochs_dmabuf_find_by_id(struct mdev_state *mdev_state, u32 id)
962{
963 struct mbochs_dmabuf *dmabuf;
964
965 WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
966
967 list_for_each_entry(dmabuf, &mdev_state->dmabufs, next)
968 if (dmabuf->id == id)
969 return dmabuf;
970
971 return NULL;
972}
973
974static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf)
975{
976 struct mdev_state *mdev_state = dmabuf->mdev_state;
977 struct device *dev = mdev_dev(mdev_state->mdev);
978 DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
979 struct dma_buf *buf;
980
981 WARN_ON(!mutex_is_locked(&mdev_state->ops_lock));
982
983 if (!IS_ALIGNED(dmabuf->mode.offset, PAGE_SIZE)) {
984 dev_info_ratelimited(dev, "%s: framebuffer not page-aligned\n",
985 __func__);
986 return -EINVAL;
987 }
988
989 exp_info.ops = &mbochs_dmabuf_ops;
990 exp_info.size = dmabuf->mode.size;
991 exp_info.priv = dmabuf;
992
993 buf = dma_buf_export(&exp_info);
994 if (IS_ERR(buf)) {
995 dev_info_ratelimited(dev, "%s: dma_buf_export failed: %ld\n",
996 __func__, PTR_ERR(buf));
997 return PTR_ERR(buf);
998 }
999
1000 dmabuf->buf = buf;
1001 dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id);
1002 return 0;
1003}
1004
1005static int mbochs_get_region_info(struct mdev_device *mdev,
1006 struct vfio_region_info_ext *ext)
1007{
1008 struct vfio_region_info *region_info = &ext->base;
1009 struct mdev_state *mdev_state;
1010
1011 mdev_state = mdev_get_drvdata(mdev);
1012 if (!mdev_state)
1013 return -EINVAL;
1014
1015 if (region_info->index >= MBOCHS_NUM_REGIONS)
1016 return -EINVAL;
1017
1018 switch (region_info->index) {
1019 case VFIO_PCI_CONFIG_REGION_INDEX:
1020 region_info->offset = 0;
1021 region_info->size = MBOCHS_CONFIG_SPACE_SIZE;
1022 region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
1023 VFIO_REGION_INFO_FLAG_WRITE);
1024 break;
1025 case VFIO_PCI_BAR0_REGION_INDEX:
1026 region_info->offset = MBOCHS_MEMORY_BAR_OFFSET;
1027 region_info->size = mdev_state->memsize;
1028 region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
1029 VFIO_REGION_INFO_FLAG_WRITE |
1030 VFIO_REGION_INFO_FLAG_MMAP);
1031 break;
1032 case VFIO_PCI_BAR2_REGION_INDEX:
1033 region_info->offset = MBOCHS_MMIO_BAR_OFFSET;
1034 region_info->size = MBOCHS_MMIO_BAR_SIZE;
1035 region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
1036 VFIO_REGION_INFO_FLAG_WRITE);
1037 break;
1038 case MBOCHS_EDID_REGION_INDEX:
1039 ext->base.argsz = sizeof(*ext);
1040 ext->base.offset = MBOCHS_EDID_OFFSET;
1041 ext->base.size = MBOCHS_EDID_SIZE;
1042 ext->base.flags = (VFIO_REGION_INFO_FLAG_READ |
1043 VFIO_REGION_INFO_FLAG_WRITE |
1044 VFIO_REGION_INFO_FLAG_CAPS);
1045 ext->base.cap_offset = offsetof(typeof(*ext), type);
1046 ext->type.header.id = VFIO_REGION_INFO_CAP_TYPE;
1047 ext->type.header.version = 1;
1048 ext->type.header.next = 0;
1049 ext->type.type = VFIO_REGION_TYPE_GFX;
1050 ext->type.subtype = VFIO_REGION_SUBTYPE_GFX_EDID;
1051 break;
1052 default:
1053 region_info->size = 0;
1054 region_info->offset = 0;
1055 region_info->flags = 0;
1056 }
1057
1058 return 0;
1059}
1060
1061static int mbochs_get_irq_info(struct mdev_device *mdev,
1062 struct vfio_irq_info *irq_info)
1063{
1064 irq_info->count = 0;
1065 return 0;
1066}
1067
1068static int mbochs_get_device_info(struct mdev_device *mdev,
1069 struct vfio_device_info *dev_info)
1070{
1071 dev_info->flags = VFIO_DEVICE_FLAGS_PCI;
1072 dev_info->num_regions = MBOCHS_NUM_REGIONS;
1073 dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
1074 return 0;
1075}
1076
1077static int mbochs_query_gfx_plane(struct mdev_device *mdev,
1078 struct vfio_device_gfx_plane_info *plane)
1079{
1080 struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
1081 struct device *dev = mdev_dev(mdev);
1082 struct mbochs_dmabuf *dmabuf;
1083 struct mbochs_mode mode;
1084 int ret;
1085
1086 if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) {
1087 if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE |
1088 VFIO_GFX_PLANE_TYPE_DMABUF))
1089 return 0;
1090 return -EINVAL;
1091 }
1092
1093 if (plane->flags != VFIO_GFX_PLANE_TYPE_DMABUF)
1094 return -EINVAL;
1095
1096 plane->drm_format_mod = 0;
1097 plane->x_pos = 0;
1098 plane->y_pos = 0;
1099 plane->x_hot = 0;
1100 plane->y_hot = 0;
1101
1102 mutex_lock(&mdev_state->ops_lock);
1103
1104 ret = -EINVAL;
1105 if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY)
1106 ret = mbochs_check_framebuffer(mdev_state, &mode);
1107 if (ret < 0) {
1108 plane->drm_format = 0;
1109 plane->width = 0;
1110 plane->height = 0;
1111 plane->stride = 0;
1112 plane->size = 0;
1113 plane->dmabuf_id = 0;
1114 goto done;
1115 }
1116
1117 dmabuf = mbochs_dmabuf_find_by_mode(mdev_state, &mode);
1118 if (!dmabuf)
1119 dmabuf = mbochs_dmabuf_alloc(mdev_state, &mode);
1120 if (!dmabuf) {
1121 mutex_unlock(&mdev_state->ops_lock);
1122 return -ENOMEM;
1123 }
1124
1125 plane->drm_format = dmabuf->mode.drm_format;
1126 plane->width = dmabuf->mode.width;
1127 plane->height = dmabuf->mode.height;
1128 plane->stride = dmabuf->mode.stride;
1129 plane->size = dmabuf->mode.size;
1130 plane->dmabuf_id = dmabuf->id;
1131
1132done:
1133 if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY &&
1134 mdev_state->active_id != plane->dmabuf_id) {
1135 dev_dbg(dev, "%s: primary: %d => %d\n", __func__,
1136 mdev_state->active_id, plane->dmabuf_id);
1137 mdev_state->active_id = plane->dmabuf_id;
1138 }
1139 mutex_unlock(&mdev_state->ops_lock);
1140 return 0;
1141}
1142
1143static int mbochs_get_gfx_dmabuf(struct mdev_device *mdev,
1144 u32 id)
1145{
1146 struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
1147 struct mbochs_dmabuf *dmabuf;
1148
1149 mutex_lock(&mdev_state->ops_lock);
1150
1151 dmabuf = mbochs_dmabuf_find_by_id(mdev_state, id);
1152 if (!dmabuf) {
1153 mutex_unlock(&mdev_state->ops_lock);
1154 return -ENOENT;
1155 }
1156
1157 if (!dmabuf->buf)
1158 mbochs_dmabuf_export(dmabuf);
1159
1160 mutex_unlock(&mdev_state->ops_lock);
1161
1162 if (!dmabuf->buf)
1163 return -EINVAL;
1164
1165 return dma_buf_fd(dmabuf->buf, 0);
1166}
1167
1168static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd,
1169 unsigned long arg)
1170{
1171 int ret = 0;
1172 unsigned long minsz, outsz;
1173
1174 switch (cmd) {
1175 case VFIO_DEVICE_GET_INFO:
1176 {
1177 struct vfio_device_info info;
1178
1179 minsz = offsetofend(struct vfio_device_info, num_irqs);
1180
1181 if (copy_from_user(&info, (void __user *)arg, minsz))
1182 return -EFAULT;
1183
1184 if (info.argsz < minsz)
1185 return -EINVAL;
1186
1187 ret = mbochs_get_device_info(mdev, &info);
1188 if (ret)
1189 return ret;
1190
1191 if (copy_to_user((void __user *)arg, &info, minsz))
1192 return -EFAULT;
1193
1194 return 0;
1195 }
1196 case VFIO_DEVICE_GET_REGION_INFO:
1197 {
1198 struct vfio_region_info_ext info;
1199
1200 minsz = offsetofend(typeof(info), base.offset);
1201
1202 if (copy_from_user(&info, (void __user *)arg, minsz))
1203 return -EFAULT;
1204
1205 outsz = info.base.argsz;
1206 if (outsz < minsz)
1207 return -EINVAL;
1208 if (outsz > sizeof(info))
1209 return -EINVAL;
1210
1211 ret = mbochs_get_region_info(mdev, &info);
1212 if (ret)
1213 return ret;
1214
1215 if (copy_to_user((void __user *)arg, &info, outsz))
1216 return -EFAULT;
1217
1218 return 0;
1219 }
1220
1221 case VFIO_DEVICE_GET_IRQ_INFO:
1222 {
1223 struct vfio_irq_info info;
1224
1225 minsz = offsetofend(struct vfio_irq_info, count);
1226
1227 if (copy_from_user(&info, (void __user *)arg, minsz))
1228 return -EFAULT;
1229
1230 if ((info.argsz < minsz) ||
1231 (info.index >= VFIO_PCI_NUM_IRQS))
1232 return -EINVAL;
1233
1234 ret = mbochs_get_irq_info(mdev, &info);
1235 if (ret)
1236 return ret;
1237
1238 if (copy_to_user((void __user *)arg, &info, minsz))
1239 return -EFAULT;
1240
1241 return 0;
1242 }
1243
1244 case VFIO_DEVICE_QUERY_GFX_PLANE:
1245 {
1246 struct vfio_device_gfx_plane_info plane;
1247
1248 minsz = offsetofend(struct vfio_device_gfx_plane_info,
1249 region_index);
1250
1251 if (copy_from_user(&plane, (void __user *)arg, minsz))
1252 return -EFAULT;
1253
1254 if (plane.argsz < minsz)
1255 return -EINVAL;
1256
1257 ret = mbochs_query_gfx_plane(mdev, &plane);
1258 if (ret)
1259 return ret;
1260
1261 if (copy_to_user((void __user *)arg, &plane, minsz))
1262 return -EFAULT;
1263
1264 return 0;
1265 }
1266
1267 case VFIO_DEVICE_GET_GFX_DMABUF:
1268 {
1269 u32 dmabuf_id;
1270
1271 if (get_user(dmabuf_id, (__u32 __user *)arg))
1272 return -EFAULT;
1273
1274 return mbochs_get_gfx_dmabuf(mdev, dmabuf_id);
1275 }
1276
1277 case VFIO_DEVICE_SET_IRQS:
1278 return -EINVAL;
1279
1280 case VFIO_DEVICE_RESET:
1281 return mbochs_reset(mdev);
1282 }
1283 return -ENOTTY;
1284}
1285
1286static int mbochs_open(struct mdev_device *mdev)
1287{
1288 if (!try_module_get(THIS_MODULE))
1289 return -ENODEV;
1290
1291 return 0;
1292}
1293
1294static void mbochs_close(struct mdev_device *mdev)
1295{
1296 struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
1297 struct mbochs_dmabuf *dmabuf, *tmp;
1298
1299 mutex_lock(&mdev_state->ops_lock);
1300
1301 list_for_each_entry_safe(dmabuf, tmp, &mdev_state->dmabufs, next) {
1302 list_del(&dmabuf->next);
1303 if (dmabuf->buf) {
1304 /* free in mbochs_release_dmabuf() */
1305 dmabuf->unlinked = true;
1306 } else {
1307 kfree(dmabuf);
1308 }
1309 }
1310 mbochs_put_pages(mdev_state);
1311
1312 mutex_unlock(&mdev_state->ops_lock);
1313 module_put(THIS_MODULE);
1314}
1315
1316static ssize_t
1317memory_show(struct device *dev, struct device_attribute *attr,
1318 char *buf)
1319{
1320 struct mdev_device *mdev = mdev_from_dev(dev);
1321 struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
1322
1323 return sprintf(buf, "%d MB\n", mdev_state->type->mbytes);
1324}
1325static DEVICE_ATTR_RO(memory);
1326
1327static struct attribute *mdev_dev_attrs[] = {
1328 &dev_attr_memory.attr,
1329 NULL,
1330};
1331
1332static const struct attribute_group mdev_dev_group = {
1333 .name = "vendor",
1334 .attrs = mdev_dev_attrs,
1335};
1336
1337static const struct attribute_group *mdev_dev_groups[] = {
1338 &mdev_dev_group,
1339 NULL,
1340};
1341
1342static ssize_t
1343name_show(struct kobject *kobj, struct device *dev, char *buf)
1344{
1345 return sprintf(buf, "%s\n", kobj->name);
1346}
1347MDEV_TYPE_ATTR_RO(name);
1348
1349static ssize_t
1350description_show(struct kobject *kobj, struct device *dev, char *buf)
1351{
1352 const struct mbochs_type *type = mbochs_find_type(kobj);
1353
1354 return sprintf(buf, "virtual display, %d MB video memory\n",
1355 type ? type->mbytes : 0);
1356}
1357MDEV_TYPE_ATTR_RO(description);
1358
1359static ssize_t
1360available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
1361{
1362 const struct mbochs_type *type = mbochs_find_type(kobj);
1363 int count = type ? (max_mbytes - mbochs_used_mbytes) / type->mbytes : 0;
1364
1365 return sprintf(buf, "%d\n", count);
1366}
1367MDEV_TYPE_ATTR_RO(available_instances);
1368
1369static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
1370 char *buf)
1371{
1372 return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
1373}
1374MDEV_TYPE_ATTR_RO(device_api);
1375
1376static struct attribute *mdev_types_attrs[] = {
1377 &mdev_type_attr_name.attr,
1378 &mdev_type_attr_description.attr,
1379 &mdev_type_attr_device_api.attr,
1380 &mdev_type_attr_available_instances.attr,
1381 NULL,
1382};
1383
1384static struct attribute_group mdev_type_group1 = {
1385 .name = MBOCHS_TYPE_1,
1386 .attrs = mdev_types_attrs,
1387};
1388
1389static struct attribute_group mdev_type_group2 = {
1390 .name = MBOCHS_TYPE_2,
1391 .attrs = mdev_types_attrs,
1392};
1393
1394static struct attribute_group mdev_type_group3 = {
1395 .name = MBOCHS_TYPE_3,
1396 .attrs = mdev_types_attrs,
1397};
1398
1399static struct attribute_group *mdev_type_groups[] = {
1400 &mdev_type_group1,
1401 &mdev_type_group2,
1402 &mdev_type_group3,
1403 NULL,
1404};
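
/*
 * With the module loaded, instances are created through the standard
 * mdev sysfs interface; illustrative example (the uuid is arbitrary):
 *
 *   echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001" > \
 *     /sys/devices/mbochs/mdev_supported_types/mbochs-medium/create
 */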
1405
1406static const struct mdev_parent_ops mdev_fops = {
1407 .owner = THIS_MODULE,
1408 .mdev_attr_groups = mdev_dev_groups,
1409 .supported_type_groups = mdev_type_groups,
1410 .create = mbochs_create,
1411 .remove = mbochs_remove,
1412 .open = mbochs_open,
1413 .release = mbochs_close,
1414 .read = mbochs_read,
1415 .write = mbochs_write,
1416 .ioctl = mbochs_ioctl,
1417 .mmap = mbochs_mmap,
1418};
1419
1420static const struct file_operations vd_fops = {
1421 .owner = THIS_MODULE,
1422};
1423
1424static void mbochs_device_release(struct device *dev)
1425{
1426 /* nothing */
1427}
1428
1429static int __init mbochs_dev_init(void)
1430{
1431 int ret = 0;
1432
1433 ret = alloc_chrdev_region(&mbochs_devt, 0, MINORMASK + 1, MBOCHS_NAME);
1434 if (ret < 0) {
1435 pr_err("Error: failed to register mbochs_dev, err: %d\n", ret);
1436 return ret;
1437 }
1438 cdev_init(&mbochs_cdev, &vd_fops);
1439 cdev_add(&mbochs_cdev, mbochs_devt, MINORMASK + 1);
1440 pr_info("%s: major %d\n", __func__, MAJOR(mbochs_devt));
1441
1442 mbochs_class = class_create(THIS_MODULE, MBOCHS_CLASS_NAME);
1443 if (IS_ERR(mbochs_class)) {
1444 pr_err("Error: failed to register mbochs_dev class\n");
1445 ret = PTR_ERR(mbochs_class);
1446 goto failed1;
1447 }
1448 mbochs_dev.class = mbochs_class;
1449 mbochs_dev.release = mbochs_device_release;
1450 dev_set_name(&mbochs_dev, "%s", MBOCHS_NAME);
1451
1452 ret = device_register(&mbochs_dev);
1453 if (ret)
1454 goto failed2;
1455
1456 ret = mdev_register_device(&mbochs_dev, &mdev_fops);
1457 if (ret)
1458 goto failed3;
1459
1460 return 0;
1461
1462failed3:
1463 device_unregister(&mbochs_dev);
1464failed2:
1465 class_destroy(mbochs_class);
1466failed1:
1467 cdev_del(&mbochs_cdev);
1468 unregister_chrdev_region(mbochs_devt, MINORMASK + 1);
1469 return ret;
1470}
1471
1472static void __exit mbochs_dev_exit(void)
1473{
1474 mbochs_dev.bus = NULL;
1475 mdev_unregister_device(&mbochs_dev);
1476
1477 device_unregister(&mbochs_dev);
1478 cdev_del(&mbochs_cdev);
1479 unregister_chrdev_region(mbochs_devt, MINORMASK + 1);
1480 class_destroy(mbochs_class);
1481 mbochs_class = NULL;
1482}
1483
1484module_init(mbochs_dev_init)
1485module_exit(mbochs_dev_exit)
diff --git a/samples/vfio-mdev/mdpy-defs.h b/samples/vfio-mdev/mdpy-defs.h
new file mode 100644
index 000000000..961c55ec3
--- /dev/null
+++ b/samples/vfio-mdev/mdpy-defs.h
@@ -0,0 +1,22 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Simple pci display device.
4 *
5 * Framebuffer memory is pci bar 0.
6 * Configuration (read-only) is in pci config space.
7 * Format field uses drm fourcc codes.
8 * At the moment only DRM_FORMAT_XRGB8888 is supported.
9 */
10
11/* pci ids */
12#define MDPY_PCI_VENDOR_ID PCI_VENDOR_ID_REDHAT
13#define MDPY_PCI_DEVICE_ID 0x000f
14#define MDPY_PCI_SUBVENDOR_ID PCI_SUBVENDOR_ID_REDHAT_QUMRANET
15#define MDPY_PCI_SUBDEVICE_ID PCI_SUBDEVICE_ID_QEMU
16
17/* pci cfg space offsets for fb config (dword) */
18#define MDPY_VENDORCAP_OFFSET 0x40
19#define MDPY_VENDORCAP_SIZE 0x10
20#define MDPY_FORMAT_OFFSET (MDPY_VENDORCAP_OFFSET + 0x04)
21#define MDPY_WIDTH_OFFSET (MDPY_VENDORCAP_OFFSET + 0x08)
22#define MDPY_HEIGHT_OFFSET (MDPY_VENDORCAP_OFFSET + 0x0c)
diff --git a/samples/vfio-mdev/mdpy-fb.c b/samples/vfio-mdev/mdpy-fb.c
new file mode 100644
index 000000000..4eb7aa11c
--- /dev/null
+++ b/samples/vfio-mdev/mdpy-fb.c
@@ -0,0 +1,243 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Framebuffer driver for mdpy (mediated virtual pci display device).
4 *
5 * See mdpy-defs.h for device specs
6 *
7 * (c) Gerd Hoffmann <kraxel@redhat.com>
8 *
9 * Using some code snippets from simplefb and cirrusfb.
10 *
11 * This program is free software; you can redistribute it and/or modify it
12 * under the terms and conditions of the GNU General Public License,
13 * version 2, as published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope it will be useful, but WITHOUT
16 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
17 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 * more details.
19 */
20#include <linux/errno.h>
21#include <linux/fb.h>
22#include <linux/io.h>
23#include <linux/pci.h>
24#include <linux/module.h>
25#include <drm/drm_fourcc.h>
26#include "mdpy-defs.h"
27
28static const struct fb_fix_screeninfo mdpy_fb_fix = {
29 .id = "mdpy-fb",
30 .type = FB_TYPE_PACKED_PIXELS,
31 .visual = FB_VISUAL_TRUECOLOR,
32 .accel = FB_ACCEL_NONE,
33};
34
35static const struct fb_var_screeninfo mdpy_fb_var = {
36 .height = -1,
37 .width = -1,
38 .activate = FB_ACTIVATE_NOW,
39 .vmode = FB_VMODE_NONINTERLACED,
40
41 .bits_per_pixel = 32,
42 .transp.offset = 24,
43 .red.offset = 16,
44 .green.offset = 8,
45 .blue.offset = 0,
46 .transp.length = 8,
47 .red.length = 8,
48 .green.length = 8,
49 .blue.length = 8,
50};
51
52#define PSEUDO_PALETTE_SIZE 16
53
54struct mdpy_fb_par {
55 u32 palette[PSEUDO_PALETTE_SIZE];
56};
57
58static int mdpy_fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
59 u_int transp, struct fb_info *info)
60{
61 u32 *pal = info->pseudo_palette;
62 u32 cr = red >> (16 - info->var.red.length);
63 u32 cg = green >> (16 - info->var.green.length);
64 u32 cb = blue >> (16 - info->var.blue.length);
65 u32 value, mask;
66
67 if (regno >= PSEUDO_PALETTE_SIZE)
68 return -EINVAL;
69
70 value = (cr << info->var.red.offset) |
71 (cg << info->var.green.offset) |
72 (cb << info->var.blue.offset);
73 if (info->var.transp.length > 0) {
74 mask = (1 << info->var.transp.length) - 1;
75 mask <<= info->var.transp.offset;
76 value |= mask;
77 }
78 pal[regno] = value;
79
80 return 0;
81}
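/*
 * Note on the packing above (worked example, values from mdpy_fb_var):
 * for XRGB8888, regno 1 with red = 0xffff and green = blue = 0 gives
 * cr = 0xffff >> (16 - 8) = 0xff, so value = 0xff << 16 = 0x00ff0000;
 * the transp branch then ORs in mask = 0xff << 24, leaving
 * pal[1] = 0xffff0000 (opaque red).
 */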
82
83static void mdpy_fb_destroy(struct fb_info *info)
84{
85 if (info->screen_base)
86 iounmap(info->screen_base);
87}
88
89static const struct fb_ops mdpy_fb_ops = {
90 .owner = THIS_MODULE,
91 .fb_destroy = mdpy_fb_destroy,
92 .fb_setcolreg = mdpy_fb_setcolreg,
93 .fb_fillrect = cfb_fillrect,
94 .fb_copyarea = cfb_copyarea,
95 .fb_imageblit = cfb_imageblit,
96};
97
98static int mdpy_fb_probe(struct pci_dev *pdev,
99 const struct pci_device_id *ent)
100{
101 struct fb_info *info;
102 struct mdpy_fb_par *par;
103 u32 format, width, height;
104 int ret;
105
106 ret = pci_enable_device(pdev);
107 if (ret < 0)
108 return ret;
109
110 ret = pci_request_regions(pdev, "mdpy-fb");
111 if (ret < 0)
112 goto err_disable_dev;
113
114 pci_read_config_dword(pdev, MDPY_FORMAT_OFFSET, &format);
115 pci_read_config_dword(pdev, MDPY_WIDTH_OFFSET, &width);
116 pci_read_config_dword(pdev, MDPY_HEIGHT_OFFSET, &height);
117 if (format != DRM_FORMAT_XRGB8888) {
118 pci_err(pdev, "format mismatch (0x%x != 0x%x)\n",
119 format, DRM_FORMAT_XRGB8888);
120 ret = -EINVAL;
121 goto err_release_regions;
122 }
123 if (width < 100 || width > 10000) {
124 pci_err(pdev, "width (%d) out of range\n", width);
125 ret = -EINVAL;
126 goto err_release_regions;
127 }
128 if (height < 100 || height > 10000) {
129 pci_err(pdev, "height (%d) out of range\n", height);
130 ret = -EINVAL;
131 goto err_release_regions;
132 }
133 pci_info(pdev, "mdpy found: %dx%d framebuffer\n",
134 width, height);
135
136 info = framebuffer_alloc(sizeof(struct mdpy_fb_par), &pdev->dev);
137 if (!info) {
138 ret = -ENOMEM;
139 goto err_release_regions;
140 }
141 pci_set_drvdata(pdev, info);
142 par = info->par;
143
144 info->fix = mdpy_fb_fix;
145 info->fix.smem_start = pci_resource_start(pdev, 0);
146 info->fix.smem_len = pci_resource_len(pdev, 0);
147 info->fix.line_length = width * 4;
148
149 info->var = mdpy_fb_var;
150 info->var.xres = width;
151 info->var.yres = height;
152 info->var.xres_virtual = width;
153 info->var.yres_virtual = height;
154
155 info->screen_size = info->fix.smem_len;
156 info->screen_base = ioremap(info->fix.smem_start,
157 info->screen_size);
158 if (!info->screen_base) {
159 pci_err(pdev, "ioremap(pcibar) failed\n");
160 ret = -EIO;
161 goto err_release_fb;
162 }
163
164 info->apertures = alloc_apertures(1);
165 if (!info->apertures) {
166 ret = -ENOMEM;
167 goto err_unmap;
168 }
169 info->apertures->ranges[0].base = info->fix.smem_start;
170 info->apertures->ranges[0].size = info->fix.smem_len;
171
172 info->fbops = &mdpy_fb_ops;
173 info->flags = FBINFO_DEFAULT;
174 info->pseudo_palette = par->palette;
175
176 ret = register_framebuffer(info);
177 if (ret < 0) {
178 pci_err(pdev, "mdpy-fb device register failed: %d\n", ret);
179 goto err_unmap;
180 }
181
182 pci_info(pdev, "fb%d registered\n", info->node);
183 return 0;
184
185err_unmap:
186 iounmap(info->screen_base);
187
188err_release_fb:
189 framebuffer_release(info);
190
191err_release_regions:
192 pci_release_regions(pdev);
193
194err_disable_dev:
195 pci_disable_device(pdev);
196
197 return ret;
198}
199
200static void mdpy_fb_remove(struct pci_dev *pdev)
201{
202 struct fb_info *info = pci_get_drvdata(pdev);
203
204 unregister_framebuffer(info);
205 iounmap(info->screen_base);
206 framebuffer_release(info);
207 pci_release_regions(pdev);
208 pci_disable_device(pdev);
209}
210
211static struct pci_device_id mdpy_fb_pci_table[] = {
212 {
213 .vendor = MDPY_PCI_VENDOR_ID,
214 .device = MDPY_PCI_DEVICE_ID,
215 .subvendor = MDPY_PCI_SUBVENDOR_ID,
216 .subdevice = MDPY_PCI_SUBDEVICE_ID,
217 }, {
218 /* end of list */
219 }
220};
221
222static struct pci_driver mdpy_fb_pci_driver = {
223 .name = "mdpy-fb",
224 .id_table = mdpy_fb_pci_table,
225 .probe = mdpy_fb_probe,
226 .remove = mdpy_fb_remove,
227};
228
229static int __init mdpy_fb_init(void)
230{
231 int ret;
232
233 ret = pci_register_driver(&mdpy_fb_pci_driver);
234 if (ret)
235 return ret;
236
237 return 0;
238}
239
240module_init(mdpy_fb_init);
241
242MODULE_DEVICE_TABLE(pci, mdpy_fb_pci_table);
243MODULE_LICENSE("GPL v2");
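Once mdpy-fb binds inside the guest, the display is reachable through the
ordinary fbdev interface. A minimal sketch, assuming the device comes up
as /dev/fb0, that fills the whole screen with solid blue:

	#include <fcntl.h>
	#include <linux/fb.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/dev/fb0", O_RDWR);
		struct fb_var_screeninfo var;
		struct fb_fix_screeninfo fix;
		uint32_t *px;
		size_t i, n;

		if (fd < 0 || ioctl(fd, FBIOGET_VSCREENINFO, &var) ||
		    ioctl(fd, FBIOGET_FSCREENINFO, &fix))
			return 1;
		px = mmap(NULL, fix.smem_len, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0);
		if (px == MAP_FAILED)
			return 1;
		n = (size_t)var.yres * fix.line_length / 4;
		for (i = 0; i < n; i++)
			px[i] = 0x000000ff;	/* XRGB8888: blue */
		munmap(px, fix.smem_len);
		close(fd);
		return 0;
	}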
diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c
new file mode 100644
index 000000000..9894693f3
--- /dev/null
+++ b/samples/vfio-mdev/mdpy.c
@@ -0,0 +1,807 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Mediated virtual PCI display host device driver
4 *
5 * See mdpy-defs.h for device specs
6 *
7 * (c) Gerd Hoffmann <kraxel@redhat.com>
8 *
9 * based on mtty driver which is:
10 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
11 * Author: Neo Jia <cjia@nvidia.com>
12 * Kirti Wankhede <kwankhede@nvidia.com>
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License version 2 as
16 * published by the Free Software Foundation.
17 */
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/device.h>
21#include <linux/kernel.h>
22#include <linux/slab.h>
23#include <linux/vmalloc.h>
24#include <linux/cdev.h>
25#include <linux/vfio.h>
26#include <linux/iommu.h>
27#include <linux/sysfs.h>
28#include <linux/mdev.h>
29#include <linux/pci.h>
30#include <drm/drm_fourcc.h>
31#include "mdpy-defs.h"
32
33#define MDPY_NAME "mdpy"
34#define MDPY_CLASS_NAME "mdpy"
35
36#define MDPY_CONFIG_SPACE_SIZE 0xff
37#define MDPY_MEMORY_BAR_OFFSET PAGE_SIZE
38#define MDPY_DISPLAY_REGION 16
39
40#define STORE_LE16(addr, val) (*(u16 *)addr = val)
41#define STORE_LE32(addr, val) (*(u32 *)addr = val)
42
43
44MODULE_LICENSE("GPL v2");
45
46static int max_devices = 4;
47module_param_named(count, max_devices, int, 0444);
48MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices");
49
50
51#define MDPY_TYPE_1 "vga"
52#define MDPY_TYPE_2 "xga"
53#define MDPY_TYPE_3 "hd"
54
55static const struct mdpy_type {
56 const char *name;
57 u32 format;
58 u32 bytepp;
59 u32 width;
60 u32 height;
61} mdpy_types[] = {
62 {
63 .name = MDPY_CLASS_NAME "-" MDPY_TYPE_1,
64 .format = DRM_FORMAT_XRGB8888,
65 .bytepp = 4,
66 .width = 640,
67 .height = 480,
68 }, {
69 .name = MDPY_CLASS_NAME "-" MDPY_TYPE_2,
70 .format = DRM_FORMAT_XRGB8888,
71 .bytepp = 4,
72 .width = 1024,
73 .height = 768,
74 }, {
75 .name = MDPY_CLASS_NAME "-" MDPY_TYPE_3,
76 .format = DRM_FORMAT_XRGB8888,
77 .bytepp = 4,
78 .width = 1920,
79 .height = 1080,
80 },
81};
82
83static dev_t mdpy_devt;
84static struct class *mdpy_class;
85static struct cdev mdpy_cdev;
86static struct device mdpy_dev;
87static u32 mdpy_count;
88
89/* State of each mdev device */
90struct mdev_state {
91 u8 *vconfig;
92 u32 bar_mask;
93 struct mutex ops_lock;
94 struct mdev_device *mdev;
95 struct vfio_device_info dev_info;
96
97 const struct mdpy_type *type;
98 u32 memsize;
99 void *memblk;
100};
101
102static const struct mdpy_type *mdpy_find_type(struct kobject *kobj)
103{
104 int i;
105
106 for (i = 0; i < ARRAY_SIZE(mdpy_types); i++)
107 if (strcmp(mdpy_types[i].name, kobj->name) == 0)
108 return mdpy_types + i;
109 return NULL;
110}
111
112static void mdpy_create_config_space(struct mdev_state *mdev_state)
113{
114 STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID],
115 MDPY_PCI_VENDOR_ID);
116 STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID],
117 MDPY_PCI_DEVICE_ID);
118 STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID],
119 MDPY_PCI_SUBVENDOR_ID);
120 STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID],
121 MDPY_PCI_SUBDEVICE_ID);
122
123 STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND],
124 PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
125 STORE_LE16((u16 *) &mdev_state->vconfig[PCI_STATUS],
126 PCI_STATUS_CAP_LIST);
127 STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE],
128 PCI_CLASS_DISPLAY_OTHER);
129 mdev_state->vconfig[PCI_CLASS_REVISION] = 0x01;
130
131 STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0],
132 PCI_BASE_ADDRESS_SPACE_MEMORY |
133 PCI_BASE_ADDRESS_MEM_TYPE_32 |
134 PCI_BASE_ADDRESS_MEM_PREFETCH);
135 mdev_state->bar_mask = ~(mdev_state->memsize) + 1;
136
137 /* vendor specific capability for the config registers */
138 mdev_state->vconfig[PCI_CAPABILITY_LIST] = MDPY_VENDORCAP_OFFSET;
139 mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 0] = 0x09; /* vendor cap */
140 mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 1] = 0x00; /* next ptr */
141 mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 2] = MDPY_VENDORCAP_SIZE;
142 STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_FORMAT_OFFSET],
143 mdev_state->type->format);
144 STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_WIDTH_OFFSET],
145 mdev_state->type->width);
146 STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_HEIGHT_OFFSET],
147 mdev_state->type->height);
148}
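/*
 * Note on bar_mask above: memsize is a power of two, so ~memsize + 1 is
 * simply -memsize, a mask with the size-aligned high bits set. Example:
 * the "vga" type rounds up to memsize = 0x00200000 (2 MB), giving
 * bar_mask = 0xffe00000.
 */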
149
150static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset,
151 char *buf, u32 count)
152{
153 struct device *dev = mdev_dev(mdev_state->mdev);
154 u32 cfg_addr;
155
156 switch (offset) {
157 case PCI_BASE_ADDRESS_0:
158 cfg_addr = *(u32 *)buf;
159
160 if (cfg_addr == 0xffffffff) {
161 cfg_addr = (cfg_addr & mdev_state->bar_mask);
162 } else {
163 cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK;
164 if (cfg_addr)
165 dev_info(dev, "BAR0 @ 0x%x\n", cfg_addr);
166 }
167
168 cfg_addr |= (mdev_state->vconfig[offset] &
169 ~PCI_BASE_ADDRESS_MEM_MASK);
170 STORE_LE32(&mdev_state->vconfig[offset], cfg_addr);
171 break;
172 }
173}
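/*
 * Note: the 0xffffffff case above implements the standard PCI BAR
 * sizing handshake. The guest writes all-ones and reads the register
 * back; with bar_mask = 0xffe00000 it reads 0xffe00008 (mask plus the
 * prefetchable/32-bit memory flag bits), masks off the low bits and
 * computes size = ~0xffe00000 + 1 = 0x200000.
 */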
174
175static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count,
176 loff_t pos, bool is_write)
177{
178 struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
179 struct device *dev = mdev_dev(mdev);
180 int ret = 0;
181
182 mutex_lock(&mdev_state->ops_lock);
183
184 if (pos < MDPY_CONFIG_SPACE_SIZE) {
185 if (is_write)
186 handle_pci_cfg_write(mdev_state, pos, buf, count);
187 else
188 memcpy(buf, (mdev_state->vconfig + pos), count);
189
190 } else if ((pos >= MDPY_MEMORY_BAR_OFFSET) &&
191 (pos + count <=
192 MDPY_MEMORY_BAR_OFFSET + mdev_state->memsize)) {
193 pos -= MDPY_MEMORY_BAR_OFFSET;
194 if (is_write)
195 memcpy(mdev_state->memblk, buf, count);
196 else
197 memcpy(buf, mdev_state->memblk, count);
198
199 } else {
200 dev_info(dev, "%s: %s @0x%llx (unhandled)\n",
201 __func__, is_write ? "WR" : "RD", pos);
202 ret = -1;
203 goto accessfailed;
204 }
205
206 ret = count;
207
208
209accessfailed:
210 mutex_unlock(&mdev_state->ops_lock);
211
212 return ret;
213}
214
215static int mdpy_reset(struct mdev_device *mdev)
216{
217 struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
218 u32 stride, i;
219
220 /* initialize with gray gradient */
221 stride = mdev_state->type->width * mdev_state->type->bytepp;
222 for (i = 0; i < mdev_state->type->height; i++)
223 memset(mdev_state->memblk + i * stride,
224 i * 255 / mdev_state->type->height,
225 stride);
226 return 0;
227}
228
229static int mdpy_create(struct kobject *kobj, struct mdev_device *mdev)
230{
231 const struct mdpy_type *type = mdpy_find_type(kobj);
232 struct device *dev = mdev_dev(mdev);
233 struct mdev_state *mdev_state;
234 u32 fbsize;
235
236 if (mdpy_count >= max_devices)
237 return -ENOMEM;
238
239 mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL);
240 if (mdev_state == NULL)
241 return -ENOMEM;
242
243 mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL);
244 if (mdev_state->vconfig == NULL) {
245 kfree(mdev_state);
246 return -ENOMEM;
247 }
248
249 if (!type)
250 type = &mdpy_types[0];
251 fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp);
252
253 mdev_state->memblk = vmalloc_user(fbsize);
254 if (!mdev_state->memblk) {
255 kfree(mdev_state->vconfig);
256 kfree(mdev_state);
257 return -ENOMEM;
258 }
259 dev_info(dev, "%s: %s (%dx%d)\n",
260 __func__, kobj->name, type->width, type->height);
261
262 mutex_init(&mdev_state->ops_lock);
263 mdev_state->mdev = mdev;
264 mdev_set_drvdata(mdev, mdev_state);
265
266 mdev_state->type = type;
267 mdev_state->memsize = fbsize;
268 mdpy_create_config_space(mdev_state);
269 mdpy_reset(mdev);
270
271 mdpy_count++;
272 return 0;
273}
274
275static int mdpy_remove(struct mdev_device *mdev)
276{
277 struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
278 struct device *dev = mdev_dev(mdev);
279
280 dev_info(dev, "%s\n", __func__);
281
282 mdev_set_drvdata(mdev, NULL);
283 vfree(mdev_state->memblk);
284 kfree(mdev_state->vconfig);
285 kfree(mdev_state);
286
287 mdpy_count--;
288 return 0;
289}
290
291static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf,
292 size_t count, loff_t *ppos)
293{
294 unsigned int done = 0;
295 int ret;
296
297 while (count) {
298 size_t filled;
299
300 if (count >= 4 && !(*ppos % 4)) {
301 u32 val;
302
303 ret = mdev_access(mdev, (char *)&val, sizeof(val),
304 *ppos, false);
305 if (ret <= 0)
306 goto read_err;
307
308 if (copy_to_user(buf, &val, sizeof(val)))
309 goto read_err;
310
311 filled = 4;
312 } else if (count >= 2 && !(*ppos % 2)) {
313 u16 val;
314
315 ret = mdev_access(mdev, (char *)&val, sizeof(val),
316 *ppos, false);
317 if (ret <= 0)
318 goto read_err;
319
320 if (copy_to_user(buf, &val, sizeof(val)))
321 goto read_err;
322
323 filled = 2;
324 } else {
325 u8 val;
326
327 ret = mdev_access(mdev, (char *)&val, sizeof(val),
328 *ppos, false);
329 if (ret <= 0)
330 goto read_err;
331
332 if (copy_to_user(buf, &val, sizeof(val)))
333 goto read_err;
334
335 filled = 1;
336 }
337
338 count -= filled;
339 done += filled;
340 *ppos += filled;
341 buf += filled;
342 }
343
344 return done;
345
346read_err:
347 return -EFAULT;
348}
349
350static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf,
351 size_t count, loff_t *ppos)
352{
353 unsigned int done = 0;
354 int ret;
355
356 while (count) {
357 size_t filled;
358
359 if (count >= 4 && !(*ppos % 4)) {
360 u32 val;
361
362 if (copy_from_user(&val, buf, sizeof(val)))
363 goto write_err;
364
365 ret = mdev_access(mdev, (char *)&val, sizeof(val),
366 *ppos, true);
367 if (ret <= 0)
368 goto write_err;
369
370 filled = 4;
371 } else if (count >= 2 && !(*ppos % 2)) {
372 u16 val;
373
374 if (copy_from_user(&val, buf, sizeof(val)))
375 goto write_err;
376
377 ret = mdev_access(mdev, (char *)&val, sizeof(val),
378 *ppos, true);
379 if (ret <= 0)
380 goto write_err;
381
382 filled = 2;
383 } else {
384 u8 val;
385
386 if (copy_from_user(&val, buf, sizeof(val)))
387 goto write_err;
388
389 ret = mdev_access(mdev, (char *)&val, sizeof(val),
390 *ppos, true);
391 if (ret <= 0)
392 goto write_err;
393
394 filled = 1;
395 }
396 count -= filled;
397 done += filled;
398 *ppos += filled;
399 buf += filled;
400 }
401
402 return done;
403write_err:
404 return -EFAULT;
405}
406
407static int mdpy_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
408{
409 struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
410
411 if (vma->vm_pgoff != MDPY_MEMORY_BAR_OFFSET >> PAGE_SHIFT)
412 return -EINVAL;
413 if (vma->vm_end < vma->vm_start)
414 return -EINVAL;
415 if (vma->vm_end - vma->vm_start > mdev_state->memsize)
416 return -EINVAL;
417 if ((vma->vm_flags & VM_SHARED) == 0)
418 return -EINVAL;
419
420 return remap_vmalloc_range_partial(vma, vma->vm_start,
421 mdev_state->memblk, 0,
422 vma->vm_end - vma->vm_start);
423}
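/*
 * Note: userspace reaches the handler above by calling mmap() on the
 * vfio device fd at the offset that mdpy_get_region_info() below
 * reports for BAR0 / the display region (MDPY_MEMORY_BAR_OFFSET);
 * this is how a VMM such as QEMU maps the framebuffer directly.
 */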
424
425static int mdpy_get_region_info(struct mdev_device *mdev,
426 struct vfio_region_info *region_info,
427 u16 *cap_type_id, void **cap_type)
428{
429 struct mdev_state *mdev_state;
430
431 mdev_state = mdev_get_drvdata(mdev);
432 if (!mdev_state)
433 return -EINVAL;
434
435 if (region_info->index >= VFIO_PCI_NUM_REGIONS &&
436 region_info->index != MDPY_DISPLAY_REGION)
437 return -EINVAL;
438
439 switch (region_info->index) {
440 case VFIO_PCI_CONFIG_REGION_INDEX:
441 region_info->offset = 0;
442 region_info->size = MDPY_CONFIG_SPACE_SIZE;
443 region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
444 VFIO_REGION_INFO_FLAG_WRITE);
445 break;
446 case VFIO_PCI_BAR0_REGION_INDEX:
447 case MDPY_DISPLAY_REGION:
448 region_info->offset = MDPY_MEMORY_BAR_OFFSET;
449 region_info->size = mdev_state->memsize;
450 region_info->flags = (VFIO_REGION_INFO_FLAG_READ |
451 VFIO_REGION_INFO_FLAG_WRITE |
452 VFIO_REGION_INFO_FLAG_MMAP);
453 break;
454 default:
455 region_info->size = 0;
456 region_info->offset = 0;
457 region_info->flags = 0;
458 }
459
460 return 0;
461}
462
463static int mdpy_get_irq_info(struct mdev_device *mdev,
464 struct vfio_irq_info *irq_info)
465{
466 irq_info->count = 0;
467 return 0;
468}
469
470static int mdpy_get_device_info(struct mdev_device *mdev,
471 struct vfio_device_info *dev_info)
472{
473 dev_info->flags = VFIO_DEVICE_FLAGS_PCI;
474 dev_info->num_regions = VFIO_PCI_NUM_REGIONS;
475 dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
476 return 0;
477}
478
479static int mdpy_query_gfx_plane(struct mdev_device *mdev,
480 struct vfio_device_gfx_plane_info *plane)
481{
482 struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
483
484 if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) {
485 if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE |
486 VFIO_GFX_PLANE_TYPE_REGION))
487 return 0;
488 return -EINVAL;
489 }
490
491 if (plane->flags != VFIO_GFX_PLANE_TYPE_REGION)
492 return -EINVAL;
493
494 plane->drm_format = mdev_state->type->format;
495 plane->width = mdev_state->type->width;
496 plane->height = mdev_state->type->height;
497 plane->stride = (mdev_state->type->width *
498 mdev_state->type->bytepp);
499 plane->size = mdev_state->memsize;
500 plane->region_index = MDPY_DISPLAY_REGION;
501
502 /* unused */
503 plane->drm_format_mod = 0;
504 plane->x_pos = 0;
505 plane->y_pos = 0;
506 plane->x_hot = 0;
507 plane->y_hot = 0;
508
509 return 0;
510}
511
512static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd,
513 unsigned long arg)
514{
515 int ret = 0;
516 unsigned long minsz;
517 struct mdev_state *mdev_state;
518
519 mdev_state = mdev_get_drvdata(mdev);
520
521 switch (cmd) {
522 case VFIO_DEVICE_GET_INFO:
523 {
524 struct vfio_device_info info;
525
526 minsz = offsetofend(struct vfio_device_info, num_irqs);
527
528 if (copy_from_user(&info, (void __user *)arg, minsz))
529 return -EFAULT;
530
531 if (info.argsz < minsz)
532 return -EINVAL;
533
534 ret = mdpy_get_device_info(mdev, &info);
535 if (ret)
536 return ret;
537
538 memcpy(&mdev_state->dev_info, &info, sizeof(info));
539
540 if (copy_to_user((void __user *)arg, &info, minsz))
541 return -EFAULT;
542
543 return 0;
544 }
545 case VFIO_DEVICE_GET_REGION_INFO:
546 {
547 struct vfio_region_info info;
548 u16 cap_type_id = 0;
549 void *cap_type = NULL;
550
551 minsz = offsetofend(struct vfio_region_info, offset);
552
553 if (copy_from_user(&info, (void __user *)arg, minsz))
554 return -EFAULT;
555
556 if (info.argsz < minsz)
557 return -EINVAL;
558
559 ret = mdpy_get_region_info(mdev, &info, &cap_type_id,
560 &cap_type);
561 if (ret)
562 return ret;
563
564 if (copy_to_user((void __user *)arg, &info, minsz))
565 return -EFAULT;
566
567 return 0;
568 }
569
570 case VFIO_DEVICE_GET_IRQ_INFO:
571 {
572 struct vfio_irq_info info;
573
574 minsz = offsetofend(struct vfio_irq_info, count);
575
576 if (copy_from_user(&info, (void __user *)arg, minsz))
577 return -EFAULT;
578
579 if ((info.argsz < minsz) ||
580 (info.index >= mdev_state->dev_info.num_irqs))
581 return -EINVAL;
582
583 ret = mdpy_get_irq_info(mdev, &info);
584 if (ret)
585 return ret;
586
587 if (copy_to_user((void __user *)arg, &info, minsz))
588 return -EFAULT;
589
590 return 0;
591 }
592
593 case VFIO_DEVICE_QUERY_GFX_PLANE:
594 {
595 struct vfio_device_gfx_plane_info plane;
596
597 minsz = offsetofend(struct vfio_device_gfx_plane_info,
598 region_index);
599
600 if (copy_from_user(&plane, (void __user *)arg, minsz))
601 return -EFAULT;
602
603 if (plane.argsz < minsz)
604 return -EINVAL;
605
606 ret = mdpy_query_gfx_plane(mdev, &plane);
607 if (ret)
608 return ret;
609
610 if (copy_to_user((void __user *)arg, &plane, minsz))
611 return -EFAULT;
612
613 return 0;
614 }
615
616 case VFIO_DEVICE_SET_IRQS:
617 return -EINVAL;
618
619 case VFIO_DEVICE_RESET:
620 return mdpy_reset(mdev);
621 }
622 return -ENOTTY;
623}
624
625static int mdpy_open(struct mdev_device *mdev)
626{
627 if (!try_module_get(THIS_MODULE))
628 return -ENODEV;
629
630 return 0;
631}
632
633static void mdpy_close(struct mdev_device *mdev)
634{
635 module_put(THIS_MODULE);
636}
637
638static ssize_t
639resolution_show(struct device *dev, struct device_attribute *attr,
640 char *buf)
641{
642 struct mdev_device *mdev = mdev_from_dev(dev);
643 struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
644
645 return sprintf(buf, "%dx%d\n",
646 mdev_state->type->width,
647 mdev_state->type->height);
648}
649static DEVICE_ATTR_RO(resolution);
650
651static struct attribute *mdev_dev_attrs[] = {
652 &dev_attr_resolution.attr,
653 NULL,
654};
655
656static const struct attribute_group mdev_dev_group = {
657 .name = "vendor",
658 .attrs = mdev_dev_attrs,
659};
660
661const struct attribute_group *mdev_dev_groups[] = {
662 &mdev_dev_group,
663 NULL,
664};
665
666static ssize_t
667name_show(struct kobject *kobj, struct device *dev, char *buf)
668{
669 return sprintf(buf, "%s\n", kobj->name);
670}
671MDEV_TYPE_ATTR_RO(name);
672
673static ssize_t
674description_show(struct kobject *kobj, struct device *dev, char *buf)
675{
676 const struct mdpy_type *type = mdpy_find_type(kobj);
677
678 return sprintf(buf, "virtual display, %dx%d framebuffer\n",
679 type ? type->width : 0,
680 type ? type->height : 0);
681}
682MDEV_TYPE_ATTR_RO(description);
683
684static ssize_t
685available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
686{
687 return sprintf(buf, "%d\n", max_devices - mdpy_count);
688}
689MDEV_TYPE_ATTR_RO(available_instances);
690
691static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
692 char *buf)
693{
694 return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
695}
696MDEV_TYPE_ATTR_RO(device_api);
697
698static struct attribute *mdev_types_attrs[] = {
699 &mdev_type_attr_name.attr,
700 &mdev_type_attr_description.attr,
701 &mdev_type_attr_device_api.attr,
702 &mdev_type_attr_available_instances.attr,
703 NULL,
704};
705
706static struct attribute_group mdev_type_group1 = {
707 .name = MDPY_TYPE_1,
708 .attrs = mdev_types_attrs,
709};
710
711static struct attribute_group mdev_type_group2 = {
712 .name = MDPY_TYPE_2,
713 .attrs = mdev_types_attrs,
714};
715
716static struct attribute_group mdev_type_group3 = {
717 .name = MDPY_TYPE_3,
718 .attrs = mdev_types_attrs,
719};
720
721static struct attribute_group *mdev_type_groups[] = {
722 &mdev_type_group1,
723 &mdev_type_group2,
724 &mdev_type_group3,
725 NULL,
726};
727
728static const struct mdev_parent_ops mdev_fops = {
729 .owner = THIS_MODULE,
730 .mdev_attr_groups = mdev_dev_groups,
731 .supported_type_groups = mdev_type_groups,
732 .create = mdpy_create,
733 .remove = mdpy_remove,
734 .open = mdpy_open,
735 .release = mdpy_close,
736 .read = mdpy_read,
737 .write = mdpy_write,
738 .ioctl = mdpy_ioctl,
739 .mmap = mdpy_mmap,
740};
741
742static const struct file_operations vd_fops = {
743 .owner = THIS_MODULE,
744};
745
746static void mdpy_device_release(struct device *dev)
747{
748 /* nothing */
749}
750
751static int __init mdpy_dev_init(void)
752{
753 int ret = 0;
754
755 ret = alloc_chrdev_region(&mdpy_devt, 0, MINORMASK + 1, MDPY_NAME);
756 if (ret < 0) {
757 pr_err("Error: failed to register mdpy_dev, err: %d\n", ret);
758 return ret;
759 }
760 cdev_init(&mdpy_cdev, &vd_fops);
761 cdev_add(&mdpy_cdev, mdpy_devt, MINORMASK + 1);
762 pr_info("%s: major %d\n", __func__, MAJOR(mdpy_devt));
763
764 mdpy_class = class_create(THIS_MODULE, MDPY_CLASS_NAME);
765 if (IS_ERR(mdpy_class)) {
766 pr_err("Error: failed to register mdpy_dev class\n");
767 ret = PTR_ERR(mdpy_class);
768 goto failed1;
769 }
770 mdpy_dev.class = mdpy_class;
771 mdpy_dev.release = mdpy_device_release;
772 dev_set_name(&mdpy_dev, "%s", MDPY_NAME);
773
774 ret = device_register(&mdpy_dev);
775 if (ret)
776 goto failed2;
777
778 ret = mdev_register_device(&mdpy_dev, &mdev_fops);
779 if (ret)
780 goto failed3;
781
782 return 0;
783
784failed3:
785 device_unregister(&mdpy_dev);
786failed2:
787 class_destroy(mdpy_class);
788failed1:
789 cdev_del(&mdpy_cdev);
790 unregister_chrdev_region(mdpy_devt, MINORMASK + 1);
791 return ret;
792}
793
794static void __exit mdpy_dev_exit(void)
795{
796 mdpy_dev.bus = NULL;
797 mdev_unregister_device(&mdpy_dev);
798
799 device_unregister(&mdpy_dev);
800 cdev_del(&mdpy_cdev);
801 unregister_chrdev_region(mdpy_devt, MINORMASK + 1);
802 class_destroy(mdpy_class);
803 mdpy_class = NULL;
804}
805
806module_init(mdpy_dev_init)
807module_exit(mdpy_dev_exit)
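With the module loaded, instances are created through the usual mdev
sysfs interface. A minimal sketch, assuming the parent registers at the
path below (derived from the "mdpy" class and device name; verify on a
real system) and using an arbitrary example UUID:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *path = "/sys/devices/virtual/mdpy/mdpy/"
				   "mdev_supported_types/mdpy-xga/create";
		const char *uuid = "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001\n";
		int fd = open(path, O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, uuid, strlen(uuid)) < 0)
			perror("create");
		close(fd);
		return 0;
	}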
diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
new file mode 100644
index 000000000..ce84a300a
--- /dev/null
+++ b/samples/vfio-mdev/mtty.c
@@ -0,0 +1,1491 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Mediated virtual PCI serial host device driver
4 *
5 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
6 * Author: Neo Jia <cjia@nvidia.com>
7 * Kirti Wankhede <kwankhede@nvidia.com>
8 *
 9 * Sample driver that creates an mdev device simulating a serial port
10 * over a PCI card.
11 */
12
13#include <linux/init.h>
14#include <linux/module.h>
15#include <linux/device.h>
16#include <linux/kernel.h>
17#include <linux/fs.h>
18#include <linux/poll.h>
19#include <linux/slab.h>
20#include <linux/cdev.h>
21#include <linux/sched.h>
22#include <linux/wait.h>
23#include <linux/uuid.h>
24#include <linux/vfio.h>
25#include <linux/iommu.h>
26#include <linux/sysfs.h>
27#include <linux/ctype.h>
28#include <linux/file.h>
29#include <linux/mdev.h>
30#include <linux/pci.h>
31#include <linux/serial.h>
32#include <uapi/linux/serial_reg.h>
33#include <linux/eventfd.h>
34/*
35 * #defines
36 */
37
38#define VERSION_STRING "0.1"
39#define DRIVER_AUTHOR "NVIDIA Corporation"
40
41#define MTTY_CLASS_NAME "mtty"
42
43#define MTTY_NAME "mtty"
44
45#define MTTY_STRING_LEN 16
46
47#define MTTY_CONFIG_SPACE_SIZE 0xff
48#define MTTY_IO_BAR_SIZE 0x8
49#define MTTY_MMIO_BAR_SIZE 0x100000
50
51#define STORE_LE16(addr, val) (*(u16 *)addr = val)
52#define STORE_LE32(addr, val) (*(u32 *)addr = val)
53
54#define MAX_FIFO_SIZE 16
55
56#define CIRCULAR_BUF_INC_IDX(idx) (idx = (idx + 1) & (MAX_FIFO_SIZE - 1))
57
58#define MTTY_VFIO_PCI_OFFSET_SHIFT 40
59
60#define MTTY_VFIO_PCI_OFFSET_TO_INDEX(off) (off >> MTTY_VFIO_PCI_OFFSET_SHIFT)
61#define MTTY_VFIO_PCI_INDEX_TO_OFFSET(index) \
62 ((u64)(index) << MTTY_VFIO_PCI_OFFSET_SHIFT)
63#define MTTY_VFIO_PCI_OFFSET_MASK \
64 (((u64)(1) << MTTY_VFIO_PCI_OFFSET_SHIFT) - 1)
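/*
 * Worked example for the encoding above: pos = 0x10000000003 decodes to
 * index = pos >> 40 = 1 (VFIO_PCI_BAR1_REGION_INDEX) and
 * offset = pos & ((1ULL << 40) - 1) = 0x3, i.e. the LCR of the second
 * port on a dual-port mtty.
 */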
65#define MAX_MTTYS 24
66
67/*
68 * Global Structures
69 */
70
71static struct mtty_dev {
72 dev_t vd_devt;
73 struct class *vd_class;
74 struct cdev vd_cdev;
75 struct idr vd_idr;
76 struct device dev;
77} mtty_dev;
78
79struct mdev_region_info {
80 u64 start;
81 u64 phys_start;
82 u32 size;
83 u64 vfio_offset;
84};
85
86#if defined(DEBUG_REGS)
87static const char *wr_reg[] = {
88 "TX",
89 "IER",
90 "FCR",
91 "LCR",
92 "MCR",
93 "LSR",
94 "MSR",
95 "SCR"
96};
97
98static const char *rd_reg[] = {
99 "RX",
100 "IER",
101 "IIR",
102 "LCR",
103 "MCR",
104 "LSR",
105 "MSR",
106 "SCR"
107};
108#endif
109
110/* loop back buffer */
111struct rxtx {
112 u8 fifo[MAX_FIFO_SIZE];
113 u8 head, tail;
114 u8 count;
115};
116
117struct serial_port {
118 u8 uart_reg[8]; /* 8 registers */
119 struct rxtx rxtx; /* loop back buffer */
120 bool dlab;
121 bool overrun;
122 u16 divisor;
123 u8 fcr; /* FIFO control register */
124 u8 max_fifo_size;
125 u8 intr_trigger_level; /* interrupt trigger level */
126};
127
128/* State of each mdev device */
129struct mdev_state {
130 int irq_fd;
131 struct eventfd_ctx *intx_evtfd;
132 struct eventfd_ctx *msi_evtfd;
133 int irq_index;
134 u8 *vconfig;
135 struct mutex ops_lock;
136 struct mdev_device *mdev;
137 struct mdev_region_info region_info[VFIO_PCI_NUM_REGIONS];
138 u32 bar_mask[VFIO_PCI_NUM_REGIONS];
139 struct list_head next;
140 struct serial_port s[2];
141 struct mutex rxtx_lock;
142 struct vfio_device_info dev_info;
143 int nr_ports;
144};
145
146static struct mutex mdev_list_lock;
147static struct list_head mdev_devices_list;
148
149static const struct file_operations vd_fops = {
150 .owner = THIS_MODULE,
151};
152
153/* function prototypes */
154
155static int mtty_trigger_interrupt(struct mdev_state *mdev_state);
156
157/* Helper functions */
158
159static void dump_buffer(u8 *buf, uint32_t count)
160{
161#if defined(DEBUG)
162 int i;
163
164 pr_info("Buffer:\n");
165 for (i = 0; i < count; i++) {
166 pr_info("%2x ", *(buf + i));
167 if ((i + 1) % 16 == 0)
168 pr_info("\n");
169 }
170#endif
171}
172
173static void mtty_create_config_space(struct mdev_state *mdev_state)
174{
175 /* PCI dev ID */
176 STORE_LE32((u32 *) &mdev_state->vconfig[0x0], 0x32534348);
177
178 /* Control: I/O+, Mem-, BusMaster- */
179 STORE_LE16((u16 *) &mdev_state->vconfig[0x4], 0x0001);
180
181 /* Status: capabilities list absent */
182 STORE_LE16((u16 *) &mdev_state->vconfig[0x6], 0x0200);
183
184 /* Rev ID */
185 mdev_state->vconfig[0x8] = 0x10;
186
187 /* programming interface class : 16550-compatible serial controller */
188 mdev_state->vconfig[0x9] = 0x02;
189
190 /* Sub class : 00 */
191 mdev_state->vconfig[0xa] = 0x00;
192
193 /* Base class : Simple Communication controllers */
194 mdev_state->vconfig[0xb] = 0x07;
195
196 /* base address registers */
197 /* BAR0: IO space */
198 STORE_LE32((u32 *) &mdev_state->vconfig[0x10], 0x000001);
199 mdev_state->bar_mask[0] = ~(MTTY_IO_BAR_SIZE) + 1;
200
201 if (mdev_state->nr_ports == 2) {
202 /* BAR1: IO space */
203 STORE_LE32((u32 *) &mdev_state->vconfig[0x14], 0x000001);
204 mdev_state->bar_mask[1] = ~(MTTY_IO_BAR_SIZE) + 1;
205 }
206
207 /* Subsystem ID */
208 STORE_LE32((u32 *) &mdev_state->vconfig[0x2c], 0x32534348);
209
210 mdev_state->vconfig[0x34] = 0x00; /* Cap Ptr */
211 mdev_state->vconfig[0x3d] = 0x01; /* interrupt pin (INTA#) */
212
213 /* Vendor specific data */
214 mdev_state->vconfig[0x40] = 0x23;
215 mdev_state->vconfig[0x43] = 0x80;
216 mdev_state->vconfig[0x44] = 0x23;
217 mdev_state->vconfig[0x48] = 0x23;
218 mdev_state->vconfig[0x4c] = 0x23;
219
220 mdev_state->vconfig[0x60] = 0x50;
221 mdev_state->vconfig[0x61] = 0x43;
222 mdev_state->vconfig[0x62] = 0x49;
223 mdev_state->vconfig[0x63] = 0x20;
224 mdev_state->vconfig[0x64] = 0x53;
225 mdev_state->vconfig[0x65] = 0x65;
226 mdev_state->vconfig[0x66] = 0x72;
227 mdev_state->vconfig[0x67] = 0x69;
228 mdev_state->vconfig[0x68] = 0x61;
229 mdev_state->vconfig[0x69] = 0x6c;
230 mdev_state->vconfig[0x6a] = 0x2f;
231 mdev_state->vconfig[0x6b] = 0x55;
232 mdev_state->vconfig[0x6c] = 0x41;
233 mdev_state->vconfig[0x6d] = 0x52;
234 mdev_state->vconfig[0x6e] = 0x54;
235}
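/*
 * Note: the magic bytes above are ASCII. 0x32534348 reads "HCS2" in
 * little-endian byte order (used for both the vendor/device and
 * subsystem IDs), and the bytes at 0x60..0x6e spell "PCI Serial/UART".
 */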
236
237static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset,
238 u8 *buf, u32 count)
239{
240 u32 cfg_addr, bar_mask, bar_index = 0;
241
242 switch (offset) {
243 case 0x04: /* device control */
244 case 0x06: /* device status */
245 /* do nothing */
246 break;
247 case 0x3c: /* interrupt line */
248 mdev_state->vconfig[0x3c] = buf[0];
249 break;
250 case 0x3d:
251 /*
252 * Interrupt Pin is hardwired to INTA.
253 * This field is write protected by hardware
254 */
255 break;
256 case 0x10: /* BAR0 */
257 case 0x14: /* BAR1 */
258 if (offset == 0x10)
259 bar_index = 0;
260 else if (offset == 0x14)
261 bar_index = 1;
262
263 if ((mdev_state->nr_ports == 1) && (bar_index == 1)) {
264 STORE_LE32(&mdev_state->vconfig[offset], 0);
265 break;
266 }
267
268 cfg_addr = *(u32 *)buf;
269 pr_info("BAR%d addr 0x%x\n", bar_index, cfg_addr);
270
271 if (cfg_addr == 0xffffffff) {
272 bar_mask = mdev_state->bar_mask[bar_index];
273 cfg_addr = (cfg_addr & bar_mask);
274 }
275
276 cfg_addr |= (mdev_state->vconfig[offset] & 0x3ul);
277 STORE_LE32(&mdev_state->vconfig[offset], cfg_addr);
278 break;
279 case 0x18: /* BAR2 */
280 case 0x1c: /* BAR3 */
281 case 0x20: /* BAR4 */
282 STORE_LE32(&mdev_state->vconfig[offset], 0);
283 break;
284 default:
285 pr_info("PCI config write @0x%x of %d bytes not handled\n",
286 offset, count);
287 break;
288 }
289}
290
291static void handle_bar_write(unsigned int index, struct mdev_state *mdev_state,
292 u16 offset, u8 *buf, u32 count)
293{
294 u8 data = *buf;
295
296 /* Handle data written by guest */
297 switch (offset) {
298 case UART_TX:
299 /* if DLAB set, data is LSB of divisor */
300 if (mdev_state->s[index].dlab) {
301 mdev_state->s[index].divisor |= data;
302 break;
303 }
304
305 mutex_lock(&mdev_state->rxtx_lock);
306
307 /* save in TX buffer */
308 if (mdev_state->s[index].rxtx.count <
309 mdev_state->s[index].max_fifo_size) {
310 mdev_state->s[index].rxtx.fifo[
311 mdev_state->s[index].rxtx.head] = data;
312 mdev_state->s[index].rxtx.count++;
313 CIRCULAR_BUF_INC_IDX(mdev_state->s[index].rxtx.head);
314 mdev_state->s[index].overrun = false;
315
316 /*
317 * Trigger interrupt if receive data interrupt is
318 * enabled and fifo reached trigger level
319 */
320 if ((mdev_state->s[index].uart_reg[UART_IER] &
321 UART_IER_RDI) &&
322 (mdev_state->s[index].rxtx.count ==
323 mdev_state->s[index].intr_trigger_level)) {
324 /* trigger interrupt */
325#if defined(DEBUG_INTR)
326 pr_err("Serial port %d: Fifo level trigger\n",
327 index);
328#endif
329 mtty_trigger_interrupt(mdev_state);
330 }
331 } else {
332#if defined(DEBUG_INTR)
333 pr_err("Serial port %d: Buffer Overflow\n", index);
334#endif
335 mdev_state->s[index].overrun = true;
336
337 /*
338 * Trigger interrupt if receiver line status interrupt
339 * is enabled
340 */
341 if (mdev_state->s[index].uart_reg[UART_IER] &
342 UART_IER_RLSI)
343 mtty_trigger_interrupt(mdev_state);
344 }
345 mutex_unlock(&mdev_state->rxtx_lock);
346 break;
347
348 case UART_IER:
349 /* if DLAB set, data is MSB of divisor */
350 if (mdev_state->s[index].dlab)
351 mdev_state->s[index].divisor |= (u16)data << 8;
352 else {
353 mdev_state->s[index].uart_reg[offset] = data;
354 mutex_lock(&mdev_state->rxtx_lock);
355 if ((data & UART_IER_THRI) &&
356 (mdev_state->s[index].rxtx.head ==
357 mdev_state->s[index].rxtx.tail)) {
358#if defined(DEBUG_INTR)
359 pr_err("Serial port %d: IER_THRI write\n",
360 index);
361#endif
362 mtty_trigger_interrupt(mdev_state);
363 }
364
365 mutex_unlock(&mdev_state->rxtx_lock);
366 }
367
368 break;
369
370 case UART_FCR:
371 mdev_state->s[index].fcr = data;
372
373 mutex_lock(&mdev_state->rxtx_lock);
374 if (data & (UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT)) {
375 /* clear loop back FIFO */
376 mdev_state->s[index].rxtx.count = 0;
377 mdev_state->s[index].rxtx.head = 0;
378 mdev_state->s[index].rxtx.tail = 0;
379 }
380 mutex_unlock(&mdev_state->rxtx_lock);
381
382 switch (data & UART_FCR_TRIGGER_MASK) {
383 case UART_FCR_TRIGGER_1:
384 mdev_state->s[index].intr_trigger_level = 1;
385 break;
386
387 case UART_FCR_TRIGGER_4:
388 mdev_state->s[index].intr_trigger_level = 4;
389 break;
390
391 case UART_FCR_TRIGGER_8:
392 mdev_state->s[index].intr_trigger_level = 8;
393 break;
394
395 case UART_FCR_TRIGGER_14:
396 mdev_state->s[index].intr_trigger_level = 14;
397 break;
398 }
399
400 /*
401 * Otherwise set the trigger level to 1, or implement a timer with a
402 * timeout of 4 characters that, on expiry, sets the Receive data
403 * timeout in the IIR register.
404 */
405 mdev_state->s[index].intr_trigger_level = 1;
406 if (data & UART_FCR_ENABLE_FIFO)
407 mdev_state->s[index].max_fifo_size = MAX_FIFO_SIZE;
408 else {
409 mdev_state->s[index].max_fifo_size = 1;
410 mdev_state->s[index].intr_trigger_level = 1;
411 }
412
413 break;
414
415 case UART_LCR:
416 if (data & UART_LCR_DLAB) {
417 mdev_state->s[index].dlab = true;
418 mdev_state->s[index].divisor = 0;
419 } else
420 mdev_state->s[index].dlab = false;
421
422 mdev_state->s[index].uart_reg[offset] = data;
423 break;
424
425 case UART_MCR:
426 mdev_state->s[index].uart_reg[offset] = data;
427
428 if ((mdev_state->s[index].uart_reg[UART_IER] & UART_IER_MSI) &&
429 (data & UART_MCR_OUT2)) {
430#if defined(DEBUG_INTR)
431 pr_err("Serial port %d: MCR_OUT2 write\n", index);
432#endif
433 mtty_trigger_interrupt(mdev_state);
434 }
435
436 if ((mdev_state->s[index].uart_reg[UART_IER] & UART_IER_MSI) &&
437 (data & (UART_MCR_RTS | UART_MCR_DTR))) {
438#if defined(DEBUG_INTR)
439 pr_err("Serial port %d: MCR RTS/DTR write\n", index);
440#endif
441 mtty_trigger_interrupt(mdev_state);
442 }
443 break;
444
445 case UART_LSR:
446 case UART_MSR:
447 /* do nothing */
448 break;
449
450 case UART_SCR:
451 mdev_state->s[index].uart_reg[offset] = data;
452 break;
453
454 default:
455 break;
456 }
457}
458
459static void handle_bar_read(unsigned int index, struct mdev_state *mdev_state,
460 u16 offset, u8 *buf, u32 count)
461{
462 /* Handle read requests by guest */
463 switch (offset) {
464 case UART_RX:
465 /* if DLAB set, data is LSB of divisor */
466 if (mdev_state->s[index].dlab) {
467 *buf = (u8)mdev_state->s[index].divisor;
468 break;
469 }
470
471 mutex_lock(&mdev_state->rxtx_lock);
472 /* return data in tx buffer */
473 if (mdev_state->s[index].rxtx.head !=
474 mdev_state->s[index].rxtx.tail) {
475 *buf = mdev_state->s[index].rxtx.fifo[
476 mdev_state->s[index].rxtx.tail];
477 mdev_state->s[index].rxtx.count--;
478 CIRCULAR_BUF_INC_IDX(mdev_state->s[index].rxtx.tail);
479 }
480
481 if (mdev_state->s[index].rxtx.head ==
482 mdev_state->s[index].rxtx.tail) {
483 /*
484 * Trigger interrupt if tx buffer empty interrupt is
485 * enabled and fifo is empty
486 */
487#if defined(DEBUG_INTR)
488 pr_err("Serial port %d: Buffer Empty\n", index);
489#endif
490 if (mdev_state->s[index].uart_reg[UART_IER] &
491 UART_IER_THRI)
492 mtty_trigger_interrupt(mdev_state);
493 }
494 mutex_unlock(&mdev_state->rxtx_lock);
495
496 break;
497
498 case UART_IER:
499 if (mdev_state->s[index].dlab) {
500 *buf = (u8)(mdev_state->s[index].divisor >> 8);
501 break;
502 }
503 *buf = mdev_state->s[index].uart_reg[offset] & 0x0f;
504 break;
505
506 case UART_IIR:
507 {
508 u8 ier = mdev_state->s[index].uart_reg[UART_IER];
509 *buf = 0;
510
511 mutex_lock(&mdev_state->rxtx_lock);
512 /* Interrupt priority 1: Parity, overrun, framing or break */
513 if ((ier & UART_IER_RLSI) && mdev_state->s[index].overrun)
514 *buf |= UART_IIR_RLSI;
515
516 /* Interrupt priority 2: Fifo trigger level reached */
517 if ((ier & UART_IER_RDI) &&
518 (mdev_state->s[index].rxtx.count >=
519 mdev_state->s[index].intr_trigger_level))
520 *buf |= UART_IIR_RDI;
521
522 /* Interrupt priority 3: transmitter holding register empty */
523 if ((ier & UART_IER_THRI) &&
524 (mdev_state->s[index].rxtx.head ==
525 mdev_state->s[index].rxtx.tail))
526 *buf |= UART_IIR_THRI;
527
528 /* Interrupt priority 4: Modem status: CTS, DSR, RI or DCD */
529 if ((ier & UART_IER_MSI) &&
530 (mdev_state->s[index].uart_reg[UART_MCR] &
531 (UART_MCR_RTS | UART_MCR_DTR)))
532 *buf |= UART_IIR_MSI;
533
534 /* bit0: 0=> interrupt pending, 1=> no interrupt is pending */
535 if (*buf == 0)
536 *buf = UART_IIR_NO_INT;
537
538 /* set bit 6 & 7 to be 16550 compatible */
539 *buf |= 0xC0;
540 mutex_unlock(&mdev_state->rxtx_lock);
541 }
542 break;
543
544 case UART_LCR:
545 case UART_MCR:
546 *buf = mdev_state->s[index].uart_reg[offset];
547 break;
548
549 case UART_LSR:
550 {
551 u8 lsr = 0;
552
553 mutex_lock(&mdev_state->rxtx_lock);
554 /* at least one char in FIFO */
555 if (mdev_state->s[index].rxtx.head !=
556 mdev_state->s[index].rxtx.tail)
557 lsr |= UART_LSR_DR;
558
559 /* if FIFO overrun */
560 if (mdev_state->s[index].overrun)
561 lsr |= UART_LSR_OE;
562
563 /* transmit FIFO empty and transmitter empty */
564 if (mdev_state->s[index].rxtx.head ==
565 mdev_state->s[index].rxtx.tail)
566 lsr |= UART_LSR_TEMT | UART_LSR_THRE;
567
568 mutex_unlock(&mdev_state->rxtx_lock);
569 *buf = lsr;
570 break;
571 }
572 case UART_MSR:
573 *buf = UART_MSR_DSR | UART_MSR_DDSR | UART_MSR_DCD;
574
575 mutex_lock(&mdev_state->rxtx_lock);
576 /* if AFE is 1 and the FIFO has space, set CTS bit */
577 if (mdev_state->s[index].uart_reg[UART_MCR] &
578 UART_MCR_AFE) {
579 if (mdev_state->s[index].rxtx.count <
580 mdev_state->s[index].max_fifo_size)
581 *buf |= UART_MSR_CTS | UART_MSR_DCTS;
582 } else
583 *buf |= UART_MSR_CTS | UART_MSR_DCTS;
584 mutex_unlock(&mdev_state->rxtx_lock);
585
586 break;
587
588 case UART_SCR:
589 *buf = mdev_state->s[index].uart_reg[offset];
590 break;
591
592 default:
593 break;
594 }
595}
596
597static void mdev_read_base(struct mdev_state *mdev_state)
598{
599 int index, pos;
600 u32 start_lo, start_hi;
601 u32 mem_type;
602
603 pos = PCI_BASE_ADDRESS_0;
604
605 for (index = 0; index <= VFIO_PCI_BAR5_REGION_INDEX; index++) {
606
607 if (!mdev_state->region_info[index].size)
608 continue;
609
610 start_lo = (*(u32 *)(mdev_state->vconfig + pos)) &
611 PCI_BASE_ADDRESS_MEM_MASK;
612 mem_type = (*(u32 *)(mdev_state->vconfig + pos)) &
613 PCI_BASE_ADDRESS_MEM_TYPE_MASK;
614
615 switch (mem_type) {
616 case PCI_BASE_ADDRESS_MEM_TYPE_64:
617 start_hi = (*(u32 *)(mdev_state->vconfig + pos + 4));
618 pos += 4;
619 break;
620 case PCI_BASE_ADDRESS_MEM_TYPE_32:
621 case PCI_BASE_ADDRESS_MEM_TYPE_1M:
622 /* 1M mem BAR treated as 32-bit BAR */
623 default:
624 /* mem unknown type treated as 32-bit BAR */
625 start_hi = 0;
626 break;
627 }
628 pos += 4;
629 mdev_state->region_info[index].start = ((u64)start_hi << 32) |
630 start_lo;
631 }
632}
633
634static ssize_t mdev_access(struct mdev_device *mdev, u8 *buf, size_t count,
635 loff_t pos, bool is_write)
636{
637 struct mdev_state *mdev_state;
638 unsigned int index;
639 loff_t offset;
640 int ret = 0;
641
642 if (!mdev || !buf)
643 return -EINVAL;
644
645 mdev_state = mdev_get_drvdata(mdev);
646 if (!mdev_state) {
647 pr_err("%s mdev_state not found\n", __func__);
648 return -EINVAL;
649 }
650
651 mutex_lock(&mdev_state->ops_lock);
652
653 index = MTTY_VFIO_PCI_OFFSET_TO_INDEX(pos);
654 offset = pos & MTTY_VFIO_PCI_OFFSET_MASK;
655 switch (index) {
656 case VFIO_PCI_CONFIG_REGION_INDEX:
657
658#if defined(DEBUG)
659 pr_info("%s: PCI config space %s at offset 0x%llx\n",
660 __func__, is_write ? "write" : "read", offset);
661#endif
662 if (is_write) {
663 dump_buffer(buf, count);
664 handle_pci_cfg_write(mdev_state, offset, buf, count);
665 } else {
666 memcpy(buf, (mdev_state->vconfig + offset), count);
667 dump_buffer(buf, count);
668 }
669
670 break;
671
672 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
673 if (!mdev_state->region_info[index].start)
674 mdev_read_base(mdev_state);
675
676 if (is_write) {
677 dump_buffer(buf, count);
678
679#if defined(DEBUG_REGS)
680 pr_info("%s: BAR%d WR @0x%llx %s val:0x%02x dlab:%d\n",
681 __func__, index, offset, wr_reg[offset],
682 *buf, mdev_state->s[index].dlab);
683#endif
684 handle_bar_write(index, mdev_state, offset, buf, count);
685 } else {
686 handle_bar_read(index, mdev_state, offset, buf, count);
687 dump_buffer(buf, count);
688
689#if defined(DEBUG_REGS)
690 pr_info("%s: BAR%d RD @0x%llx %s val:0x%02x dlab:%d\n",
691 __func__, index, offset, rd_reg[offset],
692 *buf, mdev_state->s[index].dlab);
693#endif
694 }
695 break;
696
697 default:
698 ret = -1;
699 goto accessfailed;
700 }
701
702 ret = count;
703
704
705accessfailed:
706 mutex_unlock(&mdev_state->ops_lock);
707
708 return ret;
709}
710
711static int mtty_create(struct kobject *kobj, struct mdev_device *mdev)
712{
713 struct mdev_state *mdev_state;
714 char name[MTTY_STRING_LEN];
715 int nr_ports = 0, i;
716
717 if (!mdev)
718 return -EINVAL;
719
720 for (i = 0; i < 2; i++) {
721 snprintf(name, MTTY_STRING_LEN, "%s-%d",
722 dev_driver_string(mdev_parent_dev(mdev)), i + 1);
723 if (!strcmp(kobj->name, name)) {
724 nr_ports = i + 1;
725 break;
726 }
727 }
728
729 if (!nr_ports)
730 return -EINVAL;
731
732 mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL);
733 if (mdev_state == NULL)
734 return -ENOMEM;
735
736 mdev_state->nr_ports = nr_ports;
737 mdev_state->irq_index = -1;
738 mdev_state->s[0].max_fifo_size = MAX_FIFO_SIZE;
739 mdev_state->s[1].max_fifo_size = MAX_FIFO_SIZE;
740 mutex_init(&mdev_state->rxtx_lock);
741 mdev_state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL);
742
743 if (mdev_state->vconfig == NULL) {
744 kfree(mdev_state);
745 return -ENOMEM;
746 }
747
748 mutex_init(&mdev_state->ops_lock);
749 mdev_state->mdev = mdev;
750 mdev_set_drvdata(mdev, mdev_state);
751
752 mtty_create_config_space(mdev_state);
753
754 mutex_lock(&mdev_list_lock);
755 list_add(&mdev_state->next, &mdev_devices_list);
756 mutex_unlock(&mdev_list_lock);
757
758 return 0;
759}
760
761static int mtty_remove(struct mdev_device *mdev)
762{
763 struct mdev_state *mds, *tmp_mds;
764 struct mdev_state *mdev_state = mdev_get_drvdata(mdev);
765 int ret = -EINVAL;
766
767 mutex_lock(&mdev_list_lock);
768 list_for_each_entry_safe(mds, tmp_mds, &mdev_devices_list, next) {
769 if (mdev_state == mds) {
770 list_del(&mdev_state->next);
771 mdev_set_drvdata(mdev, NULL);
772 kfree(mdev_state->vconfig);
773 kfree(mdev_state);
774 ret = 0;
775 break;
776 }
777 }
778 mutex_unlock(&mdev_list_lock);
779
780 return ret;
781}
782
783static int mtty_reset(struct mdev_device *mdev)
784{
785 struct mdev_state *mdev_state;
786
787 if (!mdev)
788 return -EINVAL;
789
790 mdev_state = mdev_get_drvdata(mdev);
791 if (!mdev_state)
792 return -EINVAL;
793
794 pr_info("%s: called\n", __func__);
795
796 return 0;
797}
798
799static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf,
800 size_t count, loff_t *ppos)
801{
802 unsigned int done = 0;
803 int ret;
804
805 while (count) {
806 size_t filled;
807
808 if (count >= 4 && !(*ppos % 4)) {
809 u32 val;
810
811 ret = mdev_access(mdev, (u8 *)&val, sizeof(val),
812 *ppos, false);
813 if (ret <= 0)
814 goto read_err;
815
816 if (copy_to_user(buf, &val, sizeof(val)))
817 goto read_err;
818
819 filled = 4;
820 } else if (count >= 2 && !(*ppos % 2)) {
821 u16 val;
822
823 ret = mdev_access(mdev, (u8 *)&val, sizeof(val),
824 *ppos, false);
825 if (ret <= 0)
826 goto read_err;
827
828 if (copy_to_user(buf, &val, sizeof(val)))
829 goto read_err;
830
831 filled = 2;
832 } else {
833 u8 val;
834
835 ret = mdev_access(mdev, (u8 *)&val, sizeof(val),
836 *ppos, false);
837 if (ret <= 0)
838 goto read_err;
839
840 if (copy_to_user(buf, &val, sizeof(val)))
841 goto read_err;
842
843 filled = 1;
844 }
845
846 count -= filled;
847 done += filled;
848 *ppos += filled;
849 buf += filled;
850 }
851
852 return done;
853
854read_err:
855 return -EFAULT;
856}
857
858static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf,
859 size_t count, loff_t *ppos)
860{
861 unsigned int done = 0;
862 int ret;
863
864 while (count) {
865 size_t filled;
866
867 if (count >= 4 && !(*ppos % 4)) {
868 u32 val;
869
870 if (copy_from_user(&val, buf, sizeof(val)))
871 goto write_err;
872
873 ret = mdev_access(mdev, (u8 *)&val, sizeof(val),
874 *ppos, true);
875 if (ret <= 0)
876 goto write_err;
877
878 filled = 4;
879 } else if (count >= 2 && !(*ppos % 2)) {
880 u16 val;
881
882 if (copy_from_user(&val, buf, sizeof(val)))
883 goto write_err;
884
885 ret = mdev_access(mdev, (u8 *)&val, sizeof(val),
886 *ppos, true);
887 if (ret <= 0)
888 goto write_err;
889
890 filled = 2;
891 } else {
892 u8 val;
893
894 if (copy_from_user(&val, buf, sizeof(val)))
895 goto write_err;
896
897 ret = mdev_access(mdev, (u8 *)&val, sizeof(val),
898 *ppos, true);
899 if (ret <= 0)
900 goto write_err;
901
902 filled = 1;
903 }
904 count -= filled;
905 done += filled;
906 *ppos += filled;
907 buf += filled;
908 }
909
910 return done;
911write_err:
912 return -EFAULT;
913}
914
915static int mtty_set_irqs(struct mdev_device *mdev, uint32_t flags,
916 unsigned int index, unsigned int start,
917 unsigned int count, void *data)
918{
919 int ret = 0;
920 struct mdev_state *mdev_state;
921
922 if (!mdev)
923 return -EINVAL;
924
925 mdev_state = mdev_get_drvdata(mdev);
926 if (!mdev_state)
927 return -EINVAL;
928
929 mutex_lock(&mdev_state->ops_lock);
930 switch (index) {
931 case VFIO_PCI_INTX_IRQ_INDEX:
932 switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
933 case VFIO_IRQ_SET_ACTION_MASK:
934 case VFIO_IRQ_SET_ACTION_UNMASK:
935 break;
936 case VFIO_IRQ_SET_ACTION_TRIGGER:
937 {
938 if (flags & VFIO_IRQ_SET_DATA_NONE) {
939 pr_info("%s: disable INTx\n", __func__);
940 if (mdev_state->intx_evtfd)
941 eventfd_ctx_put(mdev_state->intx_evtfd);
942 break;
943 }
944
945 if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
946 int fd = *(int *)data;
947
948 if (fd > 0) {
949 struct eventfd_ctx *evt;
950
951 evt = eventfd_ctx_fdget(fd);
952 if (IS_ERR(evt)) {
953 ret = PTR_ERR(evt);
954 break;
955 }
956 mdev_state->intx_evtfd = evt;
957 mdev_state->irq_fd = fd;
958 mdev_state->irq_index = index;
959 break;
960 }
961 }
962 break;
963 }
964 }
965 break;
966 case VFIO_PCI_MSI_IRQ_INDEX:
967 switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
968 case VFIO_IRQ_SET_ACTION_MASK:
969 case VFIO_IRQ_SET_ACTION_UNMASK:
970 break;
971 case VFIO_IRQ_SET_ACTION_TRIGGER:
972 if (flags & VFIO_IRQ_SET_DATA_NONE) {
973 if (mdev_state->msi_evtfd)
974 eventfd_ctx_put(mdev_state->msi_evtfd);
975 pr_info("%s: disable MSI\n", __func__);
976 mdev_state->irq_index = VFIO_PCI_INTX_IRQ_INDEX;
977 break;
978 }
979 if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
980 int fd = *(int *)data;
981 struct eventfd_ctx *evt;
982
983 if (fd <= 0)
984 break;
985
986 if (mdev_state->msi_evtfd)
987 break;
988
989 evt = eventfd_ctx_fdget(fd);
990 if (IS_ERR(evt)) {
991 ret = PTR_ERR(evt);
992 break;
993 }
994 mdev_state->msi_evtfd = evt;
995 mdev_state->irq_fd = fd;
996 mdev_state->irq_index = index;
997 }
998 break;
999 }
1000 break;
1001 case VFIO_PCI_MSIX_IRQ_INDEX:
1002 pr_info("%s: MSIX_IRQ\n", __func__);
1003 break;
1004 case VFIO_PCI_ERR_IRQ_INDEX:
1005 pr_info("%s: ERR_IRQ\n", __func__);
1006 break;
1007 case VFIO_PCI_REQ_IRQ_INDEX:
1008 pr_info("%s: REQ_IRQ\n", __func__);
1009 break;
1010 }
1011
1012 mutex_unlock(&mdev_state->ops_lock);
1013 return ret;
1014}
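/*
 * Note: userspace arms these interrupts with VFIO_DEVICE_SET_IRQS.
 * A rough sketch (not from this commit) for INTx:
 *
 *	struct { struct vfio_irq_set set; int fd; } s = {
 *		.set = {
 *			.argsz = sizeof(s),
 *			.flags = VFIO_IRQ_SET_DATA_EVENTFD |
 *				 VFIO_IRQ_SET_ACTION_TRIGGER,
 *			.index = VFIO_PCI_INTX_IRQ_INDEX,
 *			.count = 1,
 *		},
 *		.fd = eventfd(0, 0),
 *	};
 *	ioctl(device_fd, VFIO_DEVICE_SET_IRQS, &s);
 *
 * mtty_trigger_interrupt() then signals that eventfd on UART events.
 */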
1015
1016static int mtty_trigger_interrupt(struct mdev_state *mdev_state)
1017{
1018 int ret = -1;
1019
1020 if ((mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX) &&
1021 (!mdev_state->msi_evtfd))
1022 return -EINVAL;
1023 else if ((mdev_state->irq_index == VFIO_PCI_INTX_IRQ_INDEX) &&
1024 (!mdev_state->intx_evtfd)) {
1025 pr_info("%s: Intr eventfd not found\n", __func__);
1026 return -EINVAL;
1027 }
1028
1029 if (mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX)
1030 ret = eventfd_signal(mdev_state->msi_evtfd, 1);
1031 else
1032 ret = eventfd_signal(mdev_state->intx_evtfd, 1);
1033
1034#if defined(DEBUG_INTR)
1035 pr_info("Intx triggered\n");
1036#endif
1037 if (ret != 1)
1038 pr_err("%s: eventfd signal failed (%d)\n", __func__, ret);
1039
1040 return ret;
1041}
1042
1043static int mtty_get_region_info(struct mdev_device *mdev,
1044 struct vfio_region_info *region_info,
1045 u16 *cap_type_id, void **cap_type)
1046{
1047 unsigned int size = 0;
1048 struct mdev_state *mdev_state;
1049 u32 bar_index;
1050
1051 if (!mdev)
1052 return -EINVAL;
1053
1054 mdev_state = mdev_get_drvdata(mdev);
1055 if (!mdev_state)
1056 return -EINVAL;
1057
1058 bar_index = region_info->index;
1059 if (bar_index >= VFIO_PCI_NUM_REGIONS)
1060 return -EINVAL;
1061
1062 mutex_lock(&mdev_state->ops_lock);
1063
1064 switch (bar_index) {
1065 case VFIO_PCI_CONFIG_REGION_INDEX:
1066 size = MTTY_CONFIG_SPACE_SIZE;
1067 break;
1068 case VFIO_PCI_BAR0_REGION_INDEX:
1069 size = MTTY_IO_BAR_SIZE;
1070 break;
1071 case VFIO_PCI_BAR1_REGION_INDEX:
1072 if (mdev_state->nr_ports == 2)
1073 size = MTTY_IO_BAR_SIZE;
1074 break;
1075 default:
1076 size = 0;
1077 break;
1078 }
1079
1080 mdev_state->region_info[bar_index].size = size;
1081 mdev_state->region_info[bar_index].vfio_offset =
1082 MTTY_VFIO_PCI_INDEX_TO_OFFSET(bar_index);
1083
1084 region_info->size = size;
1085 region_info->offset = MTTY_VFIO_PCI_INDEX_TO_OFFSET(bar_index);
1086 region_info->flags = VFIO_REGION_INFO_FLAG_READ |
1087 VFIO_REGION_INFO_FLAG_WRITE;
1088 mutex_unlock(&mdev_state->ops_lock);
1089 return 0;
1090}
1091
1092static int mtty_get_irq_info(struct mdev_device *mdev,
1093 struct vfio_irq_info *irq_info)
1094{
1095 switch (irq_info->index) {
1096 case VFIO_PCI_INTX_IRQ_INDEX:
1097 case VFIO_PCI_MSI_IRQ_INDEX:
1098 case VFIO_PCI_REQ_IRQ_INDEX:
1099 break;
1100
1101 default:
1102 return -EINVAL;
1103 }
1104
1105 irq_info->flags = VFIO_IRQ_INFO_EVENTFD;
1106 irq_info->count = 1;
1107
1108 if (irq_info->index == VFIO_PCI_INTX_IRQ_INDEX)
1109 irq_info->flags |= (VFIO_IRQ_INFO_MASKABLE |
1110 VFIO_IRQ_INFO_AUTOMASKED);
1111 else
1112 irq_info->flags |= VFIO_IRQ_INFO_NORESIZE;
1113
1114 return 0;
1115}
1116
1117static int mtty_get_device_info(struct mdev_device *mdev,
1118 struct vfio_device_info *dev_info)
1119{
1120 dev_info->flags = VFIO_DEVICE_FLAGS_PCI;
1121 dev_info->num_regions = VFIO_PCI_NUM_REGIONS;
1122 dev_info->num_irqs = VFIO_PCI_NUM_IRQS;
1123
1124 return 0;
1125}
1126
1127static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd,
1128 unsigned long arg)
1129{
1130 int ret = 0;
1131 unsigned long minsz;
1132 struct mdev_state *mdev_state;
1133
1134 if (!mdev)
1135 return -EINVAL;
1136
1137 mdev_state = mdev_get_drvdata(mdev);
1138 if (!mdev_state)
1139 return -ENODEV;
1140
1141 switch (cmd) {
1142 case VFIO_DEVICE_GET_INFO:
1143 {
1144 struct vfio_device_info info;
1145
1146 minsz = offsetofend(struct vfio_device_info, num_irqs);
1147
1148 if (copy_from_user(&info, (void __user *)arg, minsz))
1149 return -EFAULT;
1150
1151 if (info.argsz < minsz)
1152 return -EINVAL;
1153
1154 ret = mtty_get_device_info(mdev, &info);
1155 if (ret)
1156 return ret;
1157
1158 memcpy(&mdev_state->dev_info, &info, sizeof(info));
1159
1160 if (copy_to_user((void __user *)arg, &info, minsz))
1161 return -EFAULT;
1162
1163 return 0;
1164 }
1165 case VFIO_DEVICE_GET_REGION_INFO:
1166 {
1167 struct vfio_region_info info;
1168 u16 cap_type_id = 0;
1169 void *cap_type = NULL;
1170
1171 minsz = offsetofend(struct vfio_region_info, offset);
1172
1173 if (copy_from_user(&info, (void __user *)arg, minsz))
1174 return -EFAULT;
1175
1176 if (info.argsz < minsz)
1177 return -EINVAL;
1178
1179 ret = mtty_get_region_info(mdev, &info, &cap_type_id,
1180 &cap_type);
1181 if (ret)
1182 return ret;
1183
1184 if (copy_to_user((void __user *)arg, &info, minsz))
1185 return -EFAULT;
1186
1187 return 0;
1188 }
1189
1190 case VFIO_DEVICE_GET_IRQ_INFO:
1191 {
1192 struct vfio_irq_info info;
1193
1194 minsz = offsetofend(struct vfio_irq_info, count);
1195
1196 if (copy_from_user(&info, (void __user *)arg, minsz))
1197 return -EFAULT;
1198
1199 if ((info.argsz < minsz) ||
1200 (info.index >= mdev_state->dev_info.num_irqs))
1201 return -EINVAL;
1202
1203 ret = mtty_get_irq_info(mdev, &info);
1204 if (ret)
1205 return ret;
1206
1207 if (copy_to_user((void __user *)arg, &info, minsz))
1208 return -EFAULT;
1209
1210 return 0;
1211 }
1212 case VFIO_DEVICE_SET_IRQS:
1213 {
1214 struct vfio_irq_set hdr;
1215 u8 *data = NULL, *ptr = NULL;
1216 size_t data_size = 0;
1217
1218 minsz = offsetofend(struct vfio_irq_set, count);
1219
1220 if (copy_from_user(&hdr, (void __user *)arg, minsz))
1221 return -EFAULT;
1222
1223 ret = vfio_set_irqs_validate_and_prepare(&hdr,
1224 mdev_state->dev_info.num_irqs,
1225 VFIO_PCI_NUM_IRQS,
1226 &data_size);
1227 if (ret)
1228 return ret;
1229
1230 if (data_size) {
1231 ptr = data = memdup_user((void __user *)(arg + minsz),
1232 data_size);
1233 if (IS_ERR(data))
1234 return PTR_ERR(data);
1235 }
1236
1237 ret = mtty_set_irqs(mdev, hdr.flags, hdr.index, hdr.start,
1238 hdr.count, data);
1239
1240 kfree(ptr);
1241 return ret;
1242 }
1243 case VFIO_DEVICE_RESET:
1244 return mtty_reset(mdev);
1245 }
1246 return -ENOTTY;
1247}
1248
1249static int mtty_open(struct mdev_device *mdev)
1250{
1251 pr_info("%s\n", __func__);
1252 return 0;
1253}
1254
1255static void mtty_close(struct mdev_device *mdev)
1256{
1257 pr_info("%s\n", __func__);
1258}
1259
1260static ssize_t
1261sample_mtty_dev_show(struct device *dev, struct device_attribute *attr,
1262 char *buf)
1263{
1264 return sprintf(buf, "This is phy device\n");
1265}
1266
1267static DEVICE_ATTR_RO(sample_mtty_dev);
1268
1269static struct attribute *mtty_dev_attrs[] = {
1270 &dev_attr_sample_mtty_dev.attr,
1271 NULL,
1272};
1273
1274static const struct attribute_group mtty_dev_group = {
1275 .name = "mtty_dev",
1276 .attrs = mtty_dev_attrs,
1277};
1278
1279static const struct attribute_group *mtty_dev_groups[] = {
1280 &mtty_dev_group,
1281 NULL,
1282};
1283
1284static ssize_t
1285sample_mdev_dev_show(struct device *dev, struct device_attribute *attr,
1286 char *buf)
1287{
1288 if (mdev_from_dev(dev))
1289 return sprintf(buf, "This is MDEV %s\n", dev_name(dev));
1290
1291 return sprintf(buf, "\n");
1292}
1293
1294static DEVICE_ATTR_RO(sample_mdev_dev);
1295
1296static struct attribute *mdev_dev_attrs[] = {
1297 &dev_attr_sample_mdev_dev.attr,
1298 NULL,
1299};
1300
1301static const struct attribute_group mdev_dev_group = {
1302 .name = "vendor",
1303 .attrs = mdev_dev_attrs,
1304};
1305
1306static const struct attribute_group *mdev_dev_groups[] = {
1307 &mdev_dev_group,
1308 NULL,
1309};
1310
1311static ssize_t
1312name_show(struct kobject *kobj, struct device *dev, char *buf)
1313{
1314 char name[MTTY_STRING_LEN];
1315 int i;
1316 const char *name_str[2] = {"Single port serial", "Dual port serial"};
1317
1318 for (i = 0; i < 2; i++) {
1319 snprintf(name, MTTY_STRING_LEN, "%s-%d",
1320 dev_driver_string(dev), i + 1);
1321 if (!strcmp(kobj->name, name))
1322 return sprintf(buf, "%s\n", name_str[i]);
1323 }
1324
1325 return -EINVAL;
1326}
1327
1328static MDEV_TYPE_ATTR_RO(name);
1329
1330static ssize_t
1331available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
1332{
1333 char name[MTTY_STRING_LEN];
1334 int i;
1335 struct mdev_state *mds;
1336 int ports = 0, used = 0;
1337
1338 for (i = 0; i < 2; i++) {
1339 snprintf(name, MTTY_STRING_LEN, "%s-%d",
1340 dev_driver_string(dev), i + 1);
1341 if (!strcmp(kobj->name, name)) {
1342 ports = i + 1;
1343 break;
1344 }
1345 }
1346
1347 if (!ports)
1348 return -EINVAL;
1349
1350 list_for_each_entry(mds, &mdev_devices_list, next)
1351 used += mds->nr_ports;
1352
1353 return sprintf(buf, "%d\n", (MAX_MTTYS - used)/ports);
1354}
1355
1356static MDEV_TYPE_ATTR_RO(available_instances);
1357
1358
1359static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
1360 char *buf)
1361{
1362 return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
1363}
1364
1365static MDEV_TYPE_ATTR_RO(device_api);
1366
1367static struct attribute *mdev_types_attrs[] = {
1368 &mdev_type_attr_name.attr,
1369 &mdev_type_attr_device_api.attr,
1370 &mdev_type_attr_available_instances.attr,
1371 NULL,
1372};
1373
1374static struct attribute_group mdev_type_group1 = {
1375 .name = "1",
1376 .attrs = mdev_types_attrs,
1377};
1378
1379static struct attribute_group mdev_type_group2 = {
1380 .name = "2",
1381 .attrs = mdev_types_attrs,
1382};
1383
1384static struct attribute_group *mdev_type_groups[] = {
1385 &mdev_type_group1,
1386 &mdev_type_group2,
1387 NULL,
1388};
1389
1390static const struct mdev_parent_ops mdev_fops = {
1391 .owner = THIS_MODULE,
1392 .dev_attr_groups = mtty_dev_groups,
1393 .mdev_attr_groups = mdev_dev_groups,
1394 .supported_type_groups = mdev_type_groups,
1395 .create = mtty_create,
1396 .remove = mtty_remove,
1397 .open = mtty_open,
1398 .release = mtty_close,
1399 .read = mtty_read,
1400 .write = mtty_write,
1401 .ioctl = mtty_ioctl,
1402};
1403
1404static void mtty_device_release(struct device *dev)
1405{
1406 dev_dbg(dev, "mtty: released\n");
1407}
1408
1409static int __init mtty_dev_init(void)
1410{
1411 int ret = 0;
1412
1413 pr_info("mtty_dev: %s\n", __func__);
1414
1415 memset(&mtty_dev, 0, sizeof(mtty_dev));
1416
1417 idr_init(&mtty_dev.vd_idr);
1418
1419 ret = alloc_chrdev_region(&mtty_dev.vd_devt, 0, MINORMASK + 1,
1420 MTTY_NAME);
1421
1422 if (ret < 0) {
1423 pr_err("Error: failed to register mtty_dev, err:%d\n", ret);
1424 return ret;
1425 }
1426
1427 cdev_init(&mtty_dev.vd_cdev, &vd_fops);
1428 cdev_add(&mtty_dev.vd_cdev, mtty_dev.vd_devt, MINORMASK + 1);
1429
1430 pr_info("major_number:%d\n", MAJOR(mtty_dev.vd_devt));
1431
1432 mtty_dev.vd_class = class_create(THIS_MODULE, MTTY_CLASS_NAME);
1433
1434 if (IS_ERR(mtty_dev.vd_class)) {
1435 pr_err("Error: failed to register mtty_dev class\n");
1436 ret = PTR_ERR(mtty_dev.vd_class);
1437 goto failed1;
1438 }
1439
1440 mtty_dev.dev.class = mtty_dev.vd_class;
1441 mtty_dev.dev.release = mtty_device_release;
1442 dev_set_name(&mtty_dev.dev, "%s", MTTY_NAME);
1443
1444 ret = device_register(&mtty_dev.dev);
1445 if (ret)
1446 goto failed2;
1447
1448 ret = mdev_register_device(&mtty_dev.dev, &mdev_fops);
1449 if (ret)
1450 goto failed3;
1451
1452 mutex_init(&mdev_list_lock);
1453 INIT_LIST_HEAD(&mdev_devices_list);
1454
1455 goto all_done;
1456
1457failed3:
1458
1459 device_unregister(&mtty_dev.dev);
1460failed2:
1461 class_destroy(mtty_dev.vd_class);
1462
1463failed1:
1464 cdev_del(&mtty_dev.vd_cdev);
1465 unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1);
1466
1467all_done:
1468 return ret;
1469}
1470
1471static void __exit mtty_dev_exit(void)
1472{
1473 mtty_dev.dev.bus = NULL;
1474 mdev_unregister_device(&mtty_dev.dev);
1475
1476 device_unregister(&mtty_dev.dev);
1477 idr_destroy(&mtty_dev.vd_idr);
1478 cdev_del(&mtty_dev.vd_cdev);
1479 unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1);
1480 class_destroy(mtty_dev.vd_class);
1481 mtty_dev.vd_class = NULL;
1482 pr_info("mtty_dev: Unloaded!\n");
1483}
1484
1485module_init(mtty_dev_init)
1486module_exit(mtty_dev_exit)
1487
1488MODULE_LICENSE("GPL v2");
1489MODULE_INFO(supported, "Test driver that simulates a serial port over PCI");
1490MODULE_VERSION(VERSION_STRING);
1491MODULE_AUTHOR(DRIVER_AUTHOR);
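mtty_dev_init() unwinds in strict reverse order of what has already succeeded, via the failed3/failed2/failed1 labels. A minimal sketch of that idiom, with hypothetical setup_a()/setup_b() helpers standing in for the chrdev/class/device steps above:

/* Hedged sketch of the reverse-order goto unwind used by mtty_dev_init();
 * setup_a/setup_b and teardown_a are illustrative stand-ins.
 */
static int __init example_init(void)
{
	int ret;

	ret = setup_a();
	if (ret)
		return ret;		/* nothing to unwind yet */

	ret = setup_b();
	if (ret)
		goto undo_a;		/* unwind only what succeeded */

	return 0;

undo_a:
	teardown_a();
	return ret;
}

mtty_dev_exit() then repeats the full teardown for the success path, again working backwards from the most recent registration.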
diff --git a/samples/vfs/.gitignore b/samples/vfs/.gitignore
new file mode 100644
index 000000000..8fdabf7e5
--- /dev/null
+++ b/samples/vfs/.gitignore
@@ -0,0 +1,3 @@
1# SPDX-License-Identifier: GPL-2.0-only
2test-fsmount
3test-statx
diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile
new file mode 100644
index 000000000..6377a6781
--- /dev/null
+++ b/samples/vfs/Makefile
@@ -0,0 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
2userprogs-always-y += test-fsmount test-statx
3
4userccflags += -I usr/include
diff --git a/samples/vfs/test-fsmount.c b/samples/vfs/test-fsmount.c
new file mode 100644
index 000000000..50f47b72e
--- /dev/null
+++ b/samples/vfs/test-fsmount.c
@@ -0,0 +1,129 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/* fd-based mount test.
3 *
4 * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
5 * Written by David Howells (dhowells@redhat.com)
6 */
7
8#include <stdio.h>
9#include <stdlib.h>
10#include <unistd.h>
11#include <errno.h>
12#include <fcntl.h>
13#include <sys/prctl.h>
14#include <sys/wait.h>
15#include <linux/mount.h>
16#include <linux/unistd.h>
17
18#define E(x) do { if ((x) == -1) { perror(#x); exit(1); } } while(0)
19
20static void check_messages(int fd)
21{
22 char buf[4096];
23 int err, n;
24
25 err = errno;
26
27 for (;;) {
28 n = read(fd, buf, sizeof(buf));
29 if (n < 0)
30 break;
31 n -= 2;
32
33 switch (buf[0]) {
34 case 'e':
35 fprintf(stderr, "Error: %*.*s\n", n, n, buf + 2);
36 break;
37 case 'w':
38 fprintf(stderr, "Warning: %*.*s\n", n, n, buf + 2);
39 break;
40 case 'i':
41 fprintf(stderr, "Info: %*.*s\n", n, n, buf + 2);
42 break;
43 }
44 }
45
46 errno = err;
47}
48
49static __attribute__((noreturn))
50void mount_error(int fd, const char *s)
51{
52 check_messages(fd);
53 fprintf(stderr, "%s: %m\n", s);
54 exit(1);
55}
56
57/* Hope -1 isn't a syscall */
58#ifndef __NR_fsopen
59#define __NR_fsopen -1
60#endif
61#ifndef __NR_fsmount
62#define __NR_fsmount -1
63#endif
64#ifndef __NR_fsconfig
65#define __NR_fsconfig -1
66#endif
67#ifndef __NR_move_mount
68#define __NR_move_mount -1
69#endif
70
71
72static inline int fsopen(const char *fs_name, unsigned int flags)
73{
74 return syscall(__NR_fsopen, fs_name, flags);
75}
76
77static inline int fsmount(int fsfd, unsigned int flags, unsigned int ms_flags)
78{
79 return syscall(__NR_fsmount, fsfd, flags, ms_flags);
80}
81
82static inline int fsconfig(int fsfd, unsigned int cmd,
83 const char *key, const void *val, int aux)
84{
85 return syscall(__NR_fsconfig, fsfd, cmd, key, val, aux);
86}
87
88static inline int move_mount(int from_dfd, const char *from_pathname,
89 int to_dfd, const char *to_pathname,
90 unsigned int flags)
91{
92 return syscall(__NR_move_mount,
93 from_dfd, from_pathname,
94 to_dfd, to_pathname, flags);
95}
96
97#define E_fsconfig(fd, cmd, key, val, aux) \
98 do { \
99 if (fsconfig(fd, cmd, key, val, aux) == -1) \
100 mount_error(fd, key ?: "create"); \
101 } while (0)
102
103int main(int argc, char *argv[])
104{
105 int fsfd, mfd;
106
107 /* Mount a publicly available AFS filesystem */
108 fsfd = fsopen("afs", 0);
109 if (fsfd == -1) {
110 perror("fsopen");
111 exit(1);
112 }
113
114 E_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "#grand.central.org:root.cell.", 0);
115 E_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
116
117 mfd = fsmount(fsfd, 0, MOUNT_ATTR_RDONLY);
118 if (mfd < 0)
119 mount_error(fsfd, "fsmount");
120 E(close(fsfd));
121
122 if (move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH) < 0) {
123 perror("move_mount");
124 exit(1);
125 }
126
127 E(close(mfd));
128 exit(0);
129}
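The main() above drives the fd-based mount API end to end: fsopen() creates a filesystem context, fsconfig() sets parameters and triggers creation, fsmount() turns the context into a mount object, and move_mount() attaches it to the tree. For comparison, a hedged sketch of the same sequence against tmpfs, reusing the wrappers and macros defined above; the "size" option and the /mnt target are illustrative choices, not part of the sample:

/* Hedged sketch: the wrappers above applied to tmpfs. */
static void mount_tmpfs_example(void)
{
	int sfd, mfd;

	sfd = fsopen("tmpfs", 0);
	if (sfd == -1) {
		perror("fsopen");
		exit(1);
	}
	E_fsconfig(sfd, FSCONFIG_SET_STRING, "size", "1M", 0);
	E_fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

	mfd = fsmount(sfd, 0, 0);
	if (mfd < 0)
		mount_error(sfd, "fsmount");
	E(close(sfd));

	if (move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH) < 0) {
		perror("move_mount");
		exit(1);
	}
	E(close(mfd));
}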
diff --git a/samples/vfs/test-statx.c b/samples/vfs/test-statx.c
new file mode 100644
index 000000000..49c7a46ce
--- /dev/null
+++ b/samples/vfs/test-statx.c
@@ -0,0 +1,265 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
2/* Test the statx() system call.
3 *
4 * Note that the output of this program is intended to look like the output of
5 * /bin/stat where possible.
6 *
7 * Copyright (C) 2015 Red Hat, Inc. All Rights Reserved.
8 * Written by David Howells (dhowells@redhat.com)
9 */
10
11#define _GNU_SOURCE
12#define _ATFILE_SOURCE
13#include <stdio.h>
14#include <stdlib.h>
15#include <string.h>
16#include <unistd.h>
17#include <ctype.h>
18#include <errno.h>
19#include <time.h>
20#include <sys/syscall.h>
21#include <sys/types.h>
22#include <linux/stat.h>
23#include <linux/fcntl.h>
24#define statx foo
25#define statx_timestamp foo_timestamp
26struct statx;
27struct statx_timestamp;
28#include <sys/stat.h>
29#undef statx
30#undef statx_timestamp
31
32#define AT_STATX_SYNC_TYPE 0x6000
33#define AT_STATX_SYNC_AS_STAT 0x0000
34#define AT_STATX_FORCE_SYNC 0x2000
35#define AT_STATX_DONT_SYNC 0x4000
36
37#ifndef __NR_statx
38#define __NR_statx -1
39#endif
40
41static __attribute__((unused))
42ssize_t statx(int dfd, const char *filename, unsigned flags,
43 unsigned int mask, struct statx *buffer)
44{
45 return syscall(__NR_statx, dfd, filename, flags, mask, buffer);
46}
47
48static void print_time(const char *field, struct statx_timestamp *ts)
49{
50 struct tm tm;
51 time_t tim;
52 char buffer[100];
53 int len;
54
55 tim = ts->tv_sec;
56 if (!localtime_r(&tim, &tm)) {
57 perror("localtime_r");
58 exit(1);
59 }
60 len = strftime(buffer, 100, "%F %T", &tm);
61 if (len == 0) {
62 perror("strftime");
63 exit(1);
64 }
65 printf("%s", field);
66 fwrite(buffer, 1, len, stdout);
67 printf(".%09u", ts->tv_nsec);
68 len = strftime(buffer, 100, "%z", &tm);
69 if (len == 0) {
70 perror("strftime2");
71 exit(1);
72 }
73 fwrite(buffer, 1, len, stdout);
74 printf("\n");
75}
76
77static void dump_statx(struct statx *stx)
78{
79 char buffer[256], ft = '?';
80
81 printf("results=%x\n", stx->stx_mask);
82
83 printf(" ");
84 if (stx->stx_mask & STATX_SIZE)
85 printf(" Size: %-15llu", (unsigned long long)stx->stx_size);
86 if (stx->stx_mask & STATX_BLOCKS)
87 printf(" Blocks: %-10llu", (unsigned long long)stx->stx_blocks);
88 printf(" IO Block: %-6llu", (unsigned long long)stx->stx_blksize);
89 if (stx->stx_mask & STATX_TYPE) {
90 switch (stx->stx_mode & S_IFMT) {
91 case S_IFIFO: printf(" FIFO\n"); ft = 'p'; break;
92 case S_IFCHR: printf(" character special file\n"); ft = 'c'; break;
93 case S_IFDIR: printf(" directory\n"); ft = 'd'; break;
94 case S_IFBLK: printf(" block special file\n"); ft = 'b'; break;
95 case S_IFREG: printf(" regular file\n"); ft = '-'; break;
96 case S_IFLNK: printf(" symbolic link\n"); ft = 'l'; break;
97 case S_IFSOCK: printf(" socket\n"); ft = 's'; break;
98 default:
99 printf(" unknown type (%o)\n", stx->stx_mode & S_IFMT);
100 break;
101 }
102 } else {
103 printf(" no type\n");
104 }
105
106 sprintf(buffer, "%02x:%02x", stx->stx_dev_major, stx->stx_dev_minor);
107 printf("Device: %-15s", buffer);
108 if (stx->stx_mask & STATX_INO)
109 printf(" Inode: %-11llu", (unsigned long long) stx->stx_ino);
110 if (stx->stx_mask & STATX_NLINK)
111 printf(" Links: %-5u", stx->stx_nlink);
112 if (stx->stx_mask & STATX_TYPE) {
113 switch (stx->stx_mode & S_IFMT) {
114 case S_IFBLK:
115 case S_IFCHR:
116 printf(" Device type: %u,%u",
117 stx->stx_rdev_major, stx->stx_rdev_minor);
118 break;
119 }
120 }
121 printf("\n");
122
123 if (stx->stx_mask & STATX_MODE)
124 printf("Access: (%04o/%c%c%c%c%c%c%c%c%c%c) ",
125 stx->stx_mode & 07777,
126 ft,
127 stx->stx_mode & S_IRUSR ? 'r' : '-',
128 stx->stx_mode & S_IWUSR ? 'w' : '-',
129 stx->stx_mode & S_IXUSR ? 'x' : '-',
130 stx->stx_mode & S_IRGRP ? 'r' : '-',
131 stx->stx_mode & S_IWGRP ? 'w' : '-',
132 stx->stx_mode & S_IXGRP ? 'x' : '-',
133 stx->stx_mode & S_IROTH ? 'r' : '-',
134 stx->stx_mode & S_IWOTH ? 'w' : '-',
135 stx->stx_mode & S_IXOTH ? 'x' : '-');
136 if (stx->stx_mask & STATX_UID)
137 printf("Uid: %5d ", stx->stx_uid);
138 if (stx->stx_mask & STATX_GID)
139 printf("Gid: %5d\n", stx->stx_gid);
140
141 if (stx->stx_mask & STATX_ATIME)
142 print_time("Access: ", &stx->stx_atime);
143 if (stx->stx_mask & STATX_MTIME)
144 print_time("Modify: ", &stx->stx_mtime);
145 if (stx->stx_mask & STATX_CTIME)
146 print_time("Change: ", &stx->stx_ctime);
147 if (stx->stx_mask & STATX_BTIME)
148 print_time(" Birth: ", &stx->stx_btime);
149
150 if (stx->stx_attributes_mask) {
151 unsigned char bits, mbits;
152 int loop, byte;
153
154 static char attr_representation[64 + 1] =
155 /* STATX_ATTR_ flags: */
156 "????????" /* 63-56 */
157 "????????" /* 55-48 */
158 "????????" /* 47-40 */
159 "????????" /* 39-32 */
160 "????????" /* 31-24 0x00000000-ff000000 */
161 "????????" /* 23-16 0x00000000-00ff0000 */
162 "???me???" /* 15- 8 0x00000000-0000ff00 */
163 "?dai?c??" /* 7- 0 0x00000000-000000ff */
164 ;
165
166 printf("Attributes: %016llx (",
167 (unsigned long long)stx->stx_attributes);
168 for (byte = 64 - 8; byte >= 0; byte -= 8) {
169 bits = stx->stx_attributes >> byte;
170 mbits = stx->stx_attributes_mask >> byte;
171 for (loop = 7; loop >= 0; loop--) {
172 int bit = byte + loop;
173
174 if (!(mbits & 0x80))
175 putchar('.'); /* Not supported */
176 else if (bits & 0x80)
177 putchar(attr_representation[63 - bit]);
178 else
179 putchar('-'); /* Not set */
180 bits <<= 1;
181 mbits <<= 1;
182 }
183 if (byte)
184 putchar(' ');
185 }
186 printf(")\n");
187 }
188}
189
190static void dump_hex(unsigned long long *data, int from, int to)
191{
192 unsigned offset, print_offset = 1, col = 0;
193
194 from /= 8;
195 to = (to + 7) / 8;
196
197 for (offset = from; offset < to; offset++) {
198 if (print_offset) {
199 printf("%04x: ", offset * 8);
200 print_offset = 0;
201 }
202 printf("%016llx", data[offset]);
203 col++;
204 if ((col & 3) == 0) {
205 printf("\n");
206 print_offset = 1;
207 } else {
208 printf(" ");
209 }
210 }
211
212 if (!print_offset)
213 printf("\n");
214}
215
216int main(int argc, char **argv)
217{
218 struct statx stx;
219 int ret, raw = 0, atflag = AT_SYMLINK_NOFOLLOW;
220
221 unsigned int mask = STATX_BASIC_STATS | STATX_BTIME;
222
223 for (argv++; *argv; argv++) {
224 if (strcmp(*argv, "-F") == 0) {
225 atflag &= ~AT_STATX_SYNC_TYPE;
226 atflag |= AT_STATX_FORCE_SYNC;
227 continue;
228 }
229 if (strcmp(*argv, "-D") == 0) {
230 atflag &= ~AT_STATX_SYNC_TYPE;
231 atflag |= AT_STATX_DONT_SYNC;
232 continue;
233 }
234 if (strcmp(*argv, "-L") == 0) {
235 atflag &= ~AT_SYMLINK_NOFOLLOW;
236 continue;
237 }
238 if (strcmp(*argv, "-O") == 0) {
239 mask &= ~STATX_BASIC_STATS;
240 continue;
241 }
242 if (strcmp(*argv, "-A") == 0) {
243 atflag |= AT_NO_AUTOMOUNT;
244 continue;
245 }
246 if (strcmp(*argv, "-R") == 0) {
247 raw = 1;
248 continue;
249 }
250
251 memset(&stx, 0xbf, sizeof(stx));
252 ret = statx(AT_FDCWD, *argv, atflag, mask, &stx);
253 printf("statx(%s) = %d\n", *argv, ret);
254 if (ret < 0) {
255 perror(*argv);
256 exit(1);
257 }
258
259 if (raw)
260 dump_hex((unsigned long long *)&stx, 0, sizeof(stx));
261
262 dump_statx(&stx);
263 }
264 return 0;
265}
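Since the program asks for STATX_BASIC_STATS | STATX_BTIME but only prints fields that come back set in stx_mask, it degrades gracefully on filesystems that cannot supply everything. A hedged sketch of that mask round-trip in isolation, requesting a single field via the wrapper defined above; "somefile" is an illustrative path:

/* Hedged sketch: request only the birth time and check stx_mask. */
static void btime_example(void)
{
	struct statx stx;

	memset(&stx, 0, sizeof(stx));
	if (statx(AT_FDCWD, "somefile", AT_SYMLINK_NOFOLLOW,
		  STATX_BTIME, &stx) < 0) {
		perror("statx");
		return;
	}
	if (stx.stx_mask & STATX_BTIME)		/* filesystem supplied it */
		print_time(" Birth: ", &stx.stx_btime);
	else
		printf("btime not supported on this filesystem\n");
}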
diff --git a/samples/watch_queue/.gitignore b/samples/watch_queue/.gitignore
new file mode 100644
index 000000000..2aa3c7e56
--- /dev/null
+++ b/samples/watch_queue/.gitignore
@@ -0,0 +1 @@
1watch_test
diff --git a/samples/watch_queue/Makefile b/samples/watch_queue/Makefile
new file mode 100644
index 000000000..c0db3a6bc
--- /dev/null
+++ b/samples/watch_queue/Makefile
@@ -0,0 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
2userprogs-always-y += watch_test
3
4userccflags += -I usr/include
diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c
new file mode 100644
index 000000000..8c6cb57d5
--- /dev/null
+++ b/samples/watch_queue/watch_test.c
@@ -0,0 +1,186 @@
1// SPDX-License-Identifier: GPL-2.0
2/* Use watch_queue API to watch for notifications.
3 *
4 * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
5 * Written by David Howells (dhowells@redhat.com)
6 */
7
8#define _GNU_SOURCE
9#include <stdbool.h>
10#include <stdarg.h>
11#include <stdio.h>
12#include <stdlib.h>
13#include <string.h>
14#include <signal.h>
15#include <unistd.h>
16#include <errno.h>
17#include <sys/ioctl.h>
18#include <limits.h>
19#include <linux/watch_queue.h>
20#include <linux/unistd.h>
21#include <linux/keyctl.h>
22
23#ifndef KEYCTL_WATCH_KEY
24#define KEYCTL_WATCH_KEY -1
25#endif
26#ifndef __NR_keyctl
27#define __NR_keyctl -1
28#endif
29
30#define BUF_SIZE 256
31
32static long keyctl_watch_key(int key, int watch_fd, int watch_id)
33{
34 return syscall(__NR_keyctl, KEYCTL_WATCH_KEY, key, watch_fd, watch_id);
35}
36
37static const char *key_subtypes[256] = {
38 [NOTIFY_KEY_INSTANTIATED] = "instantiated",
39 [NOTIFY_KEY_UPDATED] = "updated",
40 [NOTIFY_KEY_LINKED] = "linked",
41 [NOTIFY_KEY_UNLINKED] = "unlinked",
42 [NOTIFY_KEY_CLEARED] = "cleared",
43 [NOTIFY_KEY_REVOKED] = "revoked",
44 [NOTIFY_KEY_INVALIDATED] = "invalidated",
45 [NOTIFY_KEY_SETATTR] = "setattr",
46};
47
48static void saw_key_change(struct watch_notification *n, size_t len)
49{
50 struct key_notification *k = (struct key_notification *)n;
51
52 if (len != sizeof(struct key_notification)) {
53 fprintf(stderr, "Incorrect key message length\n");
54 return;
55 }
56
57 printf("KEY %08x change=%u[%s] aux=%u\n",
58 k->key_id, n->subtype, key_subtypes[n->subtype], k->aux);
59}
60
61/*
62 * Consume and display events.
63 */
64static void consumer(int fd)
65{
66 unsigned char buffer[433], *p, *end;
67 union {
68 struct watch_notification n;
69 unsigned char buf1[128];
70 } n;
71 ssize_t buf_len;
72
73 for (;;) {
74 buf_len = read(fd, buffer, sizeof(buffer));
75 if (buf_len == -1) {
76 perror("read");
77 exit(1);
78 }
79
80 if (buf_len == 0) {
81 printf("-- END --\n");
82 return;
83 }
84
85 if (buf_len > sizeof(buffer)) {
86 fprintf(stderr, "Read buffer overrun: %zd\n", buf_len);
87 return;
88 }
89
90 printf("read() = %zd\n", buf_len);
91
92 p = buffer;
93 end = buffer + buf_len;
94 while (p < end) {
95 size_t largest, len;
96
97 largest = end - p;
98 if (largest > 128)
99 largest = 128;
100 if (largest < sizeof(struct watch_notification)) {
101 fprintf(stderr, "Short message header: %zu\n", largest);
102 return;
103 }
104 memcpy(&n, p, largest);
105
106 printf("NOTIFY[%03zx]: ty=%06x sy=%02x i=%08x\n",
107 p - buffer, n.n.type, n.n.subtype, n.n.info);
108
109 len = n.n.info & WATCH_INFO_LENGTH;
110 if (len < sizeof(n.n) || len > largest) {
111 fprintf(stderr, "Bad message length: %zu/%zu\n", len, largest);
112 exit(1);
113 }
114
115 switch (n.n.type) {
116 case WATCH_TYPE_META:
117 switch (n.n.subtype) {
118 case WATCH_META_REMOVAL_NOTIFICATION:
119 printf("REMOVAL of watchpoint %08x\n",
120 (n.n.info & WATCH_INFO_ID) >>
121 WATCH_INFO_ID__SHIFT);
122 break;
123 case WATCH_META_LOSS_NOTIFICATION:
124 printf("-- LOSS --\n");
125 break;
126 default:
127 printf("other meta record\n");
128 break;
129 }
130 break;
131 case WATCH_TYPE_KEY_NOTIFY:
132 saw_key_change(&n.n, len);
133 break;
134 default:
135 printf("other type\n");
136 break;
137 }
138
139 p += len;
140 }
141 }
142}
143
144static struct watch_notification_filter filter = {
145 .nr_filters = 1,
146 .filters = {
147 [0] = {
148 .type = WATCH_TYPE_KEY_NOTIFY,
149 .subtype_filter[0] = UINT_MAX,
150 },
151 },
152};
153
154int main(int argc, char **argv)
155{
156 int pipefd[2], fd;
157
158 if (pipe2(pipefd, O_NOTIFICATION_PIPE) == -1) {
159 perror("pipe2");
160 exit(1);
161 }
162 fd = pipefd[0];
163
164 if (ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, BUF_SIZE) == -1) {
165 perror("watch_queue(size)");
166 exit(1);
167 }
168
169 if (ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter) == -1) {
170 perror("watch_queue(filter)");
171 exit(1);
172 }
173
174 if (keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fd, 0x01) == -1) {
175 perror("keyctl");
176 exit(1);
177 }
178
179 if (keyctl_watch_key(KEY_SPEC_USER_KEYRING, fd, 0x02) == -1) {
180 perror("keyctl");
181 exit(1);
182 }
183
184 consumer(fd);
185 exit(0);
186}
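The filter above passes every subtype (subtype_filter[0] = UINT_MAX). Since subtype_filter is a bitmask indexed by subtype, the watch can instead be narrowed to specific events. A hedged sketch, admitting only key updates and revocations; the bit positions follow the NOTIFY_KEY_* values used by saw_key_change():

/* Hedged sketch: a narrower filter than the catch-all one above. */
static struct watch_notification_filter update_only_filter = {
	.nr_filters = 1,
	.filters = {
		[0] = {
			.type = WATCH_TYPE_KEY_NOTIFY,
			.subtype_filter[0] = (1 << NOTIFY_KEY_UPDATED) |
					     (1 << NOTIFY_KEY_REVOKED),
		},
	},
};

Passing this to the IOC_WATCH_QUEUE_SET_FILTER ioctl in place of the original filter would suppress the other key subtypes at the source, before they ever reach the pipe.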
diff --git a/samples/watchdog/.gitignore b/samples/watchdog/.gitignore
new file mode 100644
index 000000000..74153b831
--- /dev/null
+++ b/samples/watchdog/.gitignore
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0-only
2watchdog-simple
diff --git a/samples/watchdog/Makefile b/samples/watchdog/Makefile
new file mode 100644
index 000000000..ab39d23dc
--- /dev/null
+++ b/samples/watchdog/Makefile
@@ -0,0 +1,2 @@
1# SPDX-License-Identifier: GPL-2.0
2userprogs-always-y += watchdog-simple
diff --git a/samples/watchdog/watchdog-simple.c b/samples/watchdog/watchdog-simple.c
new file mode 100644
index 000000000..9ce66d2ca
--- /dev/null
+++ b/samples/watchdog/watchdog-simple.c
@@ -0,0 +1,25 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <stdio.h>
3#include <stdlib.h>
4#include <unistd.h>
5#include <fcntl.h>
6
7int main(void)
8{
9 int fd = open("/dev/watchdog", O_WRONLY);
10 int ret = 0;
11 if (fd == -1) {
12 perror("watchdog");
13 exit(EXIT_FAILURE);
14 }
15 while (1) {
16 ret = write(fd, "\0", 1);
17 if (ret != 1) {
18 ret = -1;
19 break;
20 }
21 sleep(10);
22 }
23 close(fd);
24 return ret;
25}
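As written, the program pings /dev/watchdog forever and only stops on a write error, so once the pings cease most drivers will eventually reset the machine. Drivers that advertise WDIOF_MAGICCLOSE support a graceful exit: writing the magic character 'V' immediately before close() tells the watchdog core to disarm the timer rather than treat the close as an application crash. A hedged sketch of that shutdown path:

/* Hedged sketch: magic-close shutdown for WDIOF_MAGICCLOSE drivers. */
#include <stdio.h>
#include <unistd.h>

static void watchdog_stop(int fd)
{
	/* 'V' asks the driver to disarm instead of rebooting on close */
	if (write(fd, "V", 1) != 1)
		perror("watchdog magic close");
	close(fd);
}

An ioctl(fd, WDIOC_KEEPALIVE, NULL) from <linux/watchdog.h> is the documented alternative to the one-byte write() used as the keepalive in the loop above.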