author:    2025-03-08 22:04:20 +0800
committer: 2025-03-08 22:04:20 +0800
commit:    a07bb8fd1299070229f0e8f3dcb57ffd5ef9870a (patch)
tree:      84f21bd0bf7071bc5fc7dd989e77d7ceb5476682 /samples
Initial commit: OpenHarmony-v4.0-Release
Diffstat (limited to 'samples')
252 files changed, 41442 insertions, 0 deletions
diff --git a/samples/Kconfig b/samples/Kconfig new file mode 100644 index 000000000..0dbd22e06 --- /dev/null +++ b/samples/Kconfig | |||
@@ -0,0 +1,242 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | menuconfig SAMPLES | ||
3 | bool "Sample kernel code" | ||
4 | help | ||
5 | You can build and test sample kernel code here. | ||
6 | |||
7 | if SAMPLES | ||
8 | |||
9 | config SAMPLE_AUXDISPLAY | ||
10 | bool "auxdisplay sample" | ||
11 | depends on CC_CAN_LINK | ||
12 | |||
13 | config SAMPLE_TRACE_EVENTS | ||
14 | tristate "Build trace_events examples -- loadable modules only" | ||
15 | depends on EVENT_TRACING && m | ||
16 | help | ||
17 | This builds the trace event example modules. | ||
18 | |||
19 | config SAMPLE_TRACE_PRINTK | ||
20 | tristate "Build trace_printk module - tests various trace_printk formats" | ||
21 | depends on EVENT_TRACING && m | ||
22 | help | ||
23 | This builds a module that calls trace_printk() and can be used to | ||
24 | test various trace_printk() calls from a module. | ||
25 | |||
26 | config SAMPLE_FTRACE_DIRECT | ||
27 | tristate "Build register_ftrace_direct() example" | ||
28 | depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m | ||
29 | depends on X86_64 # has x86_64 inlined asm | ||
30 | help | ||
31 | This builds an ftrace direct function example | ||
32 | that hooks to wake_up_process and prints the parameters. | ||
33 | |||
34 | config SAMPLE_TRACE_ARRAY | ||
35 | tristate "Build sample module for kernel access to Ftrace instancess" | ||
36 | depends on EVENT_TRACING && m | ||
37 | help | ||
38 | This builds a module that demonstrates the use of various APIs to | ||
39 | access Ftrace instances from within the kernel. | ||
40 | |||
41 | config SAMPLE_KOBJECT | ||
42 | tristate "Build kobject examples" | ||
43 | help | ||
44 | This config option will allow you to build a number of | ||
45 | different kobject sample modules showing how to use kobjects, | ||
46 | ksets, and ktypes properly. | ||
47 | |||
48 | If in doubt, say "N" here. | ||
49 | |||
50 | config SAMPLE_KPROBES | ||
51 | tristate "Build kprobes examples -- loadable modules only" | ||
52 | depends on KPROBES && m | ||
53 | help | ||
54 | This builds several kprobes example modules. | ||
55 | |||
56 | config SAMPLE_KRETPROBES | ||
57 | tristate "Build kretprobes example -- loadable modules only" | ||
58 | default m | ||
59 | depends on SAMPLE_KPROBES && KRETPROBES | ||
60 | |||
61 | config SAMPLE_HW_BREAKPOINT | ||
62 | tristate "Build kernel hardware breakpoint examples -- loadable module only" | ||
63 | depends on HAVE_HW_BREAKPOINT && m | ||
64 | help | ||
65 | This builds kernel hardware breakpoint example modules. | ||
66 | |||
67 | config SAMPLE_KFIFO | ||
68 | tristate "Build kfifo examples -- loadable modules only" | ||
69 | depends on m | ||
70 | help | ||
71 | This config option will allow you to build a number of | ||
72 | different kfifo sample modules showing how to use the | ||
73 | generic kfifo API. | ||
74 | |||
75 | If in doubt, say "N" here. | ||
76 | |||
77 | config SAMPLE_KDB | ||
78 | tristate "Build kdb command example -- loadable modules only" | ||
79 | depends on KGDB_KDB && m | ||
80 | help | ||
81 | Build an example of how to dynamically add the hello | ||
82 | command to the kdb shell. | ||
83 | |||
84 | config SAMPLE_QMI_CLIENT | ||
85 | tristate "Build qmi client sample -- loadable modules only" | ||
86 | depends on m | ||
87 | depends on ARCH_QCOM | ||
88 | depends on NET | ||
89 | select QCOM_QMI_HELPERS | ||
90 | help | ||
91 | Build a QMI client sample driver, which demonstrates how to | ||
92 | communicate with a remote QRTR service, using QMI encoded messages. | ||
93 | |||
94 | config SAMPLE_RPMSG_CLIENT | ||
95 | tristate "Build rpmsg client sample -- loadable modules only" | ||
96 | depends on RPMSG && m | ||
97 | help | ||
98 | Build an rpmsg client sample driver, which demonstrates how | ||
99 | to communicate with an AMP-configured remote processor over | ||
100 | the rpmsg bus. | ||
101 | |||
102 | config SAMPLE_LIVEPATCH | ||
103 | tristate "Build live patching samples -- loadable modules only" | ||
104 | depends on LIVEPATCH && m | ||
105 | help | ||
106 | Build sample live patch demonstrations. | ||
107 | |||
108 | config SAMPLE_CONFIGFS | ||
109 | tristate "Build configfs patching sample -- loadable modules only" | ||
110 | depends on CONFIGFS_FS && m | ||
111 | help | ||
112 | Builds a sample configfs interface. | ||
113 | |||
114 | config SAMPLE_CONNECTOR | ||
115 | tristate "Build connector sample -- loadable modules only" | ||
116 | depends on CONNECTOR && HEADERS_INSTALL && m | ||
117 | help | ||
118 | When enabled, this builds both a sample kernel module for | ||
119 | the connector interface and a user space tool to communicate | ||
120 | with it. | ||
121 | See also Documentation/driver-api/connector.rst | ||
122 | |||
123 | config SAMPLE_HIDRAW | ||
124 | bool "hidraw sample" | ||
125 | depends on CC_CAN_LINK && HEADERS_INSTALL | ||
126 | |||
127 | config SAMPLE_PIDFD | ||
128 | bool "pidfd sample" | ||
129 | depends on CC_CAN_LINK && HEADERS_INSTALL | ||
130 | |||
131 | config SAMPLE_SECCOMP | ||
132 | bool "Build seccomp sample code" | ||
133 | depends on SECCOMP_FILTER && CC_CAN_LINK && HEADERS_INSTALL | ||
134 | help | ||
135 | Build samples of seccomp filters using various methods of | ||
136 | BPF filter construction. | ||
137 | |||
138 | config SAMPLE_TIMER | ||
139 | bool "Timer sample" | ||
140 | depends on CC_CAN_LINK && HEADERS_INSTALL | ||
141 | |||
142 | config SAMPLE_UHID | ||
143 | bool "UHID sample" | ||
144 | depends on CC_CAN_LINK && HEADERS_INSTALL | ||
145 | help | ||
146 | Build UHID sample program. | ||
147 | |||
148 | config SAMPLE_VFIO_MDEV_MTTY | ||
149 | tristate "Build VFIO mtty example mediated device sample code -- loadable modules only" | ||
150 | depends on VFIO_MDEV_DEVICE && m | ||
151 | help | ||
152 | Build a virtual tty sample driver for use as a VFIO | ||
153 | mediated device. | ||
154 | |||
155 | config SAMPLE_VFIO_MDEV_MDPY | ||
156 | tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only" | ||
157 | depends on VFIO_MDEV_DEVICE && m | ||
158 | help | ||
159 | Build a virtual display sample driver for use as a VFIO | ||
160 | mediated device. It is a simple framebuffer and supports | ||
161 | the region display interface (VFIO_GFX_PLANE_TYPE_REGION). | ||
162 | |||
163 | config SAMPLE_VFIO_MDEV_MDPY_FB | ||
164 | tristate "Build VFIO mdpy example guest fbdev driver -- loadable module only" | ||
165 | depends on FB && m | ||
166 | select FB_CFB_FILLRECT | ||
167 | select FB_CFB_COPYAREA | ||
168 | select FB_CFB_IMAGEBLIT | ||
169 | help | ||
170 | Guest fbdev driver for the virtual display sample driver. | ||
171 | |||
172 | config SAMPLE_VFIO_MDEV_MBOCHS | ||
173 | tristate "Build VFIO mdpy example mediated device sample code -- loadable modules only" | ||
174 | depends on VFIO_MDEV_DEVICE && m | ||
175 | select DMA_SHARED_BUFFER | ||
176 | help | ||
177 | Build a virtual display sample driver for use as a VFIO | ||
178 | mediated device. It supports the region display interface | ||
179 | (VFIO_GFX_PLANE_TYPE_DMABUF). | ||
180 | Emulate enough of qemu stdvga to make bochs-drm.ko happy. | ||
181 | That is basically the vram memory bar and the bochs dispi | ||
182 | interface vbe registers in the mmio register bar. | ||
183 | Specifically it does *not* include any legacy vga stuff. | ||
184 | Device looks a lot like "qemu -device secondary-vga". | ||
185 | |||
186 | config SAMPLE_ANDROID_BINDERFS | ||
187 | bool "Build Android binderfs example" | ||
188 | depends on CC_CAN_LINK && HEADERS_INSTALL | ||
189 | help | ||
190 | Builds a sample program to illustrate the use of the Android binderfs | ||
191 | filesystem. | ||
192 | |||
193 | config SAMPLE_VFS | ||
194 | bool "Build example programs that use new VFS system calls" | ||
195 | depends on CC_CAN_LINK && HEADERS_INSTALL | ||
196 | help | ||
197 | Build example userspace programs that use new VFS system calls such | ||
198 | as the mount API and statx(). Note that this is restricted to the x86 | ||
199 | arch whilst it accesses system calls that aren't yet in all arches. | ||
200 | |||
201 | config SAMPLE_INTEL_MEI | ||
202 | bool "Build example program working with intel mei driver" | ||
203 | depends on INTEL_MEI | ||
204 | depends on CC_CAN_LINK && HEADERS_INSTALL | ||
205 | help | ||
206 | Build a sample program to work with the mei device. | ||
207 | |||
208 | config SAMPLE_WATCHDOG | ||
209 | bool "watchdog sample" | ||
210 | depends on CC_CAN_LINK | ||
211 | |||
212 | config SAMPLE_WATCH_QUEUE | ||
213 | bool "Build example watch_queue notification API consumer" | ||
214 | depends on CC_CAN_LINK && HEADERS_INSTALL | ||
215 | help | ||
216 | Build example userspace program to use the new mount_notify(), | ||
217 | sb_notify() syscalls and the KEYCTL_WATCH_KEY keyctl() function. | ||
218 | |||
219 | config SAMPLE_HCK | ||
220 | bool "HCK sample" | ||
221 | help | ||
222 | HCK sample | ||
223 | |||
224 | config SAMPLE_HCK_CALL | ||
225 | bool "HCK call sample" | ||
226 | depends on SAMPLE_HCK | ||
227 | help | ||
228 | HCK call sample | ||
229 | |||
230 | config SAMPLE_HCK_REGISTER | ||
231 | bool "HCK register sample" | ||
232 | depends on SAMPLE_HCK | ||
233 | help | ||
234 | HCK register sample | ||
235 | |||
236 | config SAMPLE_HCK_REGISTER_ONE | ||
237 | bool "HCK register one interface sample" | ||
238 | depends on SAMPLE_HCK | ||
239 | help | ||
240 | HCK register one interface sample | ||
241 | |||
242 | endif # SAMPLES | ||
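As a brief usage sketch (editorial addition, not part of the file above): these options behave like any other Kconfig symbols, so once module support and the respective dependencies (e.g. KPROBES/KRETPROBES) are enabled elsewhere in the config, a .config fragment along the following lines would build a few of the loadable-module samples. The exact selection is purely illustrative.

    CONFIG_SAMPLES=y
    CONFIG_SAMPLE_KPROBES=m
    CONFIG_SAMPLE_KRETPROBES=m
    CONFIG_SAMPLE_KFIFO=m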
diff --git a/samples/Makefile b/samples/Makefile new file mode 100644 index 000000000..e002c114a --- /dev/null +++ b/samples/Makefile | |||
@@ -0,0 +1,32 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | ||
2 | # Makefile for Linux samples code | ||
3 | |||
4 | subdir-$(CONFIG_SAMPLE_AUXDISPLAY) += auxdisplay | ||
5 | subdir-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs | ||
6 | obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs/ | ||
7 | obj-$(CONFIG_SAMPLE_CONNECTOR) += connector/ | ||
8 | subdir-$(CONFIG_SAMPLE_HIDRAW) += hidraw | ||
9 | obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += hw_breakpoint/ | ||
10 | obj-$(CONFIG_SAMPLE_KDB) += kdb/ | ||
11 | obj-$(CONFIG_SAMPLE_KFIFO) += kfifo/ | ||
12 | obj-$(CONFIG_SAMPLE_KOBJECT) += kobject/ | ||
13 | obj-$(CONFIG_SAMPLE_KPROBES) += kprobes/ | ||
14 | obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch/ | ||
15 | subdir-$(CONFIG_SAMPLE_PIDFD) += pidfd | ||
16 | obj-$(CONFIG_SAMPLE_QMI_CLIENT) += qmi/ | ||
17 | obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg/ | ||
18 | subdir-$(CONFIG_SAMPLE_SECCOMP) += seccomp | ||
19 | subdir-$(CONFIG_SAMPLE_TIMER) += timers | ||
20 | obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace_events/ | ||
21 | obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/ | ||
22 | obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/ | ||
23 | obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += ftrace/ | ||
24 | subdir-$(CONFIG_SAMPLE_UHID) += uhid | ||
25 | obj-$(CONFIG_VIDEO_PCI_SKELETON) += v4l/ | ||
26 | obj-y += vfio-mdev/ | ||
27 | subdir-$(CONFIG_SAMPLE_VFS) += vfs | ||
28 | obj-$(CONFIG_SAMPLE_INTEL_MEI) += mei/ | ||
29 | subdir-$(CONFIG_SAMPLE_WATCHDOG) += watchdog | ||
30 | subdir-$(CONFIG_SAMPLE_WATCH_QUEUE) += watch_queue | ||
31 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak/ | ||
32 | obj-$(CONFIG_SAMPLE_HCK) += hck/ | ||
diff --git a/samples/auxdisplay/.gitignore b/samples/auxdisplay/.gitignore new file mode 100644 index 000000000..2ed744c0e --- /dev/null +++ b/samples/auxdisplay/.gitignore | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | cfag12864b-example | ||
diff --git a/samples/auxdisplay/Makefile b/samples/auxdisplay/Makefile new file mode 100644 index 000000000..19d556893 --- /dev/null +++ b/samples/auxdisplay/Makefile | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | ||
2 | userprogs-always-y += cfag12864b-example | ||
diff --git a/samples/auxdisplay/cfag12864b-example.c b/samples/auxdisplay/cfag12864b-example.c new file mode 100644 index 000000000..bfeab44f8 --- /dev/null +++ b/samples/auxdisplay/cfag12864b-example.c | |||
@@ -0,0 +1,267 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Filename: cfag12864b-example.c | ||
4 | * Version: 0.1.0 | ||
5 | * Description: cfag12864b LCD userspace example program | ||
6 | * | ||
7 | * Author: Copyright (C) Miguel Ojeda Sandonis | ||
8 | * Date: 2006-10-31 | ||
9 | */ | ||
10 | |||
11 | /* | ||
12 | * ------------------------ | ||
13 | * start of cfag12864b code | ||
14 | * ------------------------ | ||
15 | */ | ||
16 | |||
17 | #include <string.h> | ||
18 | #include <fcntl.h> | ||
19 | #include <unistd.h> | ||
20 | #include <sys/types.h> | ||
21 | #include <sys/stat.h> | ||
22 | #include <sys/mman.h> | ||
23 | |||
24 | #define CFAG12864B_WIDTH (128) | ||
25 | #define CFAG12864B_HEIGHT (64) | ||
26 | #define CFAG12864B_SIZE (128 * 64 / 8) | ||
27 | #define CFAG12864B_BPB (8) | ||
28 | #define CFAG12864B_ADDRESS(x, y) ((y) * CFAG12864B_WIDTH / \ | ||
29 | CFAG12864B_BPB + (x) / CFAG12864B_BPB) | ||
30 | #define CFAG12864B_BIT(n) (((unsigned char) 1) << (n)) | ||
31 | |||
32 | #undef CFAG12864B_DOCHECK | ||
33 | #ifdef CFAG12864B_DOCHECK | ||
34 | #define CFAG12864B_CHECK(x, y) ((x) < CFAG12864B_WIDTH && \ | ||
35 | (y) < CFAG12864B_HEIGHT) | ||
36 | #else | ||
37 | #define CFAG12864B_CHECK(x, y) (1) | ||
38 | #endif | ||
39 | |||
40 | int cfag12864b_fd; | ||
41 | unsigned char * cfag12864b_mem; | ||
42 | unsigned char cfag12864b_buffer[CFAG12864B_SIZE]; | ||
43 | |||
44 | /* | ||
45 | * init a cfag12864b framebuffer device | ||
46 | * | ||
47 | * No error: return = 0 | ||
48 | * Unable to open: return = -1 | ||
49 | * Unable to mmap: return = -2 | ||
50 | */ | ||
51 | static int cfag12864b_init(char *path) | ||
52 | { | ||
53 | cfag12864b_fd = open(path, O_RDWR); | ||
54 | if (cfag12864b_fd == -1) | ||
55 | return -1; | ||
56 | |||
57 | cfag12864b_mem = mmap(0, CFAG12864B_SIZE, PROT_READ | PROT_WRITE, | ||
58 | MAP_SHARED, cfag12864b_fd, 0); | ||
59 | if (cfag12864b_mem == MAP_FAILED) { | ||
60 | close(cfag12864b_fd); | ||
61 | return -2; | ||
62 | } | ||
63 | |||
64 | return 0; | ||
65 | } | ||
66 | |||
67 | /* | ||
68 | * exit a cfag12864b framebuffer device | ||
69 | */ | ||
70 | static void cfag12864b_exit(void) | ||
71 | { | ||
72 | munmap(cfag12864b_mem, CFAG12864B_SIZE); | ||
73 | close(cfag12864b_fd); | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | * set (x, y) pixel | ||
78 | */ | ||
79 | static void cfag12864b_set(unsigned char x, unsigned char y) | ||
80 | { | ||
81 | if (CFAG12864B_CHECK(x, y)) | ||
82 | cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] |= | ||
83 | CFAG12864B_BIT(x % CFAG12864B_BPB); | ||
84 | } | ||
85 | |||
86 | /* | ||
87 | * unset (x, y) pixel | ||
88 | */ | ||
89 | static void cfag12864b_unset(unsigned char x, unsigned char y) | ||
90 | { | ||
91 | if (CFAG12864B_CHECK(x, y)) | ||
92 | cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] &= | ||
93 | ~CFAG12864B_BIT(x % CFAG12864B_BPB); | ||
94 | } | ||
95 | |||
96 | /* | ||
97 | * is set (x, y) pixel? | ||
98 | * | ||
99 | * Pixel off: return = 0 | ||
100 | * Pixel on: return = 1 | ||
101 | */ | ||
102 | static unsigned char cfag12864b_isset(unsigned char x, unsigned char y) | ||
103 | { | ||
104 | if (CFAG12864B_CHECK(x, y)) | ||
105 | if (cfag12864b_buffer[CFAG12864B_ADDRESS(x, y)] & | ||
106 | CFAG12864B_BIT(x % CFAG12864B_BPB)) | ||
107 | return 1; | ||
108 | |||
109 | return 0; | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * not (x, y) pixel | ||
114 | */ | ||
115 | static void cfag12864b_not(unsigned char x, unsigned char y) | ||
116 | { | ||
117 | if (cfag12864b_isset(x, y)) | ||
118 | cfag12864b_unset(x, y); | ||
119 | else | ||
120 | cfag12864b_set(x, y); | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * fill (set all pixels) | ||
125 | */ | ||
126 | static void cfag12864b_fill(void) | ||
127 | { | ||
128 | unsigned short i; | ||
129 | |||
130 | for (i = 0; i < CFAG12864B_SIZE; i++) | ||
131 | cfag12864b_buffer[i] = 0xFF; | ||
132 | } | ||
133 | |||
134 | /* | ||
135 | * clear (unset all pixels) | ||
136 | */ | ||
137 | static void cfag12864b_clear(void) | ||
138 | { | ||
139 | unsigned short i; | ||
140 | |||
141 | for (i = 0; i < CFAG12864B_SIZE; i++) | ||
142 | cfag12864b_buffer[i] = 0; | ||
143 | } | ||
144 | |||
145 | /* | ||
146 | * format a [128*64] matrix | ||
147 | * | ||
148 | * Pixel off: src[i] = 0 | ||
149 | * Pixel on: src[i] > 0 | ||
150 | */ | ||
151 | static void cfag12864b_format(unsigned char * matrix) | ||
152 | { | ||
153 | unsigned char i, j, n; | ||
154 | |||
155 | for (i = 0; i < CFAG12864B_HEIGHT; i++) | ||
156 | for (j = 0; j < CFAG12864B_WIDTH / CFAG12864B_BPB; j++) { | ||
157 | cfag12864b_buffer[i * CFAG12864B_WIDTH / CFAG12864B_BPB + | ||
158 | j] = 0; | ||
159 | for (n = 0; n < CFAG12864B_BPB; n++) | ||
160 | if (matrix[i * CFAG12864B_WIDTH + | ||
161 | j * CFAG12864B_BPB + n]) | ||
162 | cfag12864b_buffer[i * CFAG12864B_WIDTH / | ||
163 | CFAG12864B_BPB + j] |= | ||
164 | CFAG12864B_BIT(n); | ||
165 | } | ||
166 | } | ||
167 | |||
168 | /* | ||
169 | * blit buffer to lcd | ||
170 | */ | ||
171 | static void cfag12864b_blit(void) | ||
172 | { | ||
173 | memcpy(cfag12864b_mem, cfag12864b_buffer, CFAG12864B_SIZE); | ||
174 | } | ||
175 | |||
176 | /* | ||
177 | * ---------------------- | ||
178 | * end of cfag12864b code | ||
179 | * ---------------------- | ||
180 | */ | ||
181 | |||
182 | #include <stdio.h> | ||
183 | |||
184 | #define EXAMPLES 6 | ||
185 | |||
186 | static void example(unsigned char n) | ||
187 | { | ||
188 | unsigned short i, j; | ||
189 | unsigned char matrix[CFAG12864B_WIDTH * CFAG12864B_HEIGHT]; | ||
190 | |||
191 | if (n > EXAMPLES) | ||
192 | return; | ||
193 | |||
194 | printf("Example %i/%i - ", n, EXAMPLES); | ||
195 | |||
196 | switch (n) { | ||
197 | case 1: | ||
198 | printf("Draw points setting bits"); | ||
199 | cfag12864b_clear(); | ||
200 | for (i = 0; i < CFAG12864B_WIDTH; i += 2) | ||
201 | for (j = 0; j < CFAG12864B_HEIGHT; j += 2) | ||
202 | cfag12864b_set(i, j); | ||
203 | break; | ||
204 | |||
205 | case 2: | ||
206 | printf("Clear the LCD"); | ||
207 | cfag12864b_clear(); | ||
208 | break; | ||
209 | |||
210 | case 3: | ||
211 | printf("Draw rows formatting a [128*64] matrix"); | ||
212 | memset(matrix, 0, CFAG12864B_WIDTH * CFAG12864B_HEIGHT); | ||
213 | for (i = 0; i < CFAG12864B_WIDTH; i++) | ||
214 | for (j = 0; j < CFAG12864B_HEIGHT; j += 2) | ||
215 | matrix[j * CFAG12864B_WIDTH + i] = 1; | ||
216 | cfag12864b_format(matrix); | ||
217 | break; | ||
218 | |||
219 | case 4: | ||
220 | printf("Fill the lcd"); | ||
221 | cfag12864b_fill(); | ||
222 | break; | ||
223 | |||
224 | case 5: | ||
225 | printf("Draw columns unsetting bits"); | ||
226 | for (i = 0; i < CFAG12864B_WIDTH; i += 2) | ||
227 | for (j = 0; j < CFAG12864B_HEIGHT; j++) | ||
228 | cfag12864b_unset(i, j); | ||
229 | break; | ||
230 | |||
231 | case 6: | ||
232 | printf("Do negative not-ing all bits"); | ||
233 | for (i = 0; i < CFAG12864B_WIDTH; i++) | ||
234 | for (j = 0; j < CFAG12864B_HEIGHT; j ++) | ||
235 | cfag12864b_not(i, j); | ||
236 | break; | ||
237 | } | ||
238 | |||
239 | puts(" - [Press Enter]"); | ||
240 | } | ||
241 | |||
242 | int main(int argc, char *argv[]) | ||
243 | { | ||
244 | unsigned char n; | ||
245 | |||
246 | if (argc != 2) { | ||
247 | printf( | ||
248 | "Syntax: %s fbdev\n" | ||
249 | "Usually: /dev/fb0, /dev/fb1...\n", argv[0]); | ||
250 | return -1; | ||
251 | } | ||
252 | |||
253 | if (cfag12864b_init(argv[1])) { | ||
254 | printf("Can't init %s fbdev\n", argv[1]); | ||
255 | return -2; | ||
256 | } | ||
257 | |||
258 | for (n = 1; n <= EXAMPLES; n++) { | ||
259 | example(n); | ||
260 | cfag12864b_blit(); | ||
261 | while (getchar() != '\n'); | ||
262 | } | ||
263 | |||
264 | cfag12864b_exit(); | ||
265 | |||
266 | return 0; | ||
267 | } | ||
diff --git a/samples/binderfs/.gitignore b/samples/binderfs/.gitignore new file mode 100644 index 000000000..eb60241e8 --- /dev/null +++ b/samples/binderfs/.gitignore | |||
@@ -0,0 +1 @@ | |||
binderfs_example | |||
diff --git a/samples/binderfs/Makefile b/samples/binderfs/Makefile new file mode 100644 index 000000000..629e43b9b --- /dev/null +++ b/samples/binderfs/Makefile | |||
@@ -0,0 +1,4 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | userprogs-always-y += binderfs_example | ||
3 | |||
4 | userccflags += -I usr/include | ||
diff --git a/samples/binderfs/binderfs_example.c b/samples/binderfs/binderfs_example.c new file mode 100644 index 000000000..0fd92cdda --- /dev/null +++ b/samples/binderfs/binderfs_example.c | |||
@@ -0,0 +1,82 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | #define _GNU_SOURCE | ||
4 | #include <errno.h> | ||
5 | #include <fcntl.h> | ||
6 | #include <sched.h> | ||
7 | #include <stdio.h> | ||
8 | #include <stdlib.h> | ||
9 | #include <string.h> | ||
10 | #include <sys/ioctl.h> | ||
11 | #include <sys/mount.h> | ||
12 | #include <sys/stat.h> | ||
13 | #include <sys/types.h> | ||
14 | #include <unistd.h> | ||
15 | #include <linux/android/binder.h> | ||
16 | #include <linux/android/binderfs.h> | ||
17 | |||
18 | int main(int argc, char *argv[]) | ||
19 | { | ||
20 | int fd, ret, saved_errno; | ||
21 | struct binderfs_device device = { 0 }; | ||
22 | |||
23 | ret = unshare(CLONE_NEWNS); | ||
24 | if (ret < 0) { | ||
25 | fprintf(stderr, "%s - Failed to unshare mount namespace\n", | ||
26 | strerror(errno)); | ||
27 | exit(EXIT_FAILURE); | ||
28 | } | ||
29 | |||
30 | ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0); | ||
31 | if (ret < 0) { | ||
32 | fprintf(stderr, "%s - Failed to mount / as private\n", | ||
33 | strerror(errno)); | ||
34 | exit(EXIT_FAILURE); | ||
35 | } | ||
36 | |||
37 | ret = mkdir("/dev/binderfs", 0755); | ||
38 | if (ret < 0 && errno != EEXIST) { | ||
39 | fprintf(stderr, "%s - Failed to create binderfs mountpoint\n", | ||
40 | strerror(errno)); | ||
41 | exit(EXIT_FAILURE); | ||
42 | } | ||
43 | |||
44 | ret = mount(NULL, "/dev/binderfs", "binder", 0, 0); | ||
45 | if (ret < 0) { | ||
46 | fprintf(stderr, "%s - Failed to mount binderfs\n", | ||
47 | strerror(errno)); | ||
48 | exit(EXIT_FAILURE); | ||
49 | } | ||
50 | |||
51 | memcpy(device.name, "my-binder", strlen("my-binder")); | ||
52 | |||
53 | fd = open("/dev/binderfs/binder-control", O_RDONLY | O_CLOEXEC); | ||
54 | if (fd < 0) { | ||
55 | fprintf(stderr, "%s - Failed to open binder-control device\n", | ||
56 | strerror(errno)); | ||
57 | exit(EXIT_FAILURE); | ||
58 | } | ||
59 | |||
60 | ret = ioctl(fd, BINDER_CTL_ADD, &device); | ||
61 | saved_errno = errno; | ||
62 | close(fd); | ||
63 | errno = saved_errno; | ||
64 | if (ret < 0) { | ||
65 | fprintf(stderr, "%s - Failed to allocate new binder device\n", | ||
66 | strerror(errno)); | ||
67 | exit(EXIT_FAILURE); | ||
68 | } | ||
69 | |||
70 | printf("Allocated new binder device with major %d, minor %d, and name %s\n", | ||
71 | device.major, device.minor, device.name); | ||
72 | |||
73 | ret = unlink("/dev/binderfs/my-binder"); | ||
74 | if (ret < 0) { | ||
75 | fprintf(stderr, "%s - Failed to delete binder device\n", | ||
76 | strerror(errno)); | ||
77 | exit(EXIT_FAILURE); | ||
78 | } | ||
79 | |||
80 | /* Cleanup happens when the mount namespace dies. */ | ||
81 | exit(EXIT_SUCCESS); | ||
82 | } | ||
diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore new file mode 100644 index 000000000..b2f29bc8d --- /dev/null +++ b/samples/bpf/.gitignore | |||
@@ -0,0 +1,54 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | cpustat | ||
3 | fds_example | ||
4 | hbm | ||
5 | ibumad | ||
6 | lathist | ||
7 | lwt_len_hist | ||
8 | map_perf_test | ||
9 | offwaketime | ||
10 | per_socket_stats_example | ||
11 | sampleip | ||
12 | sock_example | ||
13 | sockex1 | ||
14 | sockex2 | ||
15 | sockex3 | ||
16 | spintest | ||
17 | syscall_nrs.h | ||
18 | syscall_tp | ||
19 | task_fd_query | ||
20 | tc_l2_redirect | ||
21 | test_cgrp2_array_pin | ||
22 | test_cgrp2_attach | ||
23 | test_cgrp2_attach2 | ||
24 | test_cgrp2_sock | ||
25 | test_cgrp2_sock2 | ||
26 | test_current_task_under_cgroup | ||
27 | test_lru_dist | ||
28 | test_map_in_map | ||
29 | test_overhead | ||
30 | test_probe_write_user | ||
31 | trace_event | ||
32 | trace_output | ||
33 | tracex1 | ||
34 | tracex2 | ||
35 | tracex3 | ||
36 | tracex4 | ||
37 | tracex5 | ||
38 | tracex6 | ||
39 | tracex7 | ||
40 | xdp1 | ||
41 | xdp2 | ||
42 | xdp_adjust_tail | ||
43 | xdp_fwd | ||
44 | xdp_monitor | ||
45 | xdp_redirect | ||
46 | xdp_redirect_cpu | ||
47 | xdp_redirect_map | ||
48 | xdp_router_ipv4 | ||
49 | xdp_rxq_info | ||
50 | xdp_sample_pkts | ||
51 | xdp_tx_iptunnel | ||
52 | xdpsock | ||
53 | xsk_fwd | ||
54 | testfile.img | ||
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile new file mode 100644 index 000000000..aeebf5d12 --- /dev/null +++ b/samples/bpf/Makefile | |||
@@ -0,0 +1,329 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | BPF_SAMPLES_PATH ?= $(abspath $(srctree)/$(src)) | ||
4 | TOOLS_PATH := $(BPF_SAMPLES_PATH)/../../tools | ||
5 | |||
6 | # List of programs to build | ||
7 | tprogs-y := test_lru_dist | ||
8 | tprogs-y += sock_example | ||
9 | tprogs-y += fds_example | ||
10 | tprogs-y += sockex1 | ||
11 | tprogs-y += sockex2 | ||
12 | tprogs-y += sockex3 | ||
13 | tprogs-y += tracex1 | ||
14 | tprogs-y += tracex2 | ||
15 | tprogs-y += tracex3 | ||
16 | tprogs-y += tracex4 | ||
17 | tprogs-y += tracex5 | ||
18 | tprogs-y += tracex6 | ||
19 | tprogs-y += tracex7 | ||
20 | tprogs-y += test_probe_write_user | ||
21 | tprogs-y += trace_output | ||
22 | tprogs-y += lathist | ||
23 | tprogs-y += offwaketime | ||
24 | tprogs-y += spintest | ||
25 | tprogs-y += map_perf_test | ||
26 | tprogs-y += test_overhead | ||
27 | tprogs-y += test_cgrp2_array_pin | ||
28 | tprogs-y += test_cgrp2_attach | ||
29 | tprogs-y += test_cgrp2_sock | ||
30 | tprogs-y += test_cgrp2_sock2 | ||
31 | tprogs-y += xdp1 | ||
32 | tprogs-y += xdp2 | ||
33 | tprogs-y += xdp_router_ipv4 | ||
34 | tprogs-y += test_current_task_under_cgroup | ||
35 | tprogs-y += trace_event | ||
36 | tprogs-y += sampleip | ||
37 | tprogs-y += tc_l2_redirect | ||
38 | tprogs-y += lwt_len_hist | ||
39 | tprogs-y += xdp_tx_iptunnel | ||
40 | tprogs-y += test_map_in_map | ||
41 | tprogs-y += per_socket_stats_example | ||
42 | tprogs-y += xdp_redirect | ||
43 | tprogs-y += xdp_redirect_map | ||
44 | tprogs-y += xdp_redirect_cpu | ||
45 | tprogs-y += xdp_monitor | ||
46 | tprogs-y += xdp_rxq_info | ||
47 | tprogs-y += syscall_tp | ||
48 | tprogs-y += cpustat | ||
49 | tprogs-y += xdp_adjust_tail | ||
50 | tprogs-y += xdpsock | ||
51 | tprogs-y += xsk_fwd | ||
52 | tprogs-y += xdp_fwd | ||
53 | tprogs-y += task_fd_query | ||
54 | tprogs-y += xdp_sample_pkts | ||
55 | tprogs-y += ibumad | ||
56 | tprogs-y += hbm | ||
57 | |||
58 | # Libbpf dependencies | ||
59 | LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a | ||
60 | |||
61 | CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o | ||
62 | TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o | ||
63 | |||
64 | fds_example-objs := fds_example.o | ||
65 | sockex1-objs := sockex1_user.o | ||
66 | sockex2-objs := sockex2_user.o | ||
67 | sockex3-objs := sockex3_user.o | ||
68 | tracex1-objs := tracex1_user.o $(TRACE_HELPERS) | ||
69 | tracex2-objs := tracex2_user.o | ||
70 | tracex3-objs := tracex3_user.o | ||
71 | tracex4-objs := tracex4_user.o | ||
72 | tracex5-objs := tracex5_user.o $(TRACE_HELPERS) | ||
73 | tracex6-objs := tracex6_user.o | ||
74 | tracex7-objs := tracex7_user.o | ||
75 | test_probe_write_user-objs := test_probe_write_user_user.o | ||
76 | trace_output-objs := trace_output_user.o $(TRACE_HELPERS) | ||
77 | lathist-objs := lathist_user.o | ||
78 | offwaketime-objs := offwaketime_user.o $(TRACE_HELPERS) | ||
79 | spintest-objs := spintest_user.o $(TRACE_HELPERS) | ||
80 | map_perf_test-objs := map_perf_test_user.o | ||
81 | test_overhead-objs := bpf_load.o test_overhead_user.o | ||
82 | test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o | ||
83 | test_cgrp2_attach-objs := test_cgrp2_attach.o | ||
84 | test_cgrp2_sock-objs := test_cgrp2_sock.o | ||
85 | test_cgrp2_sock2-objs := bpf_load.o test_cgrp2_sock2.o | ||
86 | xdp1-objs := xdp1_user.o | ||
87 | # reuse xdp1 source intentionally | ||
88 | xdp2-objs := xdp1_user.o | ||
89 | xdp_router_ipv4-objs := xdp_router_ipv4_user.o | ||
90 | test_current_task_under_cgroup-objs := $(CGROUP_HELPERS) \ | ||
91 | test_current_task_under_cgroup_user.o | ||
92 | trace_event-objs := trace_event_user.o $(TRACE_HELPERS) | ||
93 | sampleip-objs := sampleip_user.o $(TRACE_HELPERS) | ||
94 | tc_l2_redirect-objs := bpf_load.o tc_l2_redirect_user.o | ||
95 | lwt_len_hist-objs := bpf_load.o lwt_len_hist_user.o | ||
96 | xdp_tx_iptunnel-objs := xdp_tx_iptunnel_user.o | ||
97 | test_map_in_map-objs := test_map_in_map_user.o | ||
98 | per_socket_stats_example-objs := cookie_uid_helper_example.o | ||
99 | xdp_redirect-objs := xdp_redirect_user.o | ||
100 | xdp_redirect_map-objs := xdp_redirect_map_user.o | ||
101 | xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o | ||
102 | xdp_monitor-objs := xdp_monitor_user.o | ||
103 | xdp_rxq_info-objs := xdp_rxq_info_user.o | ||
104 | syscall_tp-objs := syscall_tp_user.o | ||
105 | cpustat-objs := cpustat_user.o | ||
106 | xdp_adjust_tail-objs := xdp_adjust_tail_user.o | ||
107 | xdpsock-objs := xdpsock_user.o | ||
108 | xsk_fwd-objs := xsk_fwd.o | ||
109 | xdp_fwd-objs := xdp_fwd_user.o | ||
110 | task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) | ||
111 | xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) | ||
112 | ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS) | ||
113 | hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) | ||
114 | |||
115 | # Tell kbuild to always build the programs | ||
116 | always-y := $(tprogs-y) | ||
117 | always-y += sockex1_kern.o | ||
118 | always-y += sockex2_kern.o | ||
119 | always-y += sockex3_kern.o | ||
120 | always-y += tracex1_kern.o | ||
121 | always-y += tracex2_kern.o | ||
122 | always-y += tracex3_kern.o | ||
123 | always-y += tracex4_kern.o | ||
124 | always-y += tracex5_kern.o | ||
125 | always-y += tracex6_kern.o | ||
126 | always-y += tracex7_kern.o | ||
127 | always-y += sock_flags_kern.o | ||
128 | always-y += test_probe_write_user_kern.o | ||
129 | always-y += trace_output_kern.o | ||
130 | always-y += tcbpf1_kern.o | ||
131 | always-y += tc_l2_redirect_kern.o | ||
132 | always-y += lathist_kern.o | ||
133 | always-y += offwaketime_kern.o | ||
134 | always-y += spintest_kern.o | ||
135 | always-y += map_perf_test_kern.o | ||
136 | always-y += test_overhead_tp_kern.o | ||
137 | always-y += test_overhead_raw_tp_kern.o | ||
138 | always-y += test_overhead_kprobe_kern.o | ||
139 | always-y += parse_varlen.o parse_simple.o parse_ldabs.o | ||
140 | always-y += test_cgrp2_tc_kern.o | ||
141 | always-y += xdp1_kern.o | ||
142 | always-y += xdp2_kern.o | ||
143 | always-y += xdp_router_ipv4_kern.o | ||
144 | always-y += test_current_task_under_cgroup_kern.o | ||
145 | always-y += trace_event_kern.o | ||
146 | always-y += sampleip_kern.o | ||
147 | always-y += lwt_len_hist_kern.o | ||
148 | always-y += xdp_tx_iptunnel_kern.o | ||
149 | always-y += test_map_in_map_kern.o | ||
150 | always-y += tcp_synrto_kern.o | ||
151 | always-y += tcp_rwnd_kern.o | ||
152 | always-y += tcp_bufs_kern.o | ||
153 | always-y += tcp_cong_kern.o | ||
154 | always-y += tcp_iw_kern.o | ||
155 | always-y += tcp_clamp_kern.o | ||
156 | always-y += tcp_basertt_kern.o | ||
157 | always-y += tcp_tos_reflect_kern.o | ||
158 | always-y += tcp_dumpstats_kern.o | ||
159 | always-y += xdp_redirect_kern.o | ||
160 | always-y += xdp_redirect_map_kern.o | ||
161 | always-y += xdp_redirect_cpu_kern.o | ||
162 | always-y += xdp_monitor_kern.o | ||
163 | always-y += xdp_rxq_info_kern.o | ||
164 | always-y += xdp2skb_meta_kern.o | ||
165 | always-y += syscall_tp_kern.o | ||
166 | always-y += cpustat_kern.o | ||
167 | always-y += xdp_adjust_tail_kern.o | ||
168 | always-y += xdp_fwd_kern.o | ||
169 | always-y += task_fd_query_kern.o | ||
170 | always-y += xdp_sample_pkts_kern.o | ||
171 | always-y += ibumad_kern.o | ||
172 | always-y += hbm_out_kern.o | ||
173 | always-y += hbm_edt_kern.o | ||
174 | always-y += xdpsock_kern.o | ||
175 | |||
176 | ifeq ($(ARCH), arm) | ||
177 | # Strip all except -D__LINUX_ARM_ARCH__ option needed to handle linux | ||
178 | # headers when arm instruction set identification is requested. | ||
179 | ARM_ARCH_SELECTOR := $(filter -D__LINUX_ARM_ARCH__%, $(KBUILD_CFLAGS)) | ||
180 | BPF_EXTRA_CFLAGS := $(ARM_ARCH_SELECTOR) | ||
181 | TPROGS_CFLAGS += $(ARM_ARCH_SELECTOR) | ||
182 | endif | ||
183 | |||
184 | TPROGS_CFLAGS += -Wall -O2 | ||
185 | TPROGS_CFLAGS += -Wmissing-prototypes | ||
186 | TPROGS_CFLAGS += -Wstrict-prototypes | ||
187 | |||
188 | TPROGS_CFLAGS += -I$(objtree)/usr/include | ||
189 | TPROGS_CFLAGS += -I$(srctree)/tools/testing/selftests/bpf/ | ||
190 | TPROGS_CFLAGS += -I$(srctree)/tools/lib/ | ||
191 | TPROGS_CFLAGS += -I$(srctree)/tools/include | ||
192 | TPROGS_CFLAGS += -I$(srctree)/tools/perf | ||
193 | TPROGS_CFLAGS += -DHAVE_ATTR_TEST=0 | ||
194 | |||
195 | ifdef SYSROOT | ||
196 | TPROGS_CFLAGS += --sysroot=$(SYSROOT) | ||
197 | TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib | ||
198 | endif | ||
199 | |||
200 | TPROGCFLAGS_bpf_load.o += -Wno-unused-variable | ||
201 | |||
202 | TPROGS_LDLIBS += $(LIBBPF) -lelf -lz | ||
203 | TPROGLDLIBS_tracex4 += -lrt | ||
204 | TPROGLDLIBS_trace_output += -lrt | ||
205 | TPROGLDLIBS_map_perf_test += -lrt | ||
206 | TPROGLDLIBS_test_overhead += -lrt | ||
207 | TPROGLDLIBS_xdpsock += -pthread | ||
208 | TPROGLDLIBS_xsk_fwd += -pthread | ||
209 | |||
210 | # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: | ||
211 | # make M=samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang | ||
212 | LLC ?= llc | ||
213 | CLANG ?= clang | ||
214 | OPT ?= opt | ||
215 | LLVM_DIS ?= llvm-dis | ||
216 | LLVM_OBJCOPY ?= llvm-objcopy | ||
217 | BTF_PAHOLE ?= pahole | ||
218 | |||
219 | # Detect that we're cross compiling and use the cross compiler | ||
220 | ifdef CROSS_COMPILE | ||
221 | CLANG_ARCH_ARGS = --target=$(notdir $(CROSS_COMPILE:%-=%)) | ||
222 | endif | ||
223 | |||
224 | # Don't evaluate probes and warnings if we need to run make recursively | ||
225 | ifneq ($(src),) | ||
226 | HDR_PROBE := $(shell printf "\#include <linux/types.h>\n struct list_head { int a; }; int main() { return 0; }" | \ | ||
227 | $(CC) $(TPROGS_CFLAGS) $(TPROGS_LDFLAGS) -x c - \ | ||
228 | -o /dev/null 2>/dev/null && echo okay) | ||
229 | |||
230 | ifeq ($(HDR_PROBE),) | ||
231 | $(warning WARNING: Detected possible issues with include path.) | ||
232 | $(warning WARNING: Please install kernel headers locally (make headers_install).) | ||
233 | endif | ||
234 | |||
235 | BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris) | ||
236 | BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF) | ||
237 | BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm') | ||
238 | BTF_LLVM_PROBE := $(shell echo "int main() { return 0; }" | \ | ||
239 | $(CLANG) -target bpf -O2 -g -c -x c - -o ./llvm_btf_verify.o; \ | ||
240 | readelf -S ./llvm_btf_verify.o | grep BTF; \ | ||
241 | /bin/rm -f ./llvm_btf_verify.o) | ||
242 | |||
243 | BPF_EXTRA_CFLAGS += -fno-stack-protector | ||
244 | ifneq ($(BTF_LLVM_PROBE),) | ||
245 | BPF_EXTRA_CFLAGS += -g | ||
246 | else | ||
247 | ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),) | ||
248 | BPF_EXTRA_CFLAGS += -g | ||
249 | LLC_FLAGS += -mattr=dwarfris | ||
250 | DWARF2BTF = y | ||
251 | endif | ||
252 | endif | ||
253 | endif | ||
254 | |||
255 | # Trick to allow make to be run from this directory | ||
256 | all: | ||
257 | $(MAKE) -C ../../ M=$(CURDIR) BPF_SAMPLES_PATH=$(CURDIR) | ||
258 | |||
259 | clean: | ||
260 | $(MAKE) -C ../../ M=$(CURDIR) clean | ||
261 | @find $(CURDIR) -type f -name '*~' -delete | ||
262 | |||
263 | $(LIBBPF): FORCE | ||
264 | # Fix up variables inherited from Kbuild that tools/ build system won't like | ||
265 | $(MAKE) -C $(dir $@) RM='rm -rf' EXTRA_CFLAGS="$(TPROGS_CFLAGS)" \ | ||
266 | LDFLAGS=$(TPROGS_LDFLAGS) srctree=$(BPF_SAMPLES_PATH)/../../ O= | ||
267 | |||
268 | $(obj)/syscall_nrs.h: $(obj)/syscall_nrs.s FORCE | ||
269 | $(call filechk,offsets,__SYSCALL_NRS_H__) | ||
270 | |||
271 | targets += syscall_nrs.s | ||
272 | clean-files += syscall_nrs.h | ||
273 | |||
274 | FORCE: | ||
275 | |||
276 | |||
277 | # Verify LLVM compiler tools are available and bpf target is supported by llc | ||
278 | .PHONY: verify_cmds verify_target_bpf $(CLANG) $(LLC) | ||
279 | |||
280 | verify_cmds: $(CLANG) $(LLC) | ||
281 | @for TOOL in $^ ; do \ | ||
282 | if ! (which -- "$${TOOL}" > /dev/null 2>&1); then \ | ||
283 | echo "*** ERROR: Cannot find LLVM tool $${TOOL}" ;\ | ||
284 | exit 1; \ | ||
285 | else true; fi; \ | ||
286 | done | ||
287 | |||
288 | verify_target_bpf: verify_cmds | ||
289 | @if ! (${LLC} -march=bpf -mattr=help > /dev/null 2>&1); then \ | ||
290 | echo "*** ERROR: LLVM (${LLC}) does not support 'bpf' target" ;\ | ||
291 | echo " NOTICE: LLVM version >= 3.7.1 required" ;\ | ||
292 | exit 2; \ | ||
293 | else true; fi | ||
294 | |||
295 | $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) | ||
296 | $(src)/*.c: verify_target_bpf $(LIBBPF) | ||
297 | |||
298 | $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h | ||
299 | $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h | ||
300 | $(obj)/hbm.o: $(src)/hbm.h | ||
301 | $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h | ||
302 | |||
303 | -include $(BPF_SAMPLES_PATH)/Makefile.target | ||
304 | |||
305 | # asm/sysreg.h - inline assembly used by it is incompatible with llvm. | ||
306 | # But, there is no easy way to fix it, so just exclude it since it is | ||
307 | # useless for BPF samples. | ||
308 | # below we use long chain of commands, clang | opt | llvm-dis | llc, | ||
309 | # to generate final object file. 'clang' compiles the source into IR | ||
310 | # with native target, e.g., x64, arm64, etc. 'opt' does bpf CORE IR builtin | ||
311 | # processing (llvm12) and IR optimizations. 'llvm-dis' converts | ||
312 | # 'opt' output to IR, and finally 'llc' generates bpf byte code. | ||
313 | $(obj)/%.o: $(src)/%.c | ||
314 | @echo " CLANG-bpf " $@ | ||
315 | $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \ | ||
316 | -I$(obj) -I$(srctree)/tools/testing/selftests/bpf/ \ | ||
317 | -I$(srctree)/tools/lib/ \ | ||
318 | -D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \ | ||
319 | -D__TARGET_ARCH_$(SRCARCH) -Wno-compare-distinct-pointer-types \ | ||
320 | -Wno-gnu-variable-sized-type-not-at-end \ | ||
321 | -Wno-address-of-packed-member -Wno-tautological-compare \ | ||
322 | -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ | ||
323 | -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \ | ||
324 | -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \ | ||
325 | $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \ | ||
326 | $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@ | ||
327 | ifeq ($(DWARF2BTF),y) | ||
328 | $(BTF_PAHOLE) -J $@ | ||
329 | endif | ||
diff --git a/samples/bpf/Makefile.target b/samples/bpf/Makefile.target new file mode 100644 index 000000000..7621f55e2 --- /dev/null +++ b/samples/bpf/Makefile.target | |||
@@ -0,0 +1,75 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | ||
2 | # ========================================================================== | ||
3 | # Building binaries on the host system | ||
4 | # Binaries are not used during the compilation of the kernel, and are intended | ||
5 | # to be built for the target board (which can of course be the host). Added to | ||
6 | # build binaries that are meant to run somewhere other than the host system. | ||
7 | # | ||
8 | # Sample syntax | ||
9 | # tprogs-y := xsk_example | ||
10 | # Will compile xsk_example.c and create an executable named xsk_example | ||
11 | # | ||
12 | # tprogs-y := xdpsock | ||
13 | # xdpsock-objs := xdpsock_1.o xdpsock_2.o | ||
14 | # Will compile xdpsock_1.c and xdpsock_2.c, and then link the executable | ||
15 | # xdpsock, based on xdpsock_1.o and xdpsock_2.o | ||
16 | # | ||
17 | # Derived from scripts/Makefile.host | ||
18 | # | ||
19 | __tprogs := $(sort $(tprogs-y)) | ||
20 | |||
21 | # C code | ||
22 | # Executables compiled from a single .c file | ||
23 | tprog-csingle := $(foreach m,$(__tprogs), \ | ||
24 | $(if $($(m)-objs),,$(m))) | ||
25 | |||
26 | # C executables linked based on several .o files | ||
27 | tprog-cmulti := $(foreach m,$(__tprogs),\ | ||
28 | $(if $($(m)-objs),$(m))) | ||
29 | |||
30 | # Object (.o) files compiled from .c files | ||
31 | tprog-cobjs := $(sort $(foreach m,$(__tprogs),$($(m)-objs))) | ||
32 | |||
33 | tprog-csingle := $(addprefix $(obj)/,$(tprog-csingle)) | ||
34 | tprog-cmulti := $(addprefix $(obj)/,$(tprog-cmulti)) | ||
35 | tprog-cobjs := $(addprefix $(obj)/,$(tprog-cobjs)) | ||
36 | |||
37 | ##### | ||
38 | # Handle options to gcc. Support building with separate output directory | ||
39 | |||
40 | _tprogc_flags = $(TPROGS_CFLAGS) \ | ||
41 | $(TPROGCFLAGS_$(basetarget).o) | ||
42 | |||
43 | # $(objtree)/$(obj) for including generated headers from checkin source files | ||
44 | ifeq ($(KBUILD_EXTMOD),) | ||
45 | ifdef building_out_of_srctree | ||
46 | _tprogc_flags += -I $(objtree)/$(obj) | ||
47 | endif | ||
48 | endif | ||
49 | |||
50 | tprogc_flags = -Wp,-MD,$(depfile) $(_tprogc_flags) | ||
51 | |||
52 | # Create executable from a single .c file | ||
53 | # tprog-csingle -> Executable | ||
54 | quiet_cmd_tprog-csingle = CC $@ | ||
55 | cmd_tprog-csingle = $(CC) $(tprogc_flags) $(TPROGS_LDFLAGS) -o $@ $< \ | ||
56 | $(TPROGS_LDLIBS) $(TPROGLDLIBS_$(@F)) | ||
57 | $(tprog-csingle): $(obj)/%: $(src)/%.c FORCE | ||
58 | $(call if_changed_dep,tprog-csingle) | ||
59 | |||
60 | # Link an executable based on list of .o files, all plain c | ||
61 | # tprog-cmulti -> executable | ||
62 | quiet_cmd_tprog-cmulti = LD $@ | ||
63 | cmd_tprog-cmulti = $(CC) $(tprogc_flags) $(TPROGS_LDFLAGS) -o $@ \ | ||
64 | $(addprefix $(obj)/,$($(@F)-objs)) \ | ||
65 | $(TPROGS_LDLIBS) $(TPROGLDLIBS_$(@F)) | ||
66 | $(tprog-cmulti): $(tprog-cobjs) FORCE | ||
67 | $(call if_changed,tprog-cmulti) | ||
68 | $(call multi_depend, $(tprog-cmulti), , -objs) | ||
69 | |||
70 | # Create .o file from a single .c file | ||
71 | # tprog-cobjs -> .o | ||
72 | quiet_cmd_tprog-cobjs = CC $@ | ||
73 | cmd_tprog-cobjs = $(CC) $(tprogc_flags) -c -o $@ $< | ||
74 | $(tprog-cobjs): $(obj)/%.o: $(src)/%.c FORCE | ||
75 | $(call if_changed_dep,tprog-cobjs) | ||
diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst new file mode 100644 index 000000000..dd34b2d26 --- /dev/null +++ b/samples/bpf/README.rst | |||
@@ -0,0 +1,105 @@ | |||
1 | eBPF sample programs | ||
2 | ==================== | ||
3 | |||
4 | This directory contains test stubs, a verifier test-suite and examples | ||
5 | for using eBPF. The examples use libbpf from tools/lib/bpf. | ||
6 | |||
7 | Build dependencies | ||
8 | ================== | ||
9 | |||
10 | Compiling requires having installed: | ||
11 | * clang >= version 3.4.0 | ||
12 | * llvm >= version 3.7.1 | ||
13 | |||
14 | Note that LLVM's tool 'llc' must support target 'bpf'; list the version | ||
15 | and supported targets with the command: ``llc --version`` | ||
16 | |||
17 | Clean and configuration | ||
18 | ----------------------- | ||
19 | |||
20 | It may be necessary to clean tools, samples or kernel before trying a new arch or | ||
21 | after some changes (on demand):: | ||
22 | |||
23 | make -C tools clean | ||
24 | make -C samples/bpf clean | ||
25 | make clean | ||
26 | |||
27 | Configure the kernel, for instance with defconfig:: | ||
28 | |||
29 | make defconfig | ||
30 | |||
31 | Kernel headers | ||
32 | -------------- | ||
33 | |||
34 | There are usually dependencies on header files of the current kernel. | ||
35 | To avoid installing devel kernel headers system wide, as a normal | ||
36 | user, simply call:: | ||
37 | |||
38 | make headers_install | ||
39 | |||
40 | This will create a local "usr/include" directory in the git/build top | ||
41 | level directory, which the make system automatically picks up first. | ||
42 | |||
43 | Compiling | ||
44 | ========= | ||
45 | |||
46 | For building the BPF samples, issue the below command from the kernel | ||
47 | top level directory:: | ||
48 | |||
49 | make M=samples/bpf | ||
50 | |||
51 | It is also possible to call make from this directory. This will just | ||
52 | hide the invocation of make as above. | ||
53 | |||
54 | Manually compiling LLVM with 'bpf' support | ||
55 | ------------------------------------------ | ||
56 | |||
57 | Since version 3.7.0, LLVM adds a proper LLVM backend target for the | ||
58 | BPF bytecode architecture. | ||
59 | |||
60 | By default llvm will build all non-experimental backends including bpf. | ||
61 | To generate a smaller llc binary one can use:: | ||
62 | |||
63 | -DLLVM_TARGETS_TO_BUILD="BPF" | ||
64 | |||
65 | Quick snippet for manually compiling LLVM and clang | ||
66 | (build dependencies are cmake and gcc-c++):: | ||
67 | |||
68 | $ git clone http://llvm.org/git/llvm.git | ||
69 | $ cd llvm/tools | ||
70 | $ git clone --depth 1 http://llvm.org/git/clang.git | ||
71 | $ cd ..; mkdir build; cd build | ||
72 | $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" | ||
73 | $ make -j $(getconf _NPROCESSORS_ONLN) | ||
74 | |||
75 | It is also possible to point make to the newly compiled 'llc' or | ||
76 | 'clang' command by redefining LLC or CLANG on the make command line:: | ||
77 | |||
78 | make M=samples/bpf LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang | ||
79 | |||
80 | Cross compiling samples | ||
81 | ----------------------- | ||
82 | In order to cross-compile, say for arm64 targets, export CROSS_COMPILE and ARCH | ||
83 | environment variables before calling make. But do this before the clean, | ||
84 | configuration and header install steps described above. This will direct make to | ||
85 | build samples for the cross target:: | ||
86 | |||
87 | export ARCH=arm64 | ||
88 | export CROSS_COMPILE="aarch64-linux-gnu-" | ||
89 | |||
90 | Headers can also be installed on the RFS of the target board if you need to keep | ||
91 | them in sync (not strictly necessary; it also creates a local "usr/include" directory):: | ||
92 | |||
93 | make INSTALL_HDR_PATH=~/some_sysroot/usr headers_install | ||
94 | |||
95 | Pointing to LLC and CLANG is not necessary if they are installed on the HOST and | ||
96 | include the appropriate arm64 arch among their targets (usually several arches are supported). | ||
97 | Build the samples:: | ||
98 | |||
99 | make M=samples/bpf | ||
100 | |||
101 | Or build the samples with SYSROOT if some header or library is absent from the | ||
102 | toolchain, say libelf, by providing the path to a file system containing the headers | ||
103 | and libs; this can be the RFS of the target board:: | ||
104 | |||
105 | make M=samples/bpf SYSROOT=~/some_sysroot | ||
diff --git a/samples/bpf/asm_goto_workaround.h b/samples/bpf/asm_goto_workaround.h new file mode 100644 index 000000000..7048bb359 --- /dev/null +++ b/samples/bpf/asm_goto_workaround.h | |||
@@ -0,0 +1,28 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | /* Copyright (c) 2019 Facebook */ | ||
3 | #ifndef __ASM_GOTO_WORKAROUND_H | ||
4 | #define __ASM_GOTO_WORKAROUND_H | ||
5 | |||
6 | /* | ||
7 | * This will bring in asm_volatile_goto and asm_inline macro definitions | ||
8 | * if enabled by compiler and config options. | ||
9 | */ | ||
10 | #include <linux/types.h> | ||
11 | |||
12 | #ifdef asm_volatile_goto | ||
13 | #undef asm_volatile_goto | ||
14 | #define asm_volatile_goto(x...) asm volatile("invalid use of asm_volatile_goto") | ||
15 | #endif | ||
16 | |||
17 | /* | ||
18 | * asm_inline is defined as asm __inline in "include/linux/compiler_types.h" | ||
19 | * if supported by the kernel's CC (i.e. CONFIG_CC_HAS_ASM_INLINE), which is not | ||
20 | * supported by CLANG. | ||
21 | */ | ||
22 | #ifdef asm_inline | ||
23 | #undef asm_inline | ||
24 | #define asm_inline asm | ||
25 | #endif | ||
26 | |||
27 | #define volatile(x...) volatile("") | ||
28 | #endif | ||
diff --git a/samples/bpf/bpf_insn.h b/samples/bpf/bpf_insn.h new file mode 100644 index 000000000..544237980 --- /dev/null +++ b/samples/bpf/bpf_insn.h | |||
@@ -0,0 +1,217 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | /* eBPF instruction mini library */ | ||
3 | #ifndef __BPF_INSN_H | ||
4 | #define __BPF_INSN_H | ||
5 | |||
6 | struct bpf_insn; | ||
7 | |||
8 | /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ | ||
9 | |||
10 | #define BPF_ALU64_REG(OP, DST, SRC) \ | ||
11 | ((struct bpf_insn) { \ | ||
12 | .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ | ||
13 | .dst_reg = DST, \ | ||
14 | .src_reg = SRC, \ | ||
15 | .off = 0, \ | ||
16 | .imm = 0 }) | ||
17 | |||
18 | #define BPF_ALU32_REG(OP, DST, SRC) \ | ||
19 | ((struct bpf_insn) { \ | ||
20 | .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ | ||
21 | .dst_reg = DST, \ | ||
22 | .src_reg = SRC, \ | ||
23 | .off = 0, \ | ||
24 | .imm = 0 }) | ||
25 | |||
26 | /* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ | ||
27 | |||
28 | #define BPF_ALU64_IMM(OP, DST, IMM) \ | ||
29 | ((struct bpf_insn) { \ | ||
30 | .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ | ||
31 | .dst_reg = DST, \ | ||
32 | .src_reg = 0, \ | ||
33 | .off = 0, \ | ||
34 | .imm = IMM }) | ||
35 | |||
36 | #define BPF_ALU32_IMM(OP, DST, IMM) \ | ||
37 | ((struct bpf_insn) { \ | ||
38 | .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ | ||
39 | .dst_reg = DST, \ | ||
40 | .src_reg = 0, \ | ||
41 | .off = 0, \ | ||
42 | .imm = IMM }) | ||
43 | |||
44 | /* Short form of mov, dst_reg = src_reg */ | ||
45 | |||
46 | #define BPF_MOV64_REG(DST, SRC) \ | ||
47 | ((struct bpf_insn) { \ | ||
48 | .code = BPF_ALU64 | BPF_MOV | BPF_X, \ | ||
49 | .dst_reg = DST, \ | ||
50 | .src_reg = SRC, \ | ||
51 | .off = 0, \ | ||
52 | .imm = 0 }) | ||
53 | |||
54 | #define BPF_MOV32_REG(DST, SRC) \ | ||
55 | ((struct bpf_insn) { \ | ||
56 | .code = BPF_ALU | BPF_MOV | BPF_X, \ | ||
57 | .dst_reg = DST, \ | ||
58 | .src_reg = SRC, \ | ||
59 | .off = 0, \ | ||
60 | .imm = 0 }) | ||
61 | |||
62 | /* Short form of mov, dst_reg = imm32 */ | ||
63 | |||
64 | #define BPF_MOV64_IMM(DST, IMM) \ | ||
65 | ((struct bpf_insn) { \ | ||
66 | .code = BPF_ALU64 | BPF_MOV | BPF_K, \ | ||
67 | .dst_reg = DST, \ | ||
68 | .src_reg = 0, \ | ||
69 | .off = 0, \ | ||
70 | .imm = IMM }) | ||
71 | |||
72 | #define BPF_MOV32_IMM(DST, IMM) \ | ||
73 | ((struct bpf_insn) { \ | ||
74 | .code = BPF_ALU | BPF_MOV | BPF_K, \ | ||
75 | .dst_reg = DST, \ | ||
76 | .src_reg = 0, \ | ||
77 | .off = 0, \ | ||
78 | .imm = IMM }) | ||
79 | |||
80 | /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ | ||
81 | #define BPF_LD_IMM64(DST, IMM) \ | ||
82 | BPF_LD_IMM64_RAW(DST, 0, IMM) | ||
83 | |||
84 | #define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ | ||
85 | ((struct bpf_insn) { \ | ||
86 | .code = BPF_LD | BPF_DW | BPF_IMM, \ | ||
87 | .dst_reg = DST, \ | ||
88 | .src_reg = SRC, \ | ||
89 | .off = 0, \ | ||
90 | .imm = (__u32) (IMM) }), \ | ||
91 | ((struct bpf_insn) { \ | ||
92 | .code = 0, /* zero is reserved opcode */ \ | ||
93 | .dst_reg = 0, \ | ||
94 | .src_reg = 0, \ | ||
95 | .off = 0, \ | ||
96 | .imm = ((__u64) (IMM)) >> 32 }) | ||
97 | |||
98 | #ifndef BPF_PSEUDO_MAP_FD | ||
99 | # define BPF_PSEUDO_MAP_FD 1 | ||
100 | #endif | ||
101 | |||
102 | /* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ | ||
103 | #define BPF_LD_MAP_FD(DST, MAP_FD) \ | ||
104 | BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) | ||
105 | |||
106 | |||
107 | /* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ | ||
108 | |||
109 | #define BPF_LD_ABS(SIZE, IMM) \ | ||
110 | ((struct bpf_insn) { \ | ||
111 | .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ | ||
112 | .dst_reg = 0, \ | ||
113 | .src_reg = 0, \ | ||
114 | .off = 0, \ | ||
115 | .imm = IMM }) | ||
116 | |||
117 | /* Memory load, dst_reg = *(uint *) (src_reg + off16) */ | ||
118 | |||
119 | #define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ | ||
120 | ((struct bpf_insn) { \ | ||
121 | .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ | ||
122 | .dst_reg = DST, \ | ||
123 | .src_reg = SRC, \ | ||
124 | .off = OFF, \ | ||
125 | .imm = 0 }) | ||
126 | |||
127 | /* Memory store, *(uint *) (dst_reg + off16) = src_reg */ | ||
128 | |||
129 | #define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ | ||
130 | ((struct bpf_insn) { \ | ||
131 | .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ | ||
132 | .dst_reg = DST, \ | ||
133 | .src_reg = SRC, \ | ||
134 | .off = OFF, \ | ||
135 | .imm = 0 }) | ||
136 | |||
137 | /* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */ | ||
138 | |||
139 | #define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ | ||
140 | ((struct bpf_insn) { \ | ||
141 | .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ | ||
142 | .dst_reg = DST, \ | ||
143 | .src_reg = SRC, \ | ||
144 | .off = OFF, \ | ||
145 | .imm = 0 }) | ||
146 | |||
147 | /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ | ||
148 | |||
149 | #define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ | ||
150 | ((struct bpf_insn) { \ | ||
151 | .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ | ||
152 | .dst_reg = DST, \ | ||
153 | .src_reg = 0, \ | ||
154 | .off = OFF, \ | ||
155 | .imm = IMM }) | ||
156 | |||
157 | /* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ | ||
158 | |||
159 | #define BPF_JMP_REG(OP, DST, SRC, OFF) \ | ||
160 | ((struct bpf_insn) { \ | ||
161 | .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ | ||
162 | .dst_reg = DST, \ | ||
163 | .src_reg = SRC, \ | ||
164 | .off = OFF, \ | ||
165 | .imm = 0 }) | ||
166 | |||
167 | /* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ | ||
168 | |||
169 | #define BPF_JMP32_REG(OP, DST, SRC, OFF) \ | ||
170 | ((struct bpf_insn) { \ | ||
171 | .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ | ||
172 | .dst_reg = DST, \ | ||
173 | .src_reg = SRC, \ | ||
174 | .off = OFF, \ | ||
175 | .imm = 0 }) | ||
176 | |||
177 | /* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ | ||
178 | |||
179 | #define BPF_JMP_IMM(OP, DST, IMM, OFF) \ | ||
180 | ((struct bpf_insn) { \ | ||
181 | .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ | ||
182 | .dst_reg = DST, \ | ||
183 | .src_reg = 0, \ | ||
184 | .off = OFF, \ | ||
185 | .imm = IMM }) | ||
186 | |||
187 | /* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */ | ||
188 | |||
189 | #define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ | ||
190 | ((struct bpf_insn) { \ | ||
191 | .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ | ||
192 | .dst_reg = DST, \ | ||
193 | .src_reg = 0, \ | ||
194 | .off = OFF, \ | ||
195 | .imm = IMM }) | ||
196 | |||
197 | /* Raw code statement block */ | ||
198 | |||
199 | #define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ | ||
200 | ((struct bpf_insn) { \ | ||
201 | .code = CODE, \ | ||
202 | .dst_reg = DST, \ | ||
203 | .src_reg = SRC, \ | ||
204 | .off = OFF, \ | ||
205 | .imm = IMM }) | ||
206 | |||
207 | /* Program exit */ | ||
208 | |||
209 | #define BPF_EXIT_INSN() \ | ||
210 | ((struct bpf_insn) { \ | ||
211 | .code = BPF_JMP | BPF_EXIT, \ | ||
212 | .dst_reg = 0, \ | ||
213 | .src_reg = 0, \ | ||
214 | .off = 0, \ | ||
215 | .imm = 0 }) | ||
216 | |||
217 | #endif | ||
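As a minimal usage sketch (editorial addition, not part of the header above): the macros expand to plain struct bpf_insn initializers, so a program is just an array of them. The snippet below assembles a trivial "return 0" socket filter and loads it through libbpf's legacy bpf_load_program() helper; the function name load_drop_all() is purely illustrative, and error handling is left to the caller.

    #include <linux/bpf.h>
    #include <bpf/bpf.h>
    #include "bpf_insn.h"

    /* Build and load a socket filter that always returns 0 (drop). */
    static int load_drop_all(void)
    {
            struct bpf_insn prog[] = {
                    BPF_MOV64_IMM(BPF_REG_0, 0),    /* r0 = 0    */
                    BPF_EXIT_INSN(),                /* return r0 */
            };

            return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog,
                                    sizeof(prog) / sizeof(prog[0]),
                                    "GPL", 0, NULL, 0);
    }

On success the return value is a program file descriptor, which could then be attached with setsockopt(SO_ATTACH_BPF) in the way the socket examples in this directory typically do.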
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c new file mode 100644 index 000000000..c5ad528f0 --- /dev/null +++ b/samples/bpf/bpf_load.c | |||
@@ -0,0 +1,667 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <stdio.h> | ||
3 | #include <sys/types.h> | ||
4 | #include <sys/stat.h> | ||
5 | #include <fcntl.h> | ||
6 | #include <libelf.h> | ||
7 | #include <gelf.h> | ||
8 | #include <errno.h> | ||
9 | #include <unistd.h> | ||
10 | #include <string.h> | ||
11 | #include <stdbool.h> | ||
12 | #include <stdlib.h> | ||
13 | #include <linux/bpf.h> | ||
14 | #include <linux/filter.h> | ||
15 | #include <linux/perf_event.h> | ||
16 | #include <linux/netlink.h> | ||
17 | #include <linux/rtnetlink.h> | ||
18 | #include <linux/types.h> | ||
19 | #include <sys/socket.h> | ||
20 | #include <sys/syscall.h> | ||
21 | #include <sys/ioctl.h> | ||
22 | #include <sys/mman.h> | ||
23 | #include <poll.h> | ||
24 | #include <ctype.h> | ||
25 | #include <assert.h> | ||
26 | #include <bpf/bpf.h> | ||
27 | #include "bpf_load.h" | ||
28 | #include "perf-sys.h" | ||
29 | |||
30 | #define DEBUGFS "/sys/kernel/debug/tracing/" | ||
31 | |||
32 | static char license[128]; | ||
33 | static int kern_version; | ||
34 | static bool processed_sec[128]; | ||
35 | char bpf_log_buf[BPF_LOG_BUF_SIZE]; | ||
36 | int map_fd[MAX_MAPS]; | ||
37 | int prog_fd[MAX_PROGS]; | ||
38 | int event_fd[MAX_PROGS]; | ||
39 | int prog_cnt; | ||
40 | int prog_array_fd = -1; | ||
41 | |||
42 | struct bpf_map_data map_data[MAX_MAPS]; | ||
43 | int map_data_count; | ||
44 | |||
45 | static int populate_prog_array(const char *event, int prog_fd) | ||
46 | { | ||
47 | int ind = atoi(event), err; | ||
48 | |||
49 | err = bpf_map_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY); | ||
50 | if (err < 0) { | ||
51 | printf("failed to store prog_fd in prog_array\n"); | ||
52 | return -1; | ||
53 | } | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static int write_kprobe_events(const char *val) | ||
58 | { | ||
59 | int fd, ret, flags; | ||
60 | |||
61 | if (val == NULL) | ||
62 | return -1; | ||
63 | else if (val[0] == '\0') | ||
64 | flags = O_WRONLY | O_TRUNC; | ||
65 | else | ||
66 | flags = O_WRONLY | O_APPEND; | ||
67 | |||
68 | fd = open(DEBUGFS "kprobe_events", flags); | ||
69 | |||
70 | ret = write(fd, val, strlen(val)); | ||
71 | close(fd); | ||
72 | |||
73 | return ret; | ||
74 | } | ||
75 | |||
76 | static int load_and_attach(const char *event, struct bpf_insn *prog, int size) | ||
77 | { | ||
78 | bool is_socket = strncmp(event, "socket", 6) == 0; | ||
79 | bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; | ||
80 | bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; | ||
81 | bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; | ||
82 | bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0; | ||
83 | bool is_xdp = strncmp(event, "xdp", 3) == 0; | ||
84 | bool is_perf_event = strncmp(event, "perf_event", 10) == 0; | ||
85 | bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; | ||
86 | bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; | ||
87 | bool is_sockops = strncmp(event, "sockops", 7) == 0; | ||
88 | bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; | ||
89 | bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0; | ||
90 | size_t insns_cnt = size / sizeof(struct bpf_insn); | ||
91 | enum bpf_prog_type prog_type; | ||
92 | char buf[256]; | ||
93 | int fd, efd, err, id; | ||
94 | struct perf_event_attr attr = {}; | ||
95 | |||
96 | attr.type = PERF_TYPE_TRACEPOINT; | ||
97 | attr.sample_type = PERF_SAMPLE_RAW; | ||
98 | attr.sample_period = 1; | ||
99 | attr.wakeup_events = 1; | ||
100 | |||
101 | if (is_socket) { | ||
102 | prog_type = BPF_PROG_TYPE_SOCKET_FILTER; | ||
103 | } else if (is_kprobe || is_kretprobe) { | ||
104 | prog_type = BPF_PROG_TYPE_KPROBE; | ||
105 | } else if (is_tracepoint) { | ||
106 | prog_type = BPF_PROG_TYPE_TRACEPOINT; | ||
107 | } else if (is_raw_tracepoint) { | ||
108 | prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT; | ||
109 | } else if (is_xdp) { | ||
110 | prog_type = BPF_PROG_TYPE_XDP; | ||
111 | } else if (is_perf_event) { | ||
112 | prog_type = BPF_PROG_TYPE_PERF_EVENT; | ||
113 | } else if (is_cgroup_skb) { | ||
114 | prog_type = BPF_PROG_TYPE_CGROUP_SKB; | ||
115 | } else if (is_cgroup_sk) { | ||
116 | prog_type = BPF_PROG_TYPE_CGROUP_SOCK; | ||
117 | } else if (is_sockops) { | ||
118 | prog_type = BPF_PROG_TYPE_SOCK_OPS; | ||
119 | } else if (is_sk_skb) { | ||
120 | prog_type = BPF_PROG_TYPE_SK_SKB; | ||
121 | } else if (is_sk_msg) { | ||
122 | prog_type = BPF_PROG_TYPE_SK_MSG; | ||
123 | } else { | ||
124 | printf("Unknown event '%s'\n", event); | ||
125 | return -1; | ||
126 | } | ||
127 | |||
128 | if (prog_cnt == MAX_PROGS) | ||
129 | return -1; | ||
130 | |||
131 | fd = bpf_load_program(prog_type, prog, insns_cnt, license, kern_version, | ||
132 | bpf_log_buf, BPF_LOG_BUF_SIZE); | ||
133 | if (fd < 0) { | ||
134 | printf("bpf_load_program() err=%d\n%s", errno, bpf_log_buf); | ||
135 | return -1; | ||
136 | } | ||
137 | |||
138 | prog_fd[prog_cnt++] = fd; | ||
139 | |||
140 | if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) | ||
141 | return 0; | ||
142 | |||
143 | if (is_socket || is_sockops || is_sk_skb || is_sk_msg) { | ||
144 | if (is_socket) | ||
145 | event += 6; | ||
146 | else | ||
147 | event += 7; | ||
148 | if (*event != '/') | ||
149 | return 0; | ||
150 | event++; | ||
151 | if (!isdigit(*event)) { | ||
152 | printf("invalid prog number\n"); | ||
153 | return -1; | ||
154 | } | ||
155 | return populate_prog_array(event, fd); | ||
156 | } | ||
157 | |||
158 | if (is_raw_tracepoint) { | ||
159 | efd = bpf_raw_tracepoint_open(event + 15, fd); | ||
160 | if (efd < 0) { | ||
161 | printf("tracepoint %s %s\n", event + 15, strerror(errno)); | ||
162 | return -1; | ||
163 | } | ||
164 | event_fd[prog_cnt - 1] = efd; | ||
165 | return 0; | ||
166 | } | ||
167 | |||
168 | if (is_kprobe || is_kretprobe) { | ||
169 | bool need_normal_check = true; | ||
170 | const char *event_prefix = ""; | ||
171 | |||
172 | if (is_kprobe) | ||
173 | event += 7; | ||
174 | else | ||
175 | event += 10; | ||
176 | |||
177 | if (*event == 0) { | ||
178 | printf("event name cannot be empty\n"); | ||
179 | return -1; | ||
180 | } | ||
181 | |||
182 | if (isdigit(*event)) | ||
183 | return populate_prog_array(event, fd); | ||
184 | |||
185 | #ifdef __x86_64__ | ||
186 | if (strncmp(event, "sys_", 4) == 0) { | ||
187 | snprintf(buf, sizeof(buf), "%c:__x64_%s __x64_%s", | ||
188 | is_kprobe ? 'p' : 'r', event, event); | ||
189 | err = write_kprobe_events(buf); | ||
190 | if (err >= 0) { | ||
191 | need_normal_check = false; | ||
192 | event_prefix = "__x64_"; | ||
193 | } | ||
194 | } | ||
195 | #endif | ||
196 | if (need_normal_check) { | ||
197 | snprintf(buf, sizeof(buf), "%c:%s %s", | ||
198 | is_kprobe ? 'p' : 'r', event, event); | ||
199 | err = write_kprobe_events(buf); | ||
200 | if (err < 0) { | ||
201 | printf("failed to create kprobe '%s' error '%s'\n", | ||
202 | event, strerror(errno)); | ||
203 | return -1; | ||
204 | } | ||
205 | } | ||
206 | |||
207 | strcpy(buf, DEBUGFS); | ||
208 | strcat(buf, "events/kprobes/"); | ||
209 | strcat(buf, event_prefix); | ||
210 | strcat(buf, event); | ||
211 | strcat(buf, "/id"); | ||
212 | } else if (is_tracepoint) { | ||
213 | event += 11; | ||
214 | |||
215 | if (*event == 0) { | ||
216 | printf("event name cannot be empty\n"); | ||
217 | return -1; | ||
218 | } | ||
219 | strcpy(buf, DEBUGFS); | ||
220 | strcat(buf, "events/"); | ||
221 | strcat(buf, event); | ||
222 | strcat(buf, "/id"); | ||
223 | } | ||
224 | |||
225 | efd = open(buf, O_RDONLY, 0); | ||
226 | if (efd < 0) { | ||
227 | printf("failed to open event %s\n", event); | ||
228 | return -1; | ||
229 | } | ||
230 | |||
231 | err = read(efd, buf, sizeof(buf)); | ||
232 | if (err < 0 || err >= sizeof(buf)) { | ||
233 | printf("read from '%s' failed '%s'\n", event, strerror(errno)); | ||
234 | return -1; | ||
235 | } | ||
236 | |||
237 | close(efd); | ||
238 | |||
239 | buf[err] = 0; | ||
240 | id = atoi(buf); | ||
241 | attr.config = id; | ||
242 | |||
243 | efd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0); | ||
244 | if (efd < 0) { | ||
245 | printf("event %d fd %d err %s\n", id, efd, strerror(errno)); | ||
246 | return -1; | ||
247 | } | ||
248 | event_fd[prog_cnt - 1] = efd; | ||
249 | err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0); | ||
250 | if (err < 0) { | ||
251 | printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n", | ||
252 | strerror(errno)); | ||
253 | return -1; | ||
254 | } | ||
255 | err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd); | ||
256 | if (err < 0) { | ||
257 | printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n", | ||
258 | strerror(errno)); | ||
259 | return -1; | ||
260 | } | ||
261 | |||
262 | return 0; | ||
263 | } | ||
264 | |||
265 | static int load_maps(struct bpf_map_data *maps, int nr_maps, | ||
266 | fixup_map_cb fixup_map) | ||
267 | { | ||
268 | int i, numa_node; | ||
269 | |||
270 | for (i = 0; i < nr_maps; i++) { | ||
271 | if (fixup_map) { | ||
272 | fixup_map(&maps[i], i); | ||
273 | /* Allow userspace to assign map FD prior to creation */ | ||
274 | if (maps[i].fd != -1) { | ||
275 | map_fd[i] = maps[i].fd; | ||
276 | continue; | ||
277 | } | ||
278 | } | ||
279 | |||
280 | numa_node = maps[i].def.map_flags & BPF_F_NUMA_NODE ? | ||
281 | maps[i].def.numa_node : -1; | ||
282 | |||
283 | if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || | ||
284 | maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { | ||
285 | int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; | ||
286 | |||
287 | map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type, | ||
288 | maps[i].name, | ||
289 | maps[i].def.key_size, | ||
290 | inner_map_fd, | ||
291 | maps[i].def.max_entries, | ||
292 | maps[i].def.map_flags, | ||
293 | numa_node); | ||
294 | } else { | ||
295 | map_fd[i] = bpf_create_map_node(maps[i].def.type, | ||
296 | maps[i].name, | ||
297 | maps[i].def.key_size, | ||
298 | maps[i].def.value_size, | ||
299 | maps[i].def.max_entries, | ||
300 | maps[i].def.map_flags, | ||
301 | numa_node); | ||
302 | } | ||
303 | if (map_fd[i] < 0) { | ||
304 | printf("failed to create map %d (%s): %d %s\n", | ||
305 | i, maps[i].name, errno, strerror(errno)); | ||
306 | return 1; | ||
307 | } | ||
308 | maps[i].fd = map_fd[i]; | ||
309 | |||
310 | if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY) | ||
311 | prog_array_fd = map_fd[i]; | ||
312 | } | ||
313 | return 0; | ||
314 | } | ||
315 | |||
316 | static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, | ||
317 | GElf_Shdr *shdr, Elf_Data **data) | ||
318 | { | ||
319 | Elf_Scn *scn; | ||
320 | |||
321 | scn = elf_getscn(elf, i); | ||
322 | if (!scn) | ||
323 | return 1; | ||
324 | |||
325 | if (gelf_getshdr(scn, shdr) != shdr) | ||
326 | return 2; | ||
327 | |||
328 | *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name); | ||
329 | if (!*shname || !shdr->sh_size) | ||
330 | return 3; | ||
331 | |||
332 | *data = elf_getdata(scn, 0); | ||
333 | if (!*data || elf_getdata(scn, *data) != NULL) | ||
334 | return 4; | ||
335 | |||
336 | return 0; | ||
337 | } | ||
338 | |||
339 | static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, | ||
340 | GElf_Shdr *shdr, struct bpf_insn *insn, | ||
341 | struct bpf_map_data *maps, int nr_maps) | ||
342 | { | ||
343 | int i, nrels; | ||
344 | |||
345 | nrels = shdr->sh_size / shdr->sh_entsize; | ||
346 | |||
347 | for (i = 0; i < nrels; i++) { | ||
348 | GElf_Sym sym; | ||
349 | GElf_Rel rel; | ||
350 | unsigned int insn_idx; | ||
351 | bool match = false; | ||
352 | int j, map_idx; | ||
353 | |||
354 | gelf_getrel(data, i, &rel); | ||
355 | |||
356 | insn_idx = rel.r_offset / sizeof(struct bpf_insn); | ||
357 | |||
358 | gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym); | ||
359 | |||
360 | if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { | ||
361 | printf("invalid relo for insn[%d].code 0x%x\n", | ||
362 | insn_idx, insn[insn_idx].code); | ||
363 | return 1; | ||
364 | } | ||
365 | insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; | ||
366 | |||
367 | /* Match FD relocation against recorded map_data[] offset */ | ||
368 | for (map_idx = 0; map_idx < nr_maps; map_idx++) { | ||
369 | if (maps[map_idx].elf_offset == sym.st_value) { | ||
370 | match = true; | ||
371 | break; | ||
372 | } | ||
373 | } | ||
374 | if (match) { | ||
375 | insn[insn_idx].imm = maps[map_idx].fd; | ||
376 | } else { | ||
377 | printf("invalid relo for insn[%d] no map_data match\n", | ||
378 | insn_idx); | ||
379 | return 1; | ||
380 | } | ||
381 | } | ||
382 | |||
383 | return 0; | ||
384 | } | ||
385 | |||
386 | static int cmp_symbols(const void *l, const void *r) | ||
387 | { | ||
388 | const GElf_Sym *lsym = (const GElf_Sym *)l; | ||
389 | const GElf_Sym *rsym = (const GElf_Sym *)r; | ||
390 | |||
391 | if (lsym->st_value < rsym->st_value) | ||
392 | return -1; | ||
393 | else if (lsym->st_value > rsym->st_value) | ||
394 | return 1; | ||
395 | else | ||
396 | return 0; | ||
397 | } | ||
398 | |||
399 | static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, | ||
400 | Elf *elf, Elf_Data *symbols, int strtabidx) | ||
401 | { | ||
402 | int map_sz_elf, map_sz_copy; | ||
403 | bool validate_zero = false; | ||
404 | Elf_Data *data_maps; | ||
405 | int i, nr_maps; | ||
406 | GElf_Sym *sym; | ||
407 | Elf_Scn *scn; | ||
408 | int copy_sz; | ||
409 | |||
410 | if (maps_shndx < 0) | ||
411 | return -EINVAL; | ||
412 | if (!symbols) | ||
413 | return -EINVAL; | ||
414 | |||
415 | /* Get data for maps section via elf index */ | ||
416 | scn = elf_getscn(elf, maps_shndx); | ||
417 | if (scn) | ||
418 | data_maps = elf_getdata(scn, NULL); | ||
419 | if (!scn || !data_maps) { | ||
420 | printf("Failed to get Elf_Data from maps section %d\n", | ||
421 | maps_shndx); | ||
422 | return -EINVAL; | ||
423 | } | ||
424 | |||
425 | /* For each map get corresponding symbol table entry */ | ||
426 | sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym)); | ||
427 | for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { | ||
428 | assert(nr_maps < MAX_MAPS+1); | ||
429 | if (!gelf_getsym(symbols, i, &sym[nr_maps])) | ||
430 | continue; | ||
431 | if (sym[nr_maps].st_shndx != maps_shndx) | ||
432 | continue; | ||
433 | /* Only increment if the symbol is in the maps section */ | ||
434 | nr_maps++; | ||
435 | } | ||
436 | |||
437 | /* Align to map_fd[] order, via sort on offset in sym.st_value */ | ||
438 | qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols); | ||
439 | |||
440 | /* Keeping compatible with ELF maps section changes | ||
441 | * ------------------------------------------------ | ||
442 | * The size of struct bpf_load_map_def is known by the loader | ||
443 | * code, but the struct stored in the ELF file can be different. | ||
444 | * | ||
445 | * Unfortunately sym[i].st_size is zero. To calculate the | ||
446 | * struct size stored in the ELF file, assume all structs have | ||
447 | * the same size, and simply divide by the number of map | ||
448 | * symbols. | ||
449 | */ | ||
450 | map_sz_elf = data_maps->d_size / nr_maps; | ||
451 | map_sz_copy = sizeof(struct bpf_load_map_def); | ||
452 | if (map_sz_elf < map_sz_copy) { | ||
453 | /* | ||
454 | * Backward compat, loading older ELF file with | ||
455 | * smaller struct, keeping remaining bytes zero. | ||
456 | */ | ||
457 | map_sz_copy = map_sz_elf; | ||
458 | } else if (map_sz_elf > map_sz_copy) { | ||
459 | /* | ||
460 | * Forward compat, loading newer ELF file with larger | ||
461 | * struct with unknown features. Assume zero means | ||
462 | * feature not used. Thus, validate rest of struct | ||
463 | * data is zero. | ||
464 | */ | ||
465 | validate_zero = true; | ||
466 | } | ||
467 | |||
468 | /* Memcpy relevant part of ELF maps data to loader maps */ | ||
469 | for (i = 0; i < nr_maps; i++) { | ||
470 | struct bpf_load_map_def *def; | ||
471 | unsigned char *addr, *end; | ||
472 | const char *map_name; | ||
473 | size_t offset; | ||
474 | |||
475 | map_name = elf_strptr(elf, strtabidx, sym[i].st_name); | ||
476 | maps[i].name = strdup(map_name); | ||
477 | if (!maps[i].name) { | ||
478 | printf("strdup(%s): %s(%d)\n", map_name, | ||
479 | strerror(errno), errno); | ||
480 | free(sym); | ||
481 | return -errno; | ||
482 | } | ||
483 | |||
484 | /* Symbol value is offset into ELF maps section data area */ | ||
485 | offset = sym[i].st_value; | ||
486 | def = (struct bpf_load_map_def *)(data_maps->d_buf + offset); | ||
487 | maps[i].elf_offset = offset; | ||
488 | memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def)); | ||
489 | memcpy(&maps[i].def, def, map_sz_copy); | ||
490 | |||
491 | /* Verify no newer features were requested */ | ||
492 | if (validate_zero) { | ||
493 | addr = (unsigned char *) def + map_sz_copy; | ||
494 | end = (unsigned char *) def + map_sz_elf; | ||
495 | for (; addr < end; addr++) { | ||
496 | if (*addr != 0) { | ||
497 | free(sym); | ||
498 | return -EFBIG; | ||
499 | } | ||
500 | } | ||
501 | } | ||
502 | } | ||
503 | |||
504 | free(sym); | ||
505 | return nr_maps; | ||
506 | } | ||
507 | |||
508 | static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) | ||
509 | { | ||
510 | int fd, i, ret, maps_shndx = -1, strtabidx = -1; | ||
511 | Elf *elf; | ||
512 | GElf_Ehdr ehdr; | ||
513 | GElf_Shdr shdr, shdr_prog; | ||
514 | Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL; | ||
515 | char *shname, *shname_prog; | ||
516 | int nr_maps = 0; | ||
517 | |||
518 | /* reset global variables */ | ||
519 | kern_version = 0; | ||
520 | memset(license, 0, sizeof(license)); | ||
521 | memset(processed_sec, 0, sizeof(processed_sec)); | ||
522 | |||
523 | if (elf_version(EV_CURRENT) == EV_NONE) | ||
524 | return 1; | ||
525 | |||
526 | fd = open(path, O_RDONLY, 0); | ||
527 | if (fd < 0) | ||
528 | return 1; | ||
529 | |||
530 | elf = elf_begin(fd, ELF_C_READ, NULL); | ||
531 | |||
532 | if (!elf) | ||
533 | return 1; | ||
534 | |||
535 | if (gelf_getehdr(elf, &ehdr) != &ehdr) | ||
536 | return 1; | ||
537 | |||
538 | /* clear all kprobes */ | ||
539 | i = write_kprobe_events(""); | ||
540 | |||
541 | /* scan over all elf sections to get license and map info */ | ||
542 | for (i = 1; i < ehdr.e_shnum; i++) { | ||
543 | |||
544 | if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) | ||
545 | continue; | ||
546 | |||
547 | if (0) /* helpful for llvm debugging */ | ||
548 | printf("section %d:%s data %p size %zd link %d flags %d\n", | ||
549 | i, shname, data->d_buf, data->d_size, | ||
550 | shdr.sh_link, (int) shdr.sh_flags); | ||
551 | |||
552 | if (strcmp(shname, "license") == 0) { | ||
553 | processed_sec[i] = true; | ||
554 | memcpy(license, data->d_buf, data->d_size); | ||
555 | } else if (strcmp(shname, "version") == 0) { | ||
556 | processed_sec[i] = true; | ||
557 | if (data->d_size != sizeof(int)) { | ||
558 | printf("invalid size of version section %zd\n", | ||
559 | data->d_size); | ||
560 | return 1; | ||
561 | } | ||
562 | memcpy(&kern_version, data->d_buf, sizeof(int)); | ||
563 | } else if (strcmp(shname, "maps") == 0) { | ||
564 | int j; | ||
565 | |||
566 | maps_shndx = i; | ||
567 | data_maps = data; | ||
568 | for (j = 0; j < MAX_MAPS; j++) | ||
569 | map_data[j].fd = -1; | ||
570 | } else if (shdr.sh_type == SHT_SYMTAB) { | ||
571 | strtabidx = shdr.sh_link; | ||
572 | symbols = data; | ||
573 | } | ||
574 | } | ||
575 | |||
576 | ret = 1; | ||
577 | |||
578 | if (!symbols) { | ||
579 | printf("missing SHT_SYMTAB section\n"); | ||
580 | goto done; | ||
581 | } | ||
582 | |||
583 | if (data_maps) { | ||
584 | nr_maps = load_elf_maps_section(map_data, maps_shndx, | ||
585 | elf, symbols, strtabidx); | ||
586 | if (nr_maps < 0) { | ||
587 | printf("Error: Failed loading ELF maps (errno:%d):%s\n", | ||
588 | nr_maps, strerror(-nr_maps)); | ||
589 | goto done; | ||
590 | } | ||
591 | if (load_maps(map_data, nr_maps, fixup_map)) | ||
592 | goto done; | ||
593 | map_data_count = nr_maps; | ||
594 | |||
595 | processed_sec[maps_shndx] = true; | ||
596 | } | ||
597 | |||
598 | /* process all relo sections, and rewrite bpf insns for maps */ | ||
599 | for (i = 1; i < ehdr.e_shnum; i++) { | ||
600 | if (processed_sec[i]) | ||
601 | continue; | ||
602 | |||
603 | if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) | ||
604 | continue; | ||
605 | |||
606 | if (shdr.sh_type == SHT_REL) { | ||
607 | struct bpf_insn *insns; | ||
608 | |||
609 | /* locate prog sec that need map fixup (relocations) */ | ||
610 | if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog, | ||
611 | &shdr_prog, &data_prog)) | ||
612 | continue; | ||
613 | |||
614 | if (shdr_prog.sh_type != SHT_PROGBITS || | ||
615 | !(shdr_prog.sh_flags & SHF_EXECINSTR)) | ||
616 | continue; | ||
617 | |||
618 | insns = (struct bpf_insn *) data_prog->d_buf; | ||
619 | processed_sec[i] = true; /* relo section */ | ||
620 | |||
621 | if (parse_relo_and_apply(data, symbols, &shdr, insns, | ||
622 | map_data, nr_maps)) | ||
623 | continue; | ||
624 | } | ||
625 | } | ||
626 | |||
627 | /* load programs */ | ||
628 | for (i = 1; i < ehdr.e_shnum; i++) { | ||
629 | |||
630 | if (processed_sec[i]) | ||
631 | continue; | ||
632 | |||
633 | if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) | ||
634 | continue; | ||
635 | |||
636 | if (memcmp(shname, "kprobe/", 7) == 0 || | ||
637 | memcmp(shname, "kretprobe/", 10) == 0 || | ||
638 | memcmp(shname, "tracepoint/", 11) == 0 || | ||
639 | memcmp(shname, "raw_tracepoint/", 15) == 0 || | ||
640 | memcmp(shname, "xdp", 3) == 0 || | ||
641 | memcmp(shname, "perf_event", 10) == 0 || | ||
642 | memcmp(shname, "socket", 6) == 0 || | ||
643 | memcmp(shname, "cgroup/", 7) == 0 || | ||
644 | memcmp(shname, "sockops", 7) == 0 || | ||
645 | memcmp(shname, "sk_skb", 6) == 0 || | ||
646 | memcmp(shname, "sk_msg", 6) == 0) { | ||
647 | ret = load_and_attach(shname, data->d_buf, | ||
648 | data->d_size); | ||
649 | if (ret != 0) | ||
650 | goto done; | ||
651 | } | ||
652 | } | ||
653 | |||
654 | done: | ||
655 | close(fd); | ||
656 | return ret; | ||
657 | } | ||
658 | |||
659 | int load_bpf_file(char *path) | ||
660 | { | ||
661 | return do_load_bpf_file(path, NULL); | ||
662 | } | ||
663 | |||
664 | int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map) | ||
665 | { | ||
666 | return do_load_bpf_file(path, fixup_map); | ||
667 | } | ||
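For orientation, here is a small hypothetical sketch of how a sample's user-space half typically drives this loader: it loads a compiled *_kern.o and then reads the first map created from the object's "maps" section through the global map_fd[] array. The object name example_kern.o and the int-key/long-long-value layout are placeholder assumptions.

/* Hypothetical consumer of bpf_load.c (placeholder object name and map layout). */
#include <stdio.h>
#include <bpf/bpf.h>
#include "bpf_load.h"

int main(void)
{
	int key = 0;
	long long value = 0;

	if (load_bpf_file("example_kern.o")) {
		printf("%s", bpf_log_buf);	/* verifier log, if a program failed to load */
		return 1;
	}

	/* map_fd[0] is the first map defined in the object's "maps" section */
	if (bpf_map_lookup_elem(map_fd[0], &key, &value) == 0)
		printf("key %d -> %lld\n", key, value);

	return 0;
}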
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h new file mode 100644 index 000000000..4fcd258c6 --- /dev/null +++ b/samples/bpf/bpf_load.h | |||
@@ -0,0 +1,57 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | #ifndef __BPF_LOAD_H | ||
3 | #define __BPF_LOAD_H | ||
4 | |||
5 | #include <bpf/bpf.h> | ||
6 | |||
7 | #define MAX_MAPS 32 | ||
8 | #define MAX_PROGS 32 | ||
9 | |||
10 | struct bpf_load_map_def { | ||
11 | unsigned int type; | ||
12 | unsigned int key_size; | ||
13 | unsigned int value_size; | ||
14 | unsigned int max_entries; | ||
15 | unsigned int map_flags; | ||
16 | unsigned int inner_map_idx; | ||
17 | unsigned int numa_node; | ||
18 | }; | ||
19 | |||
20 | struct bpf_map_data { | ||
21 | int fd; | ||
22 | char *name; | ||
23 | size_t elf_offset; | ||
24 | struct bpf_load_map_def def; | ||
25 | }; | ||
26 | |||
27 | typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx); | ||
28 | |||
29 | extern int prog_fd[MAX_PROGS]; | ||
30 | extern int event_fd[MAX_PROGS]; | ||
31 | extern char bpf_log_buf[BPF_LOG_BUF_SIZE]; | ||
32 | extern int prog_cnt; | ||
33 | |||
34 | /* There is a one-to-one mapping between map_fd[] and map_data[]. | ||
35 | * The map_data[] entry just carries richer info about the given map. | ||
36 | */ | ||
37 | extern int map_fd[MAX_MAPS]; | ||
38 | extern struct bpf_map_data map_data[MAX_MAPS]; | ||
39 | extern int map_data_count; | ||
40 | |||
41 | /* parses elf file compiled by llvm .c->.o | ||
42 | * . parses 'maps' section and creates maps via BPF syscall | ||
43 | * . parses 'license' section and passes it to syscall | ||
44 | * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by | ||
45 | * storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD | ||
46 | * . loads eBPF programs via BPF syscall | ||
47 | * | ||
48 | * One ELF file can contain multiple BPF programs which will be loaded | ||
49 | * and their FDs stored in the prog_fd array | ||
50 | * | ||
51 | * returns zero on success | ||
52 | */ | ||
53 | int load_bpf_file(char *path); | ||
54 | int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map); | ||
55 | |||
56 | int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); | ||
57 | #endif | ||
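The fixup_map_cb hook declared above is invoked once per map before the map is created, so a caller can adjust a map definition (or supply an already-created fd) at run time. A minimal hypothetical sketch, in which the object name and the max_entries override are illustrative assumptions:

/* Hypothetical use of load_bpf_file_fixup_map(): inspect and tweak map defs. */
#include <stdio.h>
#include "bpf_load.h"

static void tune_map(struct bpf_map_data *map, int idx)
{
	printf("map %d: %s (max_entries=%u)\n", idx, map->name,
	       map->def.max_entries);
	if (idx == 0)
		map->def.max_entries = 4096;	/* example override before creation */
}

int main(void)
{
	return load_bpf_file_fixup_map("example_kern.o", tune_map) ? 1 : 0;
}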
diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c new file mode 100644 index 000000000..deb0e3e03 --- /dev/null +++ b/samples/bpf/cookie_uid_helper_example.c | |||
@@ -0,0 +1,323 @@ | |||
1 | /* This test is a demo of using get_socket_uid and get_socket_cookie | ||
2 | * helper function to do per socket based network traffic monitoring. | ||
3 | * It requires iptables version higher than 1.6.1 to load a pinned eBPF | ||
4 | * program into the xt_bpf match. | ||
5 | * | ||
6 | * TEST: | ||
7 | * ./run_cookie_uid_helper_example.sh -option | ||
8 | * option: | ||
9 | * -t: do the traffic monitoring test; the program will continuously | ||
10 | * print out the network traffic that happens after the program started. A sample | ||
11 | * output is shown below: | ||
12 | * | ||
13 | * cookie: 877, uid: 0x3e8, Packet Count: 20, Bytes Count: 11058 | ||
14 | * cookie: 132, uid: 0x0, Packet Count: 2, Bytes Count: 286 | ||
15 | * cookie: 812, uid: 0x3e8, Packet Count: 3, Bytes Count: 1726 | ||
16 | * cookie: 802, uid: 0x3e8, Packet Count: 2, Bytes Count: 104 | ||
17 | * cookie: 877, uid: 0x3e8, Packet Count: 20, Bytes Count: 11058 | ||
18 | * cookie: 831, uid: 0x3e8, Packet Count: 2, Bytes Count: 104 | ||
19 | * cookie: 0, uid: 0x0, Packet Count: 6, Bytes Count: 712 | ||
20 | * cookie: 880, uid: 0xfffe, Packet Count: 1, Bytes Count: 70 | ||
21 | * | ||
22 | * -s: do the getsockopt SO_COOKIE test; the program will set up a pair of | ||
23 | * UDP sockets and send packets between them, then read out the traffic data | ||
24 | * directly from the eBPF map based on the socket cookie. | ||
25 | * | ||
26 | * Clean up: if run via the shell script, the script will delete the iptables | ||
27 | * rule and remove the pinned bpf program on exit. Otherwise the iptables rule | ||
28 | * needs to be deleted by hand; see run_cookie_uid_helper_example.sh for details. | ||
29 | */ | ||
30 | |||
31 | #define _GNU_SOURCE | ||
32 | |||
33 | #define offsetof(type, member) __builtin_offsetof(type, member) | ||
34 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) | ||
35 | |||
36 | #include <arpa/inet.h> | ||
37 | #include <errno.h> | ||
38 | #include <error.h> | ||
39 | #include <limits.h> | ||
40 | #include <linux/bpf.h> | ||
41 | #include <linux/if_ether.h> | ||
42 | #include <net/if.h> | ||
43 | #include <signal.h> | ||
44 | #include <stdbool.h> | ||
45 | #include <stdint.h> | ||
46 | #include <stdio.h> | ||
47 | #include <stdlib.h> | ||
48 | #include <string.h> | ||
49 | #include <sys/socket.h> | ||
50 | #include <sys/stat.h> | ||
51 | #include <sys/types.h> | ||
52 | #include <unistd.h> | ||
53 | #include <bpf/bpf.h> | ||
54 | #include "bpf_insn.h" | ||
55 | |||
56 | #define PORT 8888 | ||
57 | |||
58 | struct stats { | ||
59 | uint32_t uid; | ||
60 | uint64_t packets; | ||
61 | uint64_t bytes; | ||
62 | }; | ||
63 | |||
64 | static int map_fd, prog_fd; | ||
65 | |||
66 | static bool test_finish; | ||
67 | |||
68 | static void maps_create(void) | ||
69 | { | ||
70 | map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(uint32_t), | ||
71 | sizeof(struct stats), 100, 0); | ||
72 | if (map_fd < 0) | ||
73 | error(1, errno, "map create failed!\n"); | ||
74 | } | ||
75 | |||
76 | static void prog_load(void) | ||
77 | { | ||
78 | static char log_buf[1 << 16]; | ||
79 | |||
80 | struct bpf_insn prog[] = { | ||
81 | /* | ||
82 | * Save the sk_buff for future usage. Values stored in R6 to R10 will | ||
83 | * not be reset by a bpf helper function call. | ||
84 | */ | ||
85 | BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), | ||
86 | /* | ||
87 | * pc1: BPF_FUNC_get_socket_cookie takes one parameter, | ||
88 | * R1: sk_buff | ||
89 | */ | ||
90 | BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, | ||
91 | BPF_FUNC_get_socket_cookie), | ||
92 | /* pc2-4: save &socket_cookie to r7 for future usage */ | ||
93 | BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), | ||
94 | BPF_MOV64_REG(BPF_REG_7, BPF_REG_10), | ||
95 | BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8), | ||
96 | /* | ||
97 | * pc5-8: set up the registers for BPF_FUNC_map_lookup_elem, | ||
98 | * it takes two parameters (R1: map_fd, R2: &socket_cookie) | ||
99 | */ | ||
100 | BPF_LD_MAP_FD(BPF_REG_1, map_fd), | ||
101 | BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), | ||
102 | BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, | ||
103 | BPF_FUNC_map_lookup_elem), | ||
104 | /* | ||
105 | * pc9: if r0 != 0x0, go to pc+14, since we have the cookie | ||
106 | * stored already. | ||
107 | * Otherwise run pc10-22 to set up a new data entry. | ||
108 | */ | ||
109 | BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 14), | ||
110 | BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), | ||
111 | BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, | ||
112 | BPF_FUNC_get_socket_uid), | ||
113 | /* | ||
114 | * Place a struct stats on the stack (relative to R10) and fill | ||
115 | * in its members sequentially. The packets value | ||
116 | * is set by directly storing the IMM value 1 on the stack. | ||
117 | */ | ||
118 | BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, | ||
119 | -32 + (__s16)offsetof(struct stats, uid)), | ||
120 | BPF_ST_MEM(BPF_DW, BPF_REG_10, | ||
121 | -32 + (__s16)offsetof(struct stats, packets), 1), | ||
122 | /* | ||
123 | * __sk_buff is a special struct used by eBPF programs to | ||
124 | * directly access some sk_buff fields. | ||
125 | */ | ||
126 | BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, | ||
127 | offsetof(struct __sk_buff, len)), | ||
128 | BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, | ||
129 | -32 + (__s16)offsetof(struct stats, bytes)), | ||
130 | /* | ||
131 | * add a new map entry using BPF_FUNC_map_update_elem; it takes | ||
132 | * 4 parameters (R1: map_fd, R2: &socket_cookie, R3: &stats, | ||
133 | * R4: flags) | ||
134 | */ | ||
135 | BPF_LD_MAP_FD(BPF_REG_1, map_fd), | ||
136 | BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), | ||
137 | BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), | ||
138 | BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -32), | ||
139 | BPF_MOV64_IMM(BPF_REG_4, 0), | ||
140 | BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, | ||
141 | BPF_FUNC_map_update_elem), | ||
142 | BPF_JMP_IMM(BPF_JA, 0, 0, 5), | ||
143 | /* | ||
144 | * pc24-30 update the packet info in an existing data entry; this can | ||
145 | * be done by writing directly through the pointer instead of using | ||
146 | * the BPF_FUNC_map_update_elem helper function | ||
147 | */ | ||
148 | BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), | ||
149 | BPF_MOV64_IMM(BPF_REG_1, 1), | ||
150 | BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1, | ||
151 | offsetof(struct stats, packets)), | ||
152 | BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, | ||
153 | offsetof(struct __sk_buff, len)), | ||
154 | BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1, | ||
155 | offsetof(struct stats, bytes)), | ||
156 | BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, | ||
157 | offsetof(struct __sk_buff, len)), | ||
158 | BPF_EXIT_INSN(), | ||
159 | }; | ||
160 | prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, | ||
161 | ARRAY_SIZE(prog), "GPL", 0, | ||
162 | log_buf, sizeof(log_buf)); | ||
163 | if (prog_fd < 0) | ||
164 | error(1, errno, "failed to load prog\n%s\n", log_buf); | ||
165 | } | ||
166 | |||
167 | static void prog_attach_iptables(char *file) | ||
168 | { | ||
169 | int ret; | ||
170 | char rules[100]; | ||
171 | |||
172 | if (bpf_obj_pin(prog_fd, file)) | ||
173 | error(1, errno, "bpf_obj_pin"); | ||
174 | if (strlen(file) > 50) { | ||
175 | printf("file path too long: %s\n", file); | ||
176 | exit(1); | ||
177 | } | ||
178 | sprintf(rules, "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT", | ||
179 | file); | ||
180 | ret = system(rules); | ||
181 | if (ret < 0) { | ||
182 | printf("iptables rule update failed: %d\n", WEXITSTATUS(ret)); | ||
183 | exit(1); | ||
184 | } | ||
185 | } | ||
186 | |||
187 | static void print_table(void) | ||
188 | { | ||
189 | struct stats curEntry; | ||
190 | uint32_t curN = UINT32_MAX; | ||
191 | uint32_t nextN; | ||
192 | int res; | ||
193 | |||
194 | while (bpf_map_get_next_key(map_fd, &curN, &nextN) > -1) { | ||
195 | curN = nextN; | ||
196 | res = bpf_map_lookup_elem(map_fd, &curN, &curEntry); | ||
197 | if (res < 0) { | ||
198 | error(1, errno, "fail to get entry value of Key: %u\n", | ||
199 | curN); | ||
200 | } else { | ||
201 | printf("cookie: %u, uid: 0x%x, Packet Count: %lu," | ||
202 | " Bytes Count: %lu\n", curN, curEntry.uid, | ||
203 | curEntry.packets, curEntry.bytes); | ||
204 | } | ||
205 | } | ||
206 | } | ||
207 | |||
208 | static void udp_client(void) | ||
209 | { | ||
210 | struct sockaddr_in si_other = {0}; | ||
211 | struct sockaddr_in si_me = {0}; | ||
212 | struct stats dataEntry; | ||
213 | int s_rcv, s_send, i, recv_len; | ||
214 | char message = 'a'; | ||
215 | char buf; | ||
216 | uint64_t cookie; | ||
217 | int res; | ||
218 | socklen_t cookie_len = sizeof(cookie); | ||
219 | socklen_t slen = sizeof(si_other); | ||
220 | |||
221 | s_rcv = socket(PF_INET, SOCK_DGRAM, 0); | ||
222 | if (s_rcv < 0) | ||
223 | error(1, errno, "rcv socket create failed!\n"); | ||
224 | si_other.sin_family = AF_INET; | ||
225 | si_other.sin_port = htons(PORT); | ||
226 | if (inet_aton("127.0.0.1", &si_other.sin_addr) == 0) | ||
227 | error(1, errno, "inet_aton\n"); | ||
228 | if (bind(s_rcv, (struct sockaddr *)&si_other, sizeof(si_other)) == -1) | ||
229 | error(1, errno, "bind\n"); | ||
230 | s_send = socket(PF_INET, SOCK_DGRAM, 0); | ||
231 | if (s_send < 0) | ||
232 | error(1, errno, "send socket create failed!\n"); | ||
233 | res = getsockopt(s_send, SOL_SOCKET, SO_COOKIE, &cookie, &cookie_len); | ||
234 | if (res < 0) | ||
235 | printf("get cookie failed: %s\n", strerror(errno)); | ||
236 | res = bpf_map_lookup_elem(map_fd, &cookie, &dataEntry); | ||
237 | if (res != -1) | ||
238 | error(1, errno, "socket stat found while flow not active\n"); | ||
239 | for (i = 0; i < 10; i++) { | ||
240 | res = sendto(s_send, &message, sizeof(message), 0, | ||
241 | (struct sockaddr *)&si_other, slen); | ||
242 | if (res == -1) | ||
243 | error(1, errno, "send\n"); | ||
244 | if (res != sizeof(message)) | ||
245 | error(1, 0, "%uB != %luB\n", res, sizeof(message)); | ||
246 | recv_len = recvfrom(s_rcv, &buf, sizeof(buf), 0, | ||
247 | (struct sockaddr *)&si_me, &slen); | ||
248 | if (recv_len < 0) | ||
249 | error(1, errno, "receive\n"); | ||
250 | res = memcmp(&(si_other.sin_addr), &(si_me.sin_addr), | ||
251 | sizeof(si_me.sin_addr)); | ||
252 | if (res != 0) | ||
253 | error(1, EFAULT, "sender addr error: %d\n", res); | ||
254 | printf("Message received: %c\n", buf); | ||
255 | res = bpf_map_lookup_elem(map_fd, &cookie, &dataEntry); | ||
256 | if (res < 0) | ||
257 | error(1, errno, "lookup sk stat failed, cookie: %lu\n", | ||
258 | cookie); | ||
259 | printf("cookie: %lu, uid: 0x%x, Packet Count: %lu," | ||
260 | " Bytes Count: %lu\n\n", cookie, dataEntry.uid, | ||
261 | dataEntry.packets, dataEntry.bytes); | ||
262 | } | ||
263 | close(s_send); | ||
264 | close(s_rcv); | ||
265 | } | ||
266 | |||
267 | static int usage(void) | ||
268 | { | ||
269 | printf("Usage: ./run_cookie_uid_helper_example.sh" | ||
270 | " bpfObjName -option\n" | ||
271 | " -t traffic monitor test\n" | ||
272 | " -s getsockopt cookie test\n"); | ||
273 | return 1; | ||
274 | } | ||
275 | |||
276 | static void finish(int ret) | ||
277 | { | ||
278 | test_finish = true; | ||
279 | } | ||
280 | |||
281 | int main(int argc, char *argv[]) | ||
282 | { | ||
283 | int opt; | ||
284 | bool cfg_test_traffic = false; | ||
285 | bool cfg_test_cookie = false; | ||
286 | |||
287 | if (argc != 3) | ||
288 | return usage(); | ||
289 | while ((opt = getopt(argc, argv, "ts")) != -1) { | ||
290 | switch (opt) { | ||
291 | case 't': | ||
292 | cfg_test_traffic = true; | ||
293 | break; | ||
294 | case 's': | ||
295 | cfg_test_cookie = true; | ||
296 | break; | ||
297 | |||
298 | default: | ||
299 | printf("unknown option %c\n", opt); | ||
300 | usage(); | ||
301 | return -1; | ||
302 | } | ||
303 | } | ||
304 | maps_create(); | ||
305 | prog_load(); | ||
306 | prog_attach_iptables(argv[2]); | ||
307 | if (cfg_test_traffic) { | ||
308 | if (signal(SIGINT, finish) == SIG_ERR) | ||
309 | error(1, errno, "register SIGINT handler failed"); | ||
310 | if (signal(SIGTERM, finish) == SIG_ERR) | ||
311 | error(1, errno, "register SIGTERM handler failed"); | ||
312 | while (!test_finish) { | ||
313 | print_table(); | ||
314 | printf("\n"); | ||
315 | sleep(1); | ||
316 | }; | ||
317 | } else if (cfg_test_cookie) { | ||
318 | udp_client(); | ||
319 | } | ||
320 | close(prog_fd); | ||
321 | close(map_fd); | ||
322 | return 0; | ||
323 | } | ||
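The user-space half of this sample keys the hash map by the value returned from getsockopt(SO_COOKIE), the same cookie the eBPF program fetches with bpf_get_socket_cookie(). Below is a stripped-down hypothetical sketch of just that query; the SO_COOKIE fallback value 57 is an assumption taken from asm-generic/socket.h on common architectures.

/* Hypothetical sketch: fetch a socket's cookie from user space. */
#include <stdio.h>
#include <stdint.h>
#include <sys/socket.h>

#ifndef SO_COOKIE
#define SO_COOKIE 57	/* assumed asm-generic value; see <asm-generic/socket.h> */
#endif

int main(void)
{
	uint64_t cookie = 0;
	socklen_t len = sizeof(cookie);
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	if (s < 0)
		return 1;
	if (getsockopt(s, SOL_SOCKET, SO_COOKIE, &cookie, &len) == 0)
		printf("socket cookie: %llu\n", (unsigned long long)cookie);
	return 0;
}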
diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c new file mode 100644 index 000000000..5aefd19cd --- /dev/null +++ b/samples/bpf/cpustat_kern.c | |||
@@ -0,0 +1,281 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | #include <linux/version.h> | ||
4 | #include <linux/ptrace.h> | ||
5 | #include <uapi/linux/bpf.h> | ||
6 | #include <bpf/bpf_helpers.h> | ||
7 | |||
8 | /* | ||
9 | * The CPU number, cstate number and pstate number are based | ||
10 | * on 96boards Hikey with octa CA53 CPUs. | ||
11 | * | ||
12 | * Every CPU has three idle states for cstate: | ||
13 | * WFI, CPU_OFF, CLUSTER_OFF | ||
14 | * | ||
15 | * Every CPU has 5 operating points: | ||
16 | * 208MHz, 432MHz, 729MHz, 960MHz, 1200MHz | ||
17 | * | ||
18 | * This code is based on these assumptions and other platforms | ||
19 | * need to adjust these definitions. | ||
20 | */ | ||
21 | #define MAX_CPU 8 | ||
22 | #define MAX_PSTATE_ENTRIES 5 | ||
23 | #define MAX_CSTATE_ENTRIES 3 | ||
24 | |||
25 | static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 }; | ||
26 | |||
27 | /* | ||
28 | * The my_map structure is used to record the cstate and pstate index and | ||
29 | * timestamp (Idx, Ts); when a new event comes in, we update the | ||
30 | * combination to the new state index and timestamp (Idx`, Ts`). | ||
31 | * | ||
32 | * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time | ||
33 | * interval for the previous state: Duration(Idx) = Ts` - Ts. | ||
34 | * | ||
35 | * Every CPU has one array, shown below, for recording state index and | ||
36 | * timestamp, with cstate and pstate recorded separately: | ||
37 | * | ||
38 | * +--------------------------+ | ||
39 | * | cstate timestamp | | ||
40 | * +--------------------------+ | ||
41 | * | cstate index | | ||
42 | * +--------------------------+ | ||
43 | * | pstate timestamp | | ||
44 | * +--------------------------+ | ||
45 | * | pstate index | | ||
46 | * +--------------------------+ | ||
47 | */ | ||
48 | #define MAP_OFF_CSTATE_TIME 0 | ||
49 | #define MAP_OFF_CSTATE_IDX 1 | ||
50 | #define MAP_OFF_PSTATE_TIME 2 | ||
51 | #define MAP_OFF_PSTATE_IDX 3 | ||
52 | #define MAP_OFF_NUM 4 | ||
53 | |||
54 | struct { | ||
55 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
56 | __type(key, u32); | ||
57 | __type(value, u64); | ||
58 | __uint(max_entries, MAX_CPU * MAP_OFF_NUM); | ||
59 | } my_map SEC(".maps"); | ||
60 | |||
61 | /* cstate_duration records duration time for every idle state per CPU */ | ||
62 | struct { | ||
63 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
64 | __type(key, u32); | ||
65 | __type(value, u64); | ||
66 | __uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES); | ||
67 | } cstate_duration SEC(".maps"); | ||
68 | |||
69 | /* pstate_duration records duration time for every operating point per CPU */ | ||
70 | struct { | ||
71 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
72 | __type(key, u32); | ||
73 | __type(value, u64); | ||
74 | __uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES); | ||
75 | } pstate_duration SEC(".maps"); | ||
76 | |||
77 | /* | ||
78 | * The trace events for cpu_idle and cpu_frequency are taken from: | ||
79 | * /sys/kernel/debug/tracing/events/power/cpu_idle/format | ||
80 | * /sys/kernel/debug/tracing/events/power/cpu_frequency/format | ||
81 | * | ||
82 | * These two events have the same format, so define one common structure. | ||
83 | */ | ||
84 | struct cpu_args { | ||
85 | u64 pad; | ||
86 | u32 state; | ||
87 | u32 cpu_id; | ||
88 | }; | ||
89 | |||
90 | /* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */ | ||
91 | static u32 find_cpu_pstate_idx(u32 frequency) | ||
92 | { | ||
93 | u32 i; | ||
94 | |||
95 | for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) { | ||
96 | if (frequency == cpu_opps[i]) | ||
97 | return i; | ||
98 | } | ||
99 | |||
100 | return i; | ||
101 | } | ||
102 | |||
103 | SEC("tracepoint/power/cpu_idle") | ||
104 | int bpf_prog1(struct cpu_args *ctx) | ||
105 | { | ||
106 | u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta; | ||
107 | u32 key, cpu, pstate_idx; | ||
108 | u64 *val; | ||
109 | |||
110 | if (ctx->cpu_id >= MAX_CPU) | ||
111 | return 0; | ||
112 | |||
113 | cpu = ctx->cpu_id; | ||
114 | |||
115 | key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME; | ||
116 | cts = bpf_map_lookup_elem(&my_map, &key); | ||
117 | if (!cts) | ||
118 | return 0; | ||
119 | |||
120 | key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; | ||
121 | cstate = bpf_map_lookup_elem(&my_map, &key); | ||
122 | if (!cstate) | ||
123 | return 0; | ||
124 | |||
125 | key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; | ||
126 | pts = bpf_map_lookup_elem(&my_map, &key); | ||
127 | if (!pts) | ||
128 | return 0; | ||
129 | |||
130 | key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; | ||
131 | pstate = bpf_map_lookup_elem(&my_map, &key); | ||
132 | if (!pstate) | ||
133 | return 0; | ||
134 | |||
135 | prev_state = *cstate; | ||
136 | *cstate = ctx->state; | ||
137 | |||
138 | if (!*cts) { | ||
139 | *cts = bpf_ktime_get_ns(); | ||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | cur_ts = bpf_ktime_get_ns(); | ||
144 | delta = cur_ts - *cts; | ||
145 | *cts = cur_ts; | ||
146 | |||
147 | /* | ||
148 | * When state is not equal to (u32)-1, the cpu is entering | ||
149 | * an idle state; in this case we need to record the interval | ||
150 | * for the pstate. | ||
151 | * | ||
152 | * OPP2 | ||
153 | * +---------------------+ | ||
154 | * OPP1 | | | ||
155 | * ---------+ | | ||
156 | * | Idle state | ||
157 | * +--------------- | ||
158 | * | ||
159 | * |<- pstate duration ->| | ||
160 | * ^ ^ | ||
161 | * pts cur_ts | ||
162 | */ | ||
163 | if (ctx->state != (u32)-1) { | ||
164 | |||
165 | /* record pstate only after the first cpu_frequency event */ | ||
166 | if (!*pts) | ||
167 | return 0; | ||
168 | |||
169 | delta = cur_ts - *pts; | ||
170 | |||
171 | pstate_idx = find_cpu_pstate_idx(*pstate); | ||
172 | if (pstate_idx >= MAX_PSTATE_ENTRIES) | ||
173 | return 0; | ||
174 | |||
175 | key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; | ||
176 | val = bpf_map_lookup_elem(&pstate_duration, &key); | ||
177 | if (val) | ||
178 | __sync_fetch_and_add((long *)val, delta); | ||
179 | |||
180 | /* | ||
181 | * When state is equal to (u32)-1, the cpu has just exited from one | ||
182 | * specific idle state; in this case we need to record the | ||
183 | * interval for the cstate. | ||
184 | * | ||
185 | * OPP2 | ||
186 | * -----------+ | ||
187 | * | OPP1 | ||
188 | * | +----------- | ||
189 | * | Idle state | | ||
190 | * +---------------------+ | ||
191 | * | ||
192 | * |<- cstate duration ->| | ||
193 | * ^ ^ | ||
194 | * cts cur_ts | ||
195 | */ | ||
196 | } else { | ||
197 | |||
198 | key = cpu * MAX_CSTATE_ENTRIES + prev_state; | ||
199 | val = bpf_map_lookup_elem(&cstate_duration, &key); | ||
200 | if (val) | ||
201 | __sync_fetch_and_add((long *)val, delta); | ||
202 | } | ||
203 | |||
204 | /* Update timestamp for pstate as new start time */ | ||
205 | if (*pts) | ||
206 | *pts = cur_ts; | ||
207 | |||
208 | return 0; | ||
209 | } | ||
210 | |||
211 | SEC("tracepoint/power/cpu_frequency") | ||
212 | int bpf_prog2(struct cpu_args *ctx) | ||
213 | { | ||
214 | u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta; | ||
215 | u32 key, cpu, pstate_idx; | ||
216 | u64 *val; | ||
217 | |||
218 | cpu = ctx->cpu_id; | ||
219 | |||
220 | key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; | ||
221 | pts = bpf_map_lookup_elem(&my_map, &key); | ||
222 | if (!pts) | ||
223 | return 0; | ||
224 | |||
225 | key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; | ||
226 | pstate = bpf_map_lookup_elem(&my_map, &key); | ||
227 | if (!pstate) | ||
228 | return 0; | ||
229 | |||
230 | key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; | ||
231 | cstate = bpf_map_lookup_elem(&my_map, &key); | ||
232 | if (!cstate) | ||
233 | return 0; | ||
234 | |||
235 | prev_state = *pstate; | ||
236 | *pstate = ctx->state; | ||
237 | |||
238 | if (!*pts) { | ||
239 | *pts = bpf_ktime_get_ns(); | ||
240 | return 0; | ||
241 | } | ||
242 | |||
243 | cur_ts = bpf_ktime_get_ns(); | ||
244 | delta = cur_ts - *pts; | ||
245 | *pts = cur_ts; | ||
246 | |||
247 | /* When the CPU is idle, bail out to skip pstate statistics */ | ||
248 | if (*cstate != (u32)(-1)) | ||
249 | return 0; | ||
250 | |||
251 | /* | ||
252 | * The cpu changes to a different OPP (in the diagram below the | ||
253 | * frequency changes from OPP3 to OPP1); we need to record the interval | ||
254 | * for the previous frequency OPP3 and update the timestamp as the start | ||
255 | * time for the new frequency OPP1. | ||
256 | * | ||
257 | * OPP3 | ||
258 | * +---------------------+ | ||
259 | * OPP2 | | | ||
260 | * ---------+ | | ||
261 | * | OPP1 | ||
262 | * +--------------- | ||
263 | * | ||
264 | * |<- pstate duration ->| | ||
265 | * ^ ^ | ||
266 | * pts cur_ts | ||
267 | */ | ||
268 | pstate_idx = find_cpu_pstate_idx(*pstate); | ||
269 | if (pstate_idx >= MAX_PSTATE_ENTRIES) | ||
270 | return 0; | ||
271 | |||
272 | key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; | ||
273 | val = bpf_map_lookup_elem(&pstate_duration, &key); | ||
274 | if (val) | ||
275 | __sync_fetch_and_add((long *)val, delta); | ||
276 | |||
277 | return 0; | ||
278 | } | ||
279 | |||
280 | char _license[] SEC("license") = "GPL"; | ||
281 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c new file mode 100644 index 000000000..96675985e --- /dev/null +++ b/samples/bpf/cpustat_user.c | |||
@@ -0,0 +1,252 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | #define _GNU_SOURCE | ||
4 | #include <errno.h> | ||
5 | #include <stdio.h> | ||
6 | #include <stdlib.h> | ||
7 | #include <signal.h> | ||
8 | #include <sched.h> | ||
9 | #include <string.h> | ||
10 | #include <unistd.h> | ||
11 | #include <fcntl.h> | ||
12 | #include <locale.h> | ||
13 | #include <sys/types.h> | ||
14 | #include <sys/stat.h> | ||
15 | #include <sys/time.h> | ||
16 | #include <sys/resource.h> | ||
17 | #include <sys/wait.h> | ||
18 | |||
19 | #include <bpf/bpf.h> | ||
20 | #include <bpf/libbpf.h> | ||
21 | |||
22 | static int cstate_map_fd, pstate_map_fd; | ||
23 | |||
24 | #define MAX_CPU 8 | ||
25 | #define MAX_PSTATE_ENTRIES 5 | ||
26 | #define MAX_CSTATE_ENTRIES 3 | ||
27 | #define MAX_STARS 40 | ||
28 | |||
29 | #define CPUFREQ_MAX_SYSFS_PATH "/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq" | ||
30 | #define CPUFREQ_LOWEST_FREQ "208000" | ||
31 | #define CPUFREQ_HIGHEST_FREQ "12000000" | ||
32 | |||
33 | struct cpu_stat_data { | ||
34 | unsigned long cstate[MAX_CSTATE_ENTRIES]; | ||
35 | unsigned long pstate[MAX_PSTATE_ENTRIES]; | ||
36 | }; | ||
37 | |||
38 | static struct cpu_stat_data stat_data[MAX_CPU]; | ||
39 | |||
40 | static void cpu_stat_print(void) | ||
41 | { | ||
42 | int i, j; | ||
43 | char state_str[sizeof("cstate-9")]; | ||
44 | struct cpu_stat_data *data; | ||
45 | |||
46 | /* Clear screen */ | ||
47 | printf("\033[2J"); | ||
48 | |||
49 | /* Header */ | ||
50 | printf("\nCPU states statistics:\n"); | ||
51 | printf("%-10s ", "state(ms)"); | ||
52 | |||
53 | for (i = 0; i < MAX_CSTATE_ENTRIES; i++) { | ||
54 | sprintf(state_str, "cstate-%d", i); | ||
55 | printf("%-11s ", state_str); | ||
56 | } | ||
57 | |||
58 | for (i = 0; i < MAX_PSTATE_ENTRIES; i++) { | ||
59 | sprintf(state_str, "pstate-%d", i); | ||
60 | printf("%-11s ", state_str); | ||
61 | } | ||
62 | |||
63 | printf("\n"); | ||
64 | |||
65 | for (j = 0; j < MAX_CPU; j++) { | ||
66 | data = &stat_data[j]; | ||
67 | |||
68 | printf("CPU-%-6d ", j); | ||
69 | for (i = 0; i < MAX_CSTATE_ENTRIES; i++) | ||
70 | printf("%-11ld ", data->cstate[i] / 1000000); | ||
71 | |||
72 | for (i = 0; i < MAX_PSTATE_ENTRIES; i++) | ||
73 | printf("%-11ld ", data->pstate[i] / 1000000); | ||
74 | |||
75 | printf("\n"); | ||
76 | } | ||
77 | } | ||
78 | |||
79 | static void cpu_stat_update(int cstate_fd, int pstate_fd) | ||
80 | { | ||
81 | unsigned long key, value; | ||
82 | int c, i; | ||
83 | |||
84 | for (c = 0; c < MAX_CPU; c++) { | ||
85 | for (i = 0; i < MAX_CSTATE_ENTRIES; i++) { | ||
86 | key = c * MAX_CSTATE_ENTRIES + i; | ||
87 | bpf_map_lookup_elem(cstate_fd, &key, &value); | ||
88 | stat_data[c].cstate[i] = value; | ||
89 | } | ||
90 | |||
91 | for (i = 0; i < MAX_PSTATE_ENTRIES; i++) { | ||
92 | key = c * MAX_PSTATE_ENTRIES + i; | ||
93 | bpf_map_lookup_elem(pstate_fd, &key, &value); | ||
94 | stat_data[c].pstate[i] = value; | ||
95 | } | ||
96 | } | ||
97 | } | ||
98 | |||
99 | /* | ||
100 | * This function is copied from 'idlestat' tool function | ||
101 | * idlestat_wake_all() in idlestate.c. | ||
102 | * | ||
103 | * It sets the running task's affinity to the cpus one by one so it can wake up | ||
104 | * each specific CPU to handle scheduling; as a result all cpus are | ||
105 | * woken up once and produce the ftrace event 'trace_cpu_idle'. | ||
106 | */ | ||
107 | static int cpu_stat_inject_cpu_idle_event(void) | ||
108 | { | ||
109 | int rcpu, i, ret; | ||
110 | cpu_set_t cpumask; | ||
111 | cpu_set_t original_cpumask; | ||
112 | |||
113 | ret = sysconf(_SC_NPROCESSORS_CONF); | ||
114 | if (ret < 0) | ||
115 | return -1; | ||
116 | |||
117 | rcpu = sched_getcpu(); | ||
118 | if (rcpu < 0) | ||
119 | return -1; | ||
120 | |||
121 | /* Keep track of the CPUs we will run on */ | ||
122 | sched_getaffinity(0, sizeof(original_cpumask), &original_cpumask); | ||
123 | |||
124 | for (i = 0; i < ret; i++) { | ||
125 | |||
126 | /* Pointless to wake up ourself */ | ||
127 | if (i == rcpu) | ||
128 | continue; | ||
129 | |||
130 | /* Pointless to wake CPUs we will not run on */ | ||
131 | if (!CPU_ISSET(i, &original_cpumask)) | ||
132 | continue; | ||
133 | |||
134 | CPU_ZERO(&cpumask); | ||
135 | CPU_SET(i, &cpumask); | ||
136 | |||
137 | sched_setaffinity(0, sizeof(cpumask), &cpumask); | ||
138 | } | ||
139 | |||
140 | /* Enable all the CPUs of the original mask */ | ||
141 | sched_setaffinity(0, sizeof(original_cpumask), &original_cpumask); | ||
142 | return 0; | ||
143 | } | ||
144 | |||
145 | /* | ||
146 | * It's possible that there is no frequency change for a long time, so no | ||
147 | * ftrace event 'trace_cpu_frequency' is produced for a long period; this | ||
148 | * introduces a big deviation in the pstate statistics. | ||
149 | * | ||
150 | * To solve this issue, the code below forces 'scaling_max_freq' to 208MHz | ||
151 | * to trigger the ftrace event 'trace_cpu_frequency' and then restores the | ||
152 | * maximum frequency value of 1.2GHz. | ||
153 | */ | ||
154 | static int cpu_stat_inject_cpu_frequency_event(void) | ||
155 | { | ||
156 | int len, fd; | ||
157 | |||
158 | fd = open(CPUFREQ_MAX_SYSFS_PATH, O_WRONLY); | ||
159 | if (fd < 0) { | ||
160 | printf("failed to open scaling_max_freq, errno=%d\n", errno); | ||
161 | return fd; | ||
162 | } | ||
163 | |||
164 | len = write(fd, CPUFREQ_LOWEST_FREQ, strlen(CPUFREQ_LOWEST_FREQ)); | ||
165 | if (len < 0) { | ||
166 | printf("failed to write scaling_max_freq, errno=%d\n", errno); | ||
167 | goto err; | ||
168 | } | ||
169 | |||
170 | len = write(fd, CPUFREQ_HIGHEST_FREQ, strlen(CPUFREQ_HIGHEST_FREQ)); | ||
171 | if (len < 0) { | ||
172 | printf("failed to write scaling_max_freq, errno=%d\n", errno); | ||
173 | goto err; | ||
174 | } | ||
175 | |||
176 | err: | ||
177 | close(fd); | ||
178 | return len; | ||
179 | } | ||
180 | |||
181 | static void int_exit(int sig) | ||
182 | { | ||
183 | cpu_stat_inject_cpu_idle_event(); | ||
184 | cpu_stat_inject_cpu_frequency_event(); | ||
185 | cpu_stat_update(cstate_map_fd, pstate_map_fd); | ||
186 | cpu_stat_print(); | ||
187 | exit(0); | ||
188 | } | ||
189 | |||
190 | int main(int argc, char **argv) | ||
191 | { | ||
192 | struct bpf_link *link = NULL; | ||
193 | struct bpf_program *prog; | ||
194 | struct bpf_object *obj; | ||
195 | char filename[256]; | ||
196 | int ret; | ||
197 | |||
198 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
199 | obj = bpf_object__open_file(filename, NULL); | ||
200 | if (libbpf_get_error(obj)) { | ||
201 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
202 | return 0; | ||
203 | } | ||
204 | |||
205 | prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); | ||
206 | if (!prog) { | ||
207 | printf("finding a prog in obj file failed\n"); | ||
208 | goto cleanup; | ||
209 | } | ||
210 | |||
211 | /* load BPF program */ | ||
212 | if (bpf_object__load(obj)) { | ||
213 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
214 | goto cleanup; | ||
215 | } | ||
216 | |||
217 | cstate_map_fd = bpf_object__find_map_fd_by_name(obj, "cstate_duration"); | ||
218 | pstate_map_fd = bpf_object__find_map_fd_by_name(obj, "pstate_duration"); | ||
219 | if (cstate_map_fd < 0 || pstate_map_fd < 0) { | ||
220 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
221 | goto cleanup; | ||
222 | } | ||
223 | |||
224 | link = bpf_program__attach(prog); | ||
225 | if (libbpf_get_error(link)) { | ||
226 | fprintf(stderr, "ERROR: bpf_program__attach failed\n"); | ||
227 | link = NULL; | ||
228 | goto cleanup; | ||
229 | } | ||
230 | |||
231 | ret = cpu_stat_inject_cpu_idle_event(); | ||
232 | if (ret < 0) | ||
233 | return 1; | ||
234 | |||
235 | ret = cpu_stat_inject_cpu_frequency_event(); | ||
236 | if (ret < 0) | ||
237 | return 1; | ||
238 | |||
239 | signal(SIGINT, int_exit); | ||
240 | signal(SIGTERM, int_exit); | ||
241 | |||
242 | while (1) { | ||
243 | cpu_stat_update(cstate_map_fd, pstate_map_fd); | ||
244 | cpu_stat_print(); | ||
245 | sleep(5); | ||
246 | } | ||
247 | |||
248 | cleanup: | ||
249 | bpf_link__destroy(link); | ||
250 | bpf_object__close(obj); | ||
251 | return 0; | ||
252 | } | ||
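main() above attaches only the program named bpf_prog1, although the object also contains bpf_prog2 for the cpu_frequency tracepoint. A hypothetical variant that attaches every program found in the object via libbpf's iteration macro could look like the sketch below (error handling trimmed; not part of the patch).

/* Hypothetical helper: attach every program in the object, not just one. */
#include <bpf/libbpf.h>

static int attach_all(struct bpf_object *obj, struct bpf_link **links, int max)
{
	struct bpf_program *prog;
	int n = 0;

	bpf_object__for_each_program(prog, obj) {
		if (n >= max)
			return -1;
		links[n] = bpf_program__attach(prog);
		if (libbpf_get_error(links[n]))
			return -1;	/* caller destroys links[0..n-1] */
		n++;
	}
	return n;	/* number of links created */
}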
diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh new file mode 100755 index 000000000..ffe4c0607 --- /dev/null +++ b/samples/bpf/do_hbm_test.sh | |||
@@ -0,0 +1,442 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | # | ||
4 | # Copyright (c) 2019 Facebook | ||
5 | # | ||
6 | # This program is free software; you can redistribute it and/or | ||
7 | # modify it under the terms of version 2 of the GNU General Public | ||
8 | # License as published by the Free Software Foundation. | ||
9 | |||
10 | Usage() { | ||
11 | echo "Script for testing HBM (Host Bandwidth Manager) framework." | ||
12 | echo "It creates a cgroup to use for testing and loads a BPF program to limit" | ||
13 | echo "egress or ingress bandwidth. It then uses iperf3 or netperf to create" | ||
14 | echo "loads. The output is the goodput in Mbps (unless -D was used)." | ||
15 | echo "" | ||
16 | echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]" | ||
17 | echo " [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E] [--edt]" | ||
18 | echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id>]" | ||
19 | echo " [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]" | ||
20 | echo " [-q=<qdisc>] [-R] [-s=<server>|--server=<server>]" | ||
21 | echo " [-S|--stats] [-t=<time>|--time=<time>] [-w] [cubic|dctcp]" | ||
22 | echo " Where:" | ||
23 | echo " out egress (default)" | ||
24 | echo " -b or --bpf BPF program filename to load and attach." | ||
25 | echo " Default is hbm_out_kern.o for egress," | ||
26 | echo " -c or -cc TCP congestion control (cubic or dctcp)" | ||
27 | echo " --debug print BPF trace buffer" | ||
28 | echo " -d or --delay add a delay in ms using netem" | ||
29 | echo " -D In addition to the goodput in Mbps, it also outputs" | ||
30 | echo " other detailed information. This information is" | ||
31 | echo " test dependent (i.e. iperf3 or netperf)." | ||
32 | echo " -E enable ECN (not required for dctcp)" | ||
33 | echo " --edt use fq's Earliest Departure Time (requires fq)" | ||
34 | echo " -f or --flows number of concurrent flows (default=1)" | ||
35 | echo " -i or --id cgroup id (an integer, default is 1)" | ||
36 | echo " -N use netperf instead of iperf3" | ||
37 | echo " --no_cn Do not return CN notifications" | ||
38 | echo " -l do not limit flows using loopback" | ||
39 | echo " -h Help" | ||
40 | echo " -p or --port iperf3 port (default is 5201)" | ||
41 | echo " -P use an iperf3 instance for each flow" | ||
42 | echo " -q use the specified qdisc" | ||
43 | echo " -r or --rate rate in Mbps (default is 1Gbps)" | ||
44 | echo " -R Use TCP_RR for netperf. 1st flow has req" | ||
45 | echo " size of 10KB, rest of 1MB. Reply in all" | ||
46 | echo " cases is 1 byte." | ||
47 | echo " More detailed output for each flow can be found" | ||
48 | echo " in the files netperf.<cg>.<flow>, where <cg> is the" | ||
49 | echo " cgroup id as specified with the -i flag, and <flow>" | ||
50 | echo " is the flow id starting at 1 and increasing by 1 for" | ||
51 | echo " flow (as specified by -f)." | ||
52 | echo " -s or --server hostname of netperf server. Used to create netperf" | ||
53 | echo " test traffic between to hosts (default is within host)" | ||
54 | echo " netserver must be running on the host." | ||
55 | echo " -S or --stats whether to update hbm stats (default is yes)." | ||
56 | echo " -t or --time duration of iperf3 in seconds (default=5)" | ||
57 | echo " -w Work conserving flag. cgroup can increase its" | ||
58 | echo " bandwidth beyond the rate limit specified" | ||
59 | echo " while there is available bandwidth. Current" | ||
60 | echo " implementation assumes there is only one NIC" | ||
61 | echo " (eth0), but can be extended to support multiple" | ||
62 | echo " NICs." | ||
63 | echo " cubic or dctcp specify which TCP CC to use" | ||
64 | echo " " | ||
65 | exit | ||
66 | } | ||
67 | |||
68 | #set -x | ||
69 | |||
70 | debug_flag=0 | ||
71 | args="$@" | ||
72 | name="$0" | ||
73 | netem=0 | ||
74 | cc=x | ||
75 | dir="-o" | ||
76 | dir_name="out" | ||
77 | dur=5 | ||
78 | flows=1 | ||
79 | id=1 | ||
80 | prog="" | ||
81 | port=5201 | ||
82 | rate=1000 | ||
83 | multi_iperf=0 | ||
84 | flow_cnt=1 | ||
85 | use_netperf=0 | ||
86 | rr=0 | ||
87 | ecn=0 | ||
88 | details=0 | ||
89 | server="" | ||
90 | qdisc="" | ||
91 | flags="" | ||
92 | do_stats=0 | ||
93 | |||
94 | function start_hbm () { | ||
95 | rm -f hbm.out | ||
96 | echo "./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog" > hbm.out | ||
97 | echo " " >> hbm.out | ||
98 | ./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog >> hbm.out 2>&1 & | ||
99 | echo $! | ||
100 | } | ||
101 | |||
102 | processArgs () { | ||
103 | for i in $args ; do | ||
104 | case $i in | ||
105 | # Support for upcoming ingress rate limiting | ||
106 | #in) # support for upcoming ingress rate limiting | ||
107 | # dir="-i" | ||
108 | # dir_name="in" | ||
109 | # ;; | ||
110 | out) | ||
111 | dir="-o" | ||
112 | dir_name="out" | ||
113 | ;; | ||
114 | -b=*|--bpf=*) | ||
115 | prog="${i#*=}" | ||
116 | ;; | ||
117 | -c=*|--cc=*) | ||
118 | cc="${i#*=}" | ||
119 | ;; | ||
120 | --no_cn) | ||
121 | flags="$flags --no_cn" | ||
122 | ;; | ||
123 | --debug) | ||
124 | flags="$flags -d" | ||
125 | debug_flag=1 | ||
126 | ;; | ||
127 | -d=*|--delay=*) | ||
128 | netem="${i#*=}" | ||
129 | ;; | ||
130 | -D) | ||
131 | details=1 | ||
132 | ;; | ||
133 | -E) | ||
134 | ecn=1 | ||
135 | ;; | ||
136 | --edt) | ||
137 | flags="$flags --edt" | ||
138 | qdisc="fq" | ||
139 | ;; | ||
140 | -f=*|--flows=*) | ||
141 | flows="${i#*=}" | ||
142 | ;; | ||
143 | -i=*|--id=*) | ||
144 | id="${i#*=}" | ||
145 | ;; | ||
146 | -l) | ||
147 | flags="$flags -l" | ||
148 | ;; | ||
149 | -N) | ||
150 | use_netperf=1 | ||
151 | ;; | ||
152 | -p=*|--port=*) | ||
153 | port="${i#*=}" | ||
154 | ;; | ||
155 | -P) | ||
156 | multi_iperf=1 | ||
157 | ;; | ||
158 | -q=*) | ||
159 | qdisc="${i#*=}" | ||
160 | ;; | ||
161 | -r=*|--rate=*) | ||
162 | rate="${i#*=}" | ||
163 | ;; | ||
164 | -R) | ||
165 | rr=1 | ||
166 | ;; | ||
167 | -s=*|--server=*) | ||
168 | server="${i#*=}" | ||
169 | ;; | ||
170 | -S|--stats) | ||
171 | flags="$flags -s" | ||
172 | do_stats=1 | ||
173 | ;; | ||
174 | -t=*|--time=*) | ||
175 | dur="${i#*=}" | ||
176 | ;; | ||
177 | -w) | ||
178 | flags="$flags -w" | ||
179 | ;; | ||
180 | cubic) | ||
181 | cc=cubic | ||
182 | ;; | ||
183 | dctcp) | ||
184 | cc=dctcp | ||
185 | ;; | ||
186 | *) | ||
187 | echo "Unknown arg:$i" | ||
188 | Usage | ||
189 | ;; | ||
190 | esac | ||
191 | done | ||
192 | } | ||
193 | |||
194 | processArgs | ||
195 | |||
196 | if [ $debug_flag -eq 1 ] ; then | ||
197 | rm -f hbm_out.log | ||
198 | fi | ||
199 | |||
200 | hbm_pid=$(start_hbm) | ||
201 | usleep 100000 | ||
202 | |||
203 | host=`hostname` | ||
204 | cg_base_dir=/sys/fs/cgroup | ||
205 | cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id" | ||
206 | |||
207 | echo $$ >> $cg_dir/cgroup.procs | ||
208 | |||
209 | ulimit -l unlimited | ||
210 | |||
211 | rm -f ss.out | ||
212 | rm -f hbm.[0-9]*.$dir_name | ||
213 | if [ $ecn -ne 0 ] ; then | ||
214 | sysctl -w -q -n net.ipv4.tcp_ecn=1 | ||
215 | fi | ||
216 | |||
217 | if [ $use_netperf -eq 0 ] ; then | ||
218 | cur_cc=`sysctl -n net.ipv4.tcp_congestion_control` | ||
219 | if [ "$cc" != "x" ] ; then | ||
220 | sysctl -w -q -n net.ipv4.tcp_congestion_control=$cc | ||
221 | fi | ||
222 | fi | ||
223 | |||
224 | if [ "$netem" -ne "0" ] ; then | ||
225 | if [ "$qdisc" != "" ] ; then | ||
226 | echo "WARNING: Ignoring -q options because -d option used" | ||
227 | fi | ||
228 | tc qdisc del dev lo root > /dev/null 2>&1 | ||
229 | tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1 | ||
230 | elif [ "$qdisc" != "" ] ; then | ||
231 | tc qdisc del dev eth0 root > /dev/null 2>&1 | ||
232 | tc qdisc add dev eth0 root $qdisc > /dev/null 2>&1 | ||
233 | fi | ||
234 | |||
235 | n=0 | ||
236 | m=$[$dur * 5] | ||
237 | hn="::1" | ||
238 | if [ $use_netperf -ne 0 ] ; then | ||
239 | if [ "$server" != "" ] ; then | ||
240 | hn=$server | ||
241 | fi | ||
242 | fi | ||
243 | |||
244 | ( ping6 -i 0.2 -c $m $hn > ping.out 2>&1 ) & | ||
245 | |||
246 | if [ $use_netperf -ne 0 ] ; then | ||
247 | begNetserverPid=`ps ax | grep netserver | grep --invert-match "grep" | \ | ||
248 | awk '{ print $1 }'` | ||
249 | if [ "$begNetserverPid" == "" ] ; then | ||
250 | if [ "$server" == "" ] ; then | ||
251 | ( ./netserver > /dev/null 2>&1) & | ||
252 | usleep 100000 | ||
253 | fi | ||
254 | fi | ||
255 | flow_cnt=1 | ||
256 | if [ "$server" == "" ] ; then | ||
257 | np_server=$host | ||
258 | else | ||
259 | np_server=$server | ||
260 | fi | ||
261 | if [ "$cc" == "x" ] ; then | ||
262 | np_cc="" | ||
263 | else | ||
264 | np_cc="-K $cc,$cc" | ||
265 | fi | ||
266 | replySize=1 | ||
267 | while [ $flow_cnt -le $flows ] ; do | ||
268 | if [ $rr -ne 0 ] ; then | ||
269 | reqSize=1M | ||
270 | if [ $flow_cnt -eq 1 ] ; then | ||
271 | reqSize=10K | ||
272 | fi | ||
273 | if [ "$dir" == "-i" ] ; then | ||
274 | replySize=$reqSize | ||
275 | reqSize=1 | ||
276 | fi | ||
277 | ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r $reqSize,$replySize $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,REMOTE_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,LOCAL_RECV_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & | ||
278 | else | ||
279 | if [ "$dir" == "-i" ] ; then | ||
280 | ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r 1,10M $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REMOTE_TRANSPORT_RETRANS,REMOTE_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & | ||
281 | else | ||
282 | ( ./netperf -H $np_server -l $dur -f m -j -t TCP_STREAM -- $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & | ||
283 | fi | ||
284 | fi | ||
285 | flow_cnt=$[flow_cnt+1] | ||
286 | done | ||
287 | |||
288 | # sleep for duration of test (plus some buffer) | ||
289 | n=$[dur+2] | ||
290 | sleep $n | ||
291 | |||
292 | # force graceful termination of netperf | ||
293 | pids=`pgrep netperf` | ||
294 | for p in $pids ; do | ||
295 | kill -SIGALRM $p | ||
296 | done | ||
297 | |||
298 | flow_cnt=1 | ||
299 | rate=0 | ||
300 | if [ $details -ne 0 ] ; then | ||
301 | echo "" | ||
302 | echo "Details for HBM in cgroup $id" | ||
303 | if [ $do_stats -eq 1 ] ; then | ||
304 | if [ -e hbm.$id.$dir_name ] ; then | ||
305 | cat hbm.$id.$dir_name | ||
306 | fi | ||
307 | fi | ||
308 | fi | ||
309 | while [ $flow_cnt -le $flows ] ; do | ||
310 | if [ "$dir" == "-i" ] ; then | ||
311 | r=`cat netperf.$id.$flow_cnt | grep -o "REMOTE_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"` | ||
312 | else | ||
313 | r=`cat netperf.$id.$flow_cnt | grep -o "LOCAL_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"` | ||
314 | fi | ||
315 | echo "rate for flow $flow_cnt: $r" | ||
316 | rate=$[rate+r] | ||
317 | if [ $details -ne 0 ] ; then | ||
318 | echo "-----" | ||
319 | echo "Details for cgroup $id, flow $flow_cnt" | ||
320 | cat netperf.$id.$flow_cnt | ||
321 | fi | ||
322 | flow_cnt=$[flow_cnt+1] | ||
323 | done | ||
324 | if [ $details -ne 0 ] ; then | ||
325 | echo "" | ||
326 | delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` | ||
327 | echo "PING AVG DELAY:$delay" | ||
328 | echo "AGGREGATE_GOODPUT:$rate" | ||
329 | else | ||
330 | echo $rate | ||
331 | fi | ||
332 | elif [ $multi_iperf -eq 0 ] ; then | ||
333 | (iperf3 -s -p $port -1 > /dev/null 2>&1) & | ||
334 | usleep 100000 | ||
335 | iperf3 -c $host -p $port -i 0 -P $flows -f m -t $dur > iperf.$id | ||
336 | rates=`grep receiver iperf.$id | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*"` | ||
337 | rate=`echo $rates | grep -o "[0-9]*$"` | ||
338 | |||
339 | if [ $details -ne 0 ] ; then | ||
340 | echo "" | ||
341 | echo "Details for HBM in cgroup $id" | ||
342 | if [ $do_stats -eq 1 ] ; then | ||
343 | if [ -e hbm.$id.$dir_name ] ; then | ||
344 | cat hbm.$id.$dir_name | ||
345 | fi | ||
346 | fi | ||
347 | delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` | ||
348 | echo "PING AVG DELAY:$delay" | ||
349 | echo "AGGREGATE_GOODPUT:$rate" | ||
350 | else | ||
351 | echo $rate | ||
352 | fi | ||
353 | else | ||
354 | flow_cnt=1 | ||
355 | while [ $flow_cnt -le $flows ] ; do | ||
356 | (iperf3 -s -p $port -1 > /dev/null 2>&1) & | ||
357 | ( iperf3 -c $host -p $port -i 0 -P 1 -f m -t $dur | grep receiver | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*" | grep -o "[0-9]*$" > iperf3.$id.$flow_cnt ) & | ||
358 | port=$[port+1] | ||
359 | flow_cnt=$[flow_cnt+1] | ||
360 | done | ||
361 | n=$[dur+1] | ||
362 | sleep $n | ||
363 | flow_cnt=1 | ||
364 | rate=0 | ||
365 | if [ $details -ne 0 ] ; then | ||
366 | echo "" | ||
367 | echo "Details for HBM in cgroup $id" | ||
368 | if [ $do_stats -eq 1 ] ; then | ||
369 | if [ -e hbm.$id.$dir_name ] ; then | ||
370 | cat hbm.$id.$dir_name | ||
371 | fi | ||
372 | fi | ||
373 | fi | ||
374 | |||
375 | while [ $flow_cnt -le $flows ] ; do | ||
376 | r=`cat iperf3.$id.$flow_cnt` | ||
377 | # echo "rate for flow $flow_cnt: $r" | ||
378 | if [ $details -ne 0 ] ; then | ||
379 | echo "Rate for cgroup $id, flow $flow_cnt LOCAL_SEND_THROUGHPUT=$r" | ||
380 | fi | ||
381 | rate=$[rate+r] | ||
382 | flow_cnt=$[flow_cnt+1] | ||
383 | done | ||
384 | if [ $details -ne 0 ] ; then | ||
385 | delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` | ||
386 | echo "PING AVG DELAY:$delay" | ||
387 | echo "AGGREGATE_GOODPUT:$rate" | ||
388 | else | ||
389 | echo $rate | ||
390 | fi | ||
391 | fi | ||
392 | |||
393 | if [ $use_netperf -eq 0 ] ; then | ||
394 | sysctl -w -q -n net.ipv4.tcp_congestion_control=$cur_cc | ||
395 | fi | ||
396 | if [ $ecn -ne 0 ] ; then | ||
397 | sysctl -w -q -n net.ipv4.tcp_ecn=0 | ||
398 | fi | ||
399 | if [ "$netem" -ne "0" ] ; then | ||
400 | tc qdisc del dev lo root > /dev/null 2>&1 | ||
401 | fi | ||
402 | if [ "$qdisc" != "" ] ; then | ||
403 | tc qdisc del dev eth0 root > /dev/null 2>&1 | ||
404 | fi | ||
405 | sleep 2 | ||
406 | |||
407 | hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'` | ||
408 | if [ "$hbmPid" == "$hbm_pid" ] ; then | ||
409 | kill $hbm_pid | ||
410 | fi | ||
411 | |||
412 | sleep 1 | ||
413 | |||
414 | # Detach any BPF programs that may have lingered | ||
415 | ttx=`bpftool cgroup tree | grep hbm` | ||
416 | v=2 | ||
417 | for x in $ttx ; do | ||
418 | if [ "${x:0:36}" == "/sys/fs/cgroup/cgroup-test-work-dir/" ] ; then | ||
419 | cg=$x ; v=0 | ||
420 | else | ||
421 | if [ $v -eq 0 ] ; then | ||
422 | id=$x ; v=1 | ||
423 | else | ||
424 | if [ $v -eq 1 ] ; then | ||
425 | type=$x ; bpftool cgroup detach $cg $type id $id | ||
426 | v=0 | ||
427 | fi | ||
428 | fi | ||
429 | fi | ||
430 | done | ||
431 | |||
432 | if [ $use_netperf -ne 0 ] ; then | ||
433 | if [ "$server" == "" ] ; then | ||
434 | if [ "$begNetserverPid" == "" ] ; then | ||
435 | netserverPid=`ps ax | grep netserver | grep --invert-match "grep" | awk '{ print $1 }'` | ||
436 | if [ "$netserverPid" != "" ] ; then | ||
437 | kill $netserverPid | ||
438 | fi | ||
439 | fi | ||
440 | fi | ||
441 | fi | ||
442 | exit | ||
diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c new file mode 100644 index 000000000..59f45fef5 --- /dev/null +++ b/samples/bpf/fds_example.c | |||
@@ -0,0 +1,193 @@ | |||
1 | #include <linux/unistd.h> | ||
2 | #include <linux/bpf.h> | ||
3 | |||
4 | #include <stdio.h> | ||
5 | #include <stdlib.h> | ||
6 | #include <stdint.h> | ||
7 | #include <unistd.h> | ||
8 | #include <string.h> | ||
9 | #include <assert.h> | ||
10 | #include <errno.h> | ||
11 | |||
12 | #include <sys/types.h> | ||
13 | #include <sys/socket.h> | ||
14 | |||
15 | #include <bpf/bpf.h> | ||
16 | |||
17 | #include <bpf/libbpf.h> | ||
18 | #include "bpf_insn.h" | ||
19 | #include "sock_example.h" | ||
20 | |||
21 | #define BPF_F_PIN (1 << 0) | ||
22 | #define BPF_F_GET (1 << 1) | ||
23 | #define BPF_F_PIN_GET (BPF_F_PIN | BPF_F_GET) | ||
24 | |||
25 | #define BPF_F_KEY (1 << 2) | ||
26 | #define BPF_F_VAL (1 << 3) | ||
27 | #define BPF_F_KEY_VAL (BPF_F_KEY | BPF_F_VAL) | ||
28 | |||
29 | #define BPF_M_UNSPEC 0 | ||
30 | #define BPF_M_MAP 1 | ||
31 | #define BPF_M_PROG 2 | ||
32 | |||
33 | char bpf_log_buf[BPF_LOG_BUF_SIZE]; | ||
34 | |||
35 | static void usage(void) | ||
36 | { | ||
37 | printf("Usage: fds_example [...]\n"); | ||
38 | printf(" -F <file> File to pin/get object\n"); | ||
39 | printf(" -P |- pin object\n"); | ||
40 | printf(" -G `- get object\n"); | ||
41 | printf(" -m eBPF map mode\n"); | ||
42 | printf(" -k <key> |- map key\n"); | ||
43 | printf(" -v <value> `- map value\n"); | ||
44 | printf(" -p eBPF prog mode\n"); | ||
45 | printf(" -o <object> `- object file\n"); | ||
46 | printf(" -h Display this help.\n"); | ||
47 | } | ||
48 | |||
49 | static int bpf_map_create(void) | ||
50 | { | ||
51 | return bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(uint32_t), | ||
52 | sizeof(uint32_t), 1024, 0); | ||
53 | } | ||
54 | |||
55 | static int bpf_prog_create(const char *object) | ||
56 | { | ||
57 | static struct bpf_insn insns[] = { | ||
58 | BPF_MOV64_IMM(BPF_REG_0, 1), | ||
59 | BPF_EXIT_INSN(), | ||
60 | }; | ||
61 | size_t insns_cnt = sizeof(insns) / sizeof(struct bpf_insn); | ||
62 | struct bpf_object *obj; | ||
63 | int prog_fd; | ||
64 | |||
65 | if (object) { | ||
66 | assert(!bpf_prog_load(object, BPF_PROG_TYPE_UNSPEC, | ||
67 | &obj, &prog_fd)); | ||
68 | return prog_fd; | ||
69 | } else { | ||
70 | return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, | ||
71 | insns, insns_cnt, "GPL", 0, | ||
72 | bpf_log_buf, BPF_LOG_BUF_SIZE); | ||
73 | } | ||
74 | } | ||
75 | |||
76 | static int bpf_do_map(const char *file, uint32_t flags, uint32_t key, | ||
77 | uint32_t value) | ||
78 | { | ||
79 | int fd, ret; | ||
80 | |||
81 | if (flags & BPF_F_PIN) { | ||
82 | fd = bpf_map_create(); | ||
83 | printf("bpf: map fd:%d (%s)\n", fd, strerror(errno)); | ||
84 | assert(fd > 0); | ||
85 | |||
86 | ret = bpf_obj_pin(fd, file); | ||
87 | printf("bpf: pin ret:(%d,%s)\n", ret, strerror(errno)); | ||
88 | assert(ret == 0); | ||
89 | } else { | ||
90 | fd = bpf_obj_get(file); | ||
91 | printf("bpf: get fd:%d (%s)\n", fd, strerror(errno)); | ||
92 | assert(fd > 0); | ||
93 | } | ||
94 | |||
95 | if ((flags & BPF_F_KEY_VAL) == BPF_F_KEY_VAL) { | ||
96 | ret = bpf_map_update_elem(fd, &key, &value, 0); | ||
97 | printf("bpf: fd:%d u->(%u:%u) ret:(%d,%s)\n", fd, key, value, | ||
98 | ret, strerror(errno)); | ||
99 | assert(ret == 0); | ||
100 | } else if (flags & BPF_F_KEY) { | ||
101 | ret = bpf_map_lookup_elem(fd, &key, &value); | ||
102 | printf("bpf: fd:%d l->(%u):%u ret:(%d,%s)\n", fd, key, value, | ||
103 | ret, strerror(errno)); | ||
104 | assert(ret == 0); | ||
105 | } | ||
106 | |||
107 | return 0; | ||
108 | } | ||
109 | |||
110 | static int bpf_do_prog(const char *file, uint32_t flags, const char *object) | ||
111 | { | ||
112 | int fd, sock, ret; | ||
113 | |||
114 | if (flags & BPF_F_PIN) { | ||
115 | fd = bpf_prog_create(object); | ||
116 | printf("bpf: prog fd:%d (%s)\n", fd, strerror(errno)); | ||
117 | assert(fd > 0); | ||
118 | |||
119 | ret = bpf_obj_pin(fd, file); | ||
120 | printf("bpf: pin ret:(%d,%s)\n", ret, strerror(errno)); | ||
121 | assert(ret == 0); | ||
122 | } else { | ||
123 | fd = bpf_obj_get(file); | ||
124 | printf("bpf: get fd:%d (%s)\n", fd, strerror(errno)); | ||
125 | assert(fd > 0); | ||
126 | } | ||
127 | |||
128 | sock = open_raw_sock("lo"); | ||
129 | assert(sock > 0); | ||
130 | |||
131 | ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &fd, sizeof(fd)); | ||
132 | printf("bpf: sock:%d <- fd:%d attached ret:(%d,%s)\n", sock, fd, | ||
133 | ret, strerror(errno)); | ||
134 | assert(ret == 0); | ||
135 | |||
136 | return 0; | ||
137 | } | ||
138 | |||
139 | int main(int argc, char **argv) | ||
140 | { | ||
141 | const char *file = NULL, *object = NULL; | ||
142 | uint32_t key = 0, value = 0, flags = 0; | ||
143 | int opt, mode = BPF_M_UNSPEC; | ||
144 | |||
145 | while ((opt = getopt(argc, argv, "F:PGmk:v:po:")) != -1) { | ||
146 | switch (opt) { | ||
147 | /* General args */ | ||
148 | case 'F': | ||
149 | file = optarg; | ||
150 | break; | ||
151 | case 'P': | ||
152 | flags |= BPF_F_PIN; | ||
153 | break; | ||
154 | case 'G': | ||
155 | flags |= BPF_F_GET; | ||
156 | break; | ||
157 | /* Map-related args */ | ||
158 | case 'm': | ||
159 | mode = BPF_M_MAP; | ||
160 | break; | ||
161 | case 'k': | ||
162 | key = strtoul(optarg, NULL, 0); | ||
163 | flags |= BPF_F_KEY; | ||
164 | break; | ||
165 | case 'v': | ||
166 | value = strtoul(optarg, NULL, 0); | ||
167 | flags |= BPF_F_VAL; | ||
168 | break; | ||
169 | /* Prog-related args */ | ||
170 | case 'p': | ||
171 | mode = BPF_M_PROG; | ||
172 | break; | ||
173 | case 'o': | ||
174 | object = optarg; | ||
175 | break; | ||
176 | default: | ||
177 | goto out; | ||
178 | } | ||
179 | } | ||
180 | |||
181 | if (!(flags & BPF_F_PIN_GET) || !file) | ||
182 | goto out; | ||
183 | |||
184 | switch (mode) { | ||
185 | case BPF_M_MAP: | ||
186 | return bpf_do_map(file, flags, key, value); | ||
187 | case BPF_M_PROG: | ||
188 | return bpf_do_prog(file, flags, object); | ||
189 | } | ||
190 | out: | ||
191 | usage(); | ||
192 | return -1; | ||
193 | } | ||
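fds_example demonstrates sharing BPF objects between processes by pinning them in the BPF filesystem: -P creates an object and pins it at the path given with -F, while -G reopens the same kernel object by path through a new fd. The condensed sketch below shows that round trip in one process, using the same libbpf calls as the sample; the pin path is only an example and assumes bpffs is mounted at /sys/fs/bpf.

	/* Condensed sketch of the pin/get round trip (not part of the sample).
	 * The path is illustrative; bpffs must be mounted at /sys/fs/bpf.
	 */
	#include <assert.h>
	#include <stdint.h>
	#include <linux/bpf.h>
	#include <bpf/bpf.h>

	static void pin_get_roundtrip(void)
	{
		const char *path = "/sys/fs/bpf/fds_example_map";	/* example path */
		uint32_t key = 1, val = 42, out = 0;
		int fd, fd2;

		fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(val), 1024, 0);
		assert(fd >= 0);
		assert(bpf_obj_pin(fd, path) == 0);	/* -P: pin into bpffs */

		fd2 = bpf_obj_get(path);		/* -G: reopen by path */
		assert(fd2 >= 0);
		assert(bpf_map_update_elem(fd2, &key, &val, BPF_ANY) == 0);
		assert(bpf_map_lookup_elem(fd, &key, &out) == 0);
		assert(out == 42);			/* both fds refer to the same map */
	}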
diff --git a/samples/bpf/hash_func01.h b/samples/bpf/hash_func01.h new file mode 100644 index 000000000..38255812e --- /dev/null +++ b/samples/bpf/hash_func01.h | |||
@@ -0,0 +1,55 @@ | |||
1 | /* SPDX-License-Identifier: LGPL-2.1 | ||
2 | * | ||
3 | * Based on Paul Hsieh's (LGPL 2.1) hash function | ||
4 | * From: http://www.azillionmonkeys.com/qed/hash.html | ||
5 | */ | ||
6 | |||
7 | #define get16bits(d) (*((const __u16 *) (d))) | ||
8 | |||
9 | static __always_inline | ||
10 | __u32 SuperFastHash (const char *data, int len, __u32 initval) { | ||
11 | __u32 hash = initval; | ||
12 | __u32 tmp; | ||
13 | int rem; | ||
14 | |||
15 | if (len <= 0 || data == NULL) return 0; | ||
16 | |||
17 | rem = len & 3; | ||
18 | len >>= 2; | ||
19 | |||
20 | /* Main loop */ | ||
21 | #pragma clang loop unroll(full) | ||
22 | for (;len > 0; len--) { | ||
23 | hash += get16bits (data); | ||
24 | tmp = (get16bits (data+2) << 11) ^ hash; | ||
25 | hash = (hash << 16) ^ tmp; | ||
26 | data += 2*sizeof (__u16); | ||
27 | hash += hash >> 11; | ||
28 | } | ||
29 | |||
30 | /* Handle end cases */ | ||
31 | switch (rem) { | ||
32 | case 3: hash += get16bits (data); | ||
33 | hash ^= hash << 16; | ||
34 | hash ^= ((signed char)data[sizeof (__u16)]) << 18; | ||
35 | hash += hash >> 11; | ||
36 | break; | ||
37 | case 2: hash += get16bits (data); | ||
38 | hash ^= hash << 11; | ||
39 | hash += hash >> 17; | ||
40 | break; | ||
41 | case 1: hash += (signed char)*data; | ||
42 | hash ^= hash << 10; | ||
43 | hash += hash >> 1; | ||
44 | } | ||
45 | |||
46 | /* Force "avalanching" of final 127 bits */ | ||
47 | hash ^= hash << 3; | ||
48 | hash += hash >> 5; | ||
49 | hash ^= hash << 4; | ||
50 | hash += hash >> 17; | ||
51 | hash ^= hash << 25; | ||
52 | hash += hash >> 6; | ||
53 | |||
54 | return hash; | ||
55 | } | ||
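SuperFastHash is a plain, fully inlinable hash with no kernel dependencies, which is why it can be dropped straight into a BPF program; elsewhere in these samples it is used to spread flows across CPUs in the XDP cpumap redirect example. The sketch below is a hypothetical use, assuming a BPF program context where linux/types.h and bpf/bpf_helpers.h are already included (bpf_helpers.h supplies __always_inline); the flow_key layout, NUM_BUCKETS and INITVAL are illustrative, not taken from the samples.

	/* Hypothetical usage sketch: spreading flows over N buckets. */
	#include <linux/types.h>
	#include <bpf/bpf_helpers.h>
	#include "hash_func01.h"

	#define NUM_BUCKETS 64
	#define INITVAL 15485863	/* arbitrary seed */

	struct flow_key {
		__u32 saddr;
		__u32 daddr;
		__u16 sport;
		__u16 dport;
	};

	static __always_inline __u32 flow_bucket(const struct flow_key *key)
	{
		__u32 h = SuperFastHash((const char *)key, sizeof(*key), INITVAL);

		return h % NUM_BUCKETS;	/* stable bucket for a given 4-tuple */
	}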
diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c new file mode 100644 index 000000000..ff4c533df --- /dev/null +++ b/samples/bpf/hbm.c | |||
@@ -0,0 +1,499 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* Copyright (c) 2019 Facebook | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of version 2 of the GNU General Public | ||
6 | * License as published by the Free Software Foundation. | ||
7 | * | ||
8 | * Example program for Host Bandwidth Management | ||
9 | * | ||
10 | * This program loads a cgroup skb BPF program to enforce cgroup output | ||
11 | * (egress) or input (ingress) bandwidth limits. | ||
12 | * | ||
13 | * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog] | ||
14 | * Where: | ||
15 | * -d Print BPF trace debug buffer | ||
16 | * -l Also limit flows doing loopback | ||
17 | * -n <#> To create cgroup \"/hbm#\" and attach prog | ||
18 | * Default is /hbm1 | ||
19 | * --no_cn Do not return cn notifications | ||
20 | * -r <rate> Rate limit in Mbps | ||
21 | * -s Get HBM stats (marked, dropped, etc.) | ||
22 | * -t <time> Exit after specified seconds (default is 0) | ||
23 | * -w Work conserving flag. cgroup can increase its bandwidth | ||
24 | * beyond the rate limit specified while there is available | ||
25 | * bandwidth. Current implementation assumes there is only one | ||
26 | * NIC (eth0), but can be extended to support multiple NICs. | ||
27 | * Currently only supported for egress. | ||
28 | * -h Print this info | ||
29 | * prog BPF program file name. Name defaults to hbm_out_kern.o | ||
30 | */ | ||
31 | |||
32 | #define _GNU_SOURCE | ||
33 | |||
34 | #include <stdio.h> | ||
35 | #include <stdlib.h> | ||
36 | #include <assert.h> | ||
37 | #include <sys/resource.h> | ||
38 | #include <sys/time.h> | ||
39 | #include <unistd.h> | ||
40 | #include <errno.h> | ||
41 | #include <fcntl.h> | ||
42 | #include <linux/unistd.h> | ||
43 | #include <linux/compiler.h> | ||
44 | |||
45 | #include <linux/bpf.h> | ||
46 | #include <bpf/bpf.h> | ||
47 | #include <getopt.h> | ||
48 | |||
49 | #include "bpf_load.h" | ||
50 | #include "bpf_rlimit.h" | ||
51 | #include "cgroup_helpers.h" | ||
52 | #include "hbm.h" | ||
53 | #include "bpf_util.h" | ||
54 | #include <bpf/bpf.h> | ||
55 | #include <bpf/libbpf.h> | ||
56 | |||
57 | bool outFlag = true; | ||
58 | int minRate = 1000; /* cgroup rate limit in Mbps */ | ||
59 | int rate = 1000; /* can grow if rate conserving is enabled */ | ||
60 | int dur = 1; | ||
61 | bool stats_flag; | ||
62 | bool loopback_flag; | ||
63 | bool debugFlag; | ||
64 | bool work_conserving_flag; | ||
65 | bool no_cn_flag; | ||
66 | bool edt_flag; | ||
67 | |||
68 | static void Usage(void); | ||
69 | static void read_trace_pipe2(void); | ||
70 | static void do_error(char *msg, bool errno_flag); | ||
71 | |||
72 | #define DEBUGFS "/sys/kernel/debug/tracing/" | ||
73 | |||
74 | struct bpf_object *obj; | ||
75 | int bpfprog_fd; | ||
76 | int cgroup_storage_fd; | ||
77 | |||
78 | static void read_trace_pipe2(void) | ||
79 | { | ||
80 | int trace_fd; | ||
81 | FILE *outf; | ||
82 | char *outFname = "hbm_out.log"; | ||
83 | |||
84 | trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); | ||
85 | if (trace_fd < 0) { | ||
86 | printf("Error opening trace_pipe\n"); | ||
87 | return; | ||
88 | } | ||
89 | |||
90 | // Future support of ingress | ||
91 | // if (!outFlag) | ||
92 | // outFname = "hbm_in.log"; | ||
93 | outf = fopen(outFname, "w"); | ||
94 | |||
95 | if (outf == NULL) | ||
96 | printf("Error creating %s\n", outFname); | ||
97 | |||
98 | while (1) { | ||
99 | static char buf[4097]; | ||
100 | ssize_t sz; | ||
101 | |||
102 | sz = read(trace_fd, buf, sizeof(buf) - 1); | ||
103 | if (sz > 0) { | ||
104 | buf[sz] = 0; | ||
105 | puts(buf); | ||
106 | if (outf != NULL) { | ||
107 | fprintf(outf, "%s\n", buf); | ||
108 | fflush(outf); | ||
109 | } | ||
110 | } | ||
111 | } | ||
112 | } | ||
113 | |||
114 | static void do_error(char *msg, bool errno_flag) | ||
115 | { | ||
116 | if (errno_flag) | ||
117 | printf("ERROR: %s, errno: %d\n", msg, errno); | ||
118 | else | ||
119 | printf("ERROR: %s\n", msg); | ||
120 | exit(1); | ||
121 | } | ||
122 | |||
123 | static int prog_load(char *prog) | ||
124 | { | ||
125 | struct bpf_prog_load_attr prog_load_attr = { | ||
126 | .prog_type = BPF_PROG_TYPE_CGROUP_SKB, | ||
127 | .file = prog, | ||
128 | .expected_attach_type = BPF_CGROUP_INET_EGRESS, | ||
129 | }; | ||
130 | int map_fd; | ||
131 | struct bpf_map *map; | ||
132 | |||
133 | int ret = 0; | ||
134 | |||
135 | if (access(prog, O_RDONLY) < 0) { | ||
136 | printf("Error accessing file %s: %s\n", prog, strerror(errno)); | ||
137 | return 1; | ||
138 | } | ||
139 | if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd)) | ||
140 | ret = 1; | ||
141 | if (!ret) { | ||
142 | map = bpf_object__find_map_by_name(obj, "queue_stats"); | ||
143 | map_fd = bpf_map__fd(map); | ||
144 | if (map_fd < 0) { | ||
145 | printf("Map not found: %s\n", strerror(map_fd)); | ||
146 | ret = 1; | ||
147 | } | ||
148 | } | ||
149 | |||
150 | if (ret) { | ||
151 | printf("ERROR: bpf_prog_load_xattr failed for: %s\n", prog); | ||
152 | printf(" Output from verifier:\n%s\n------\n", bpf_log_buf); | ||
153 | ret = -1; | ||
154 | } else { | ||
155 | ret = map_fd; | ||
156 | } | ||
157 | |||
158 | return ret; | ||
159 | } | ||
160 | |||
161 | static int run_bpf_prog(char *prog, int cg_id) | ||
162 | { | ||
163 | int map_fd; | ||
164 | int rc = 0; | ||
165 | int key = 0; | ||
166 | int cg1 = 0; | ||
167 | int type = BPF_CGROUP_INET_EGRESS; | ||
168 | char cg_dir[100]; | ||
169 | struct hbm_queue_stats qstats = {0}; | ||
170 | |||
171 | sprintf(cg_dir, "/hbm%d", cg_id); | ||
172 | map_fd = prog_load(prog); | ||
173 | if (map_fd == -1) | ||
174 | return 1; | ||
175 | |||
176 | if (setup_cgroup_environment()) { | ||
177 | printf("ERROR: setting cgroup environment\n"); | ||
178 | goto err; | ||
179 | } | ||
180 | cg1 = create_and_get_cgroup(cg_dir); | ||
181 | if (!cg1) { | ||
182 | printf("ERROR: create_and_get_cgroup\n"); | ||
183 | goto err; | ||
184 | } | ||
185 | if (join_cgroup(cg_dir)) { | ||
186 | printf("ERROR: join_cgroup\n"); | ||
187 | goto err; | ||
188 | } | ||
189 | |||
190 | qstats.rate = rate; | ||
191 | qstats.stats = stats_flag ? 1 : 0; | ||
192 | qstats.loopback = loopback_flag ? 1 : 0; | ||
193 | qstats.no_cn = no_cn_flag ? 1 : 0; | ||
194 | if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) { | ||
195 | printf("ERROR: Could not update map element\n"); | ||
196 | goto err; | ||
197 | } | ||
198 | |||
199 | if (!outFlag) | ||
200 | type = BPF_CGROUP_INET_INGRESS; | ||
201 | if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) { | ||
202 | printf("ERROR: bpf_prog_attach fails!\n"); | ||
203 | log_err("Attaching prog"); | ||
204 | goto err; | ||
205 | } | ||
206 | |||
207 | if (work_conserving_flag) { | ||
208 | struct timeval t0, t_last, t_new; | ||
209 | FILE *fin; | ||
210 | unsigned long long last_eth_tx_bytes, new_eth_tx_bytes; | ||
211 | signed long long last_cg_tx_bytes, new_cg_tx_bytes; | ||
212 | signed long long delta_time, delta_bytes, delta_rate; | ||
213 | int delta_ms; | ||
214 | #define DELTA_RATE_CHECK 10000 /* in us */ | ||
215 | #define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ | ||
216 | |||
217 | bpf_map_lookup_elem(map_fd, &key, &qstats); | ||
218 | if (gettimeofday(&t0, NULL) < 0) | ||
219 | do_error("gettimeofday failed", true); | ||
220 | t_last = t0; | ||
221 | fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r"); | ||
222 | if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1) | ||
223 | do_error("fscanf fails", false); | ||
224 | fclose(fin); | ||
225 | last_cg_tx_bytes = qstats.bytes_total; | ||
226 | while (true) { | ||
227 | usleep(DELTA_RATE_CHECK); | ||
228 | if (gettimeofday(&t_new, NULL) < 0) | ||
229 | do_error("gettimeofday failed", true); | ||
230 | delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 + | ||
231 | (t_new.tv_usec - t0.tv_usec)/1000; | ||
232 | if (delta_ms > dur * 1000) | ||
233 | break; | ||
234 | delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 + | ||
235 | (t_new.tv_usec - t_last.tv_usec); | ||
236 | if (delta_time == 0) | ||
237 | continue; | ||
238 | t_last = t_new; | ||
239 | fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", | ||
240 | "r"); | ||
241 | if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1) | ||
242 | do_error("fscanf fails", false); | ||
243 | fclose(fin); | ||
244 | printf(" new_eth_tx_bytes:%llu\n", | ||
245 | new_eth_tx_bytes); | ||
246 | bpf_map_lookup_elem(map_fd, &key, &qstats); | ||
247 | new_cg_tx_bytes = qstats.bytes_total; | ||
248 | delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; | ||
249 | last_eth_tx_bytes = new_eth_tx_bytes; | ||
250 | delta_rate = (delta_bytes * 8000000) / delta_time; | ||
251 | printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps", | ||
252 | delta_ms, delta_rate/1000000000.0, | ||
253 | rate/1000.0); | ||
254 | if (delta_rate < RATE_THRESHOLD) { | ||
255 | /* can increase cgroup rate limit, but first | ||
256 | * check if we are using the current limit. | ||
257 | * Currently increasing by 6.25%, unknown | ||
258 | * if that is the optimal rate. | ||
259 | */ | ||
260 | int rate_diff100; | ||
261 | |||
262 | delta_bytes = new_cg_tx_bytes - | ||
263 | last_cg_tx_bytes; | ||
264 | last_cg_tx_bytes = new_cg_tx_bytes; | ||
265 | delta_rate = (delta_bytes * 8000000) / | ||
266 | delta_time; | ||
267 | printf(" rate:%.3fGbps", | ||
268 | delta_rate/1000000000.0); | ||
269 | rate_diff100 = (((long long)rate)*1000000 - | ||
270 | delta_rate) * 100 / | ||
271 | (((long long) rate) * 1000000); | ||
272 | printf(" rdiff:%d", rate_diff100); | ||
273 | if (rate_diff100 <= 3) { | ||
274 | rate += (rate >> 4); | ||
275 | if (rate > RATE_THRESHOLD / 1000000) | ||
276 | rate = RATE_THRESHOLD / 1000000; | ||
277 | qstats.rate = rate; | ||
278 | printf(" INC\n"); | ||
279 | } else { | ||
280 | printf("\n"); | ||
281 | } | ||
282 | } else { | ||
283 | /* Need to decrease cgroup rate limit. | ||
284 | * Currently decreasing by 12.5%, unknown | ||
285 | * if that is optimal | ||
286 | */ | ||
287 | printf(" DEC\n"); | ||
288 | rate -= (rate >> 3); | ||
289 | if (rate < minRate) | ||
290 | rate = minRate; | ||
291 | qstats.rate = rate; | ||
292 | } | ||
293 | if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) | ||
294 | do_error("update map element fails", false); | ||
295 | } | ||
296 | } else { | ||
297 | sleep(dur); | ||
298 | } | ||
299 | // Get stats! | ||
300 | if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) { | ||
301 | char fname[100]; | ||
302 | FILE *fout; | ||
303 | |||
304 | if (!outFlag) | ||
305 | sprintf(fname, "hbm.%d.in", cg_id); | ||
306 | else | ||
307 | sprintf(fname, "hbm.%d.out", cg_id); | ||
308 | fout = fopen(fname, "w"); | ||
309 | fprintf(fout, "id:%d\n", cg_id); | ||
310 | fprintf(fout, "ERROR: Could not lookup queue_stats\n"); | ||
311 | } else if (stats_flag && qstats.lastPacketTime > | ||
312 | qstats.firstPacketTime) { | ||
313 | long long delta_us = (qstats.lastPacketTime - | ||
314 | qstats.firstPacketTime)/1000; | ||
315 | unsigned int rate_mbps = ((qstats.bytes_total - | ||
316 | qstats.bytes_dropped) * 8 / | ||
317 | delta_us); | ||
318 | double percent_pkts, percent_bytes; | ||
319 | char fname[100]; | ||
320 | FILE *fout; | ||
321 | int k; | ||
322 | static const char *returnValNames[] = { | ||
323 | "DROP_PKT", | ||
324 | "ALLOW_PKT", | ||
325 | "DROP_PKT_CWR", | ||
326 | "ALLOW_PKT_CWR" | ||
327 | }; | ||
328 | #define RET_VAL_COUNT 4 | ||
329 | |||
330 | // Future support of ingress | ||
331 | // if (!outFlag) | ||
332 | // sprintf(fname, "hbm.%d.in", cg_id); | ||
333 | // else | ||
334 | sprintf(fname, "hbm.%d.out", cg_id); | ||
335 | fout = fopen(fname, "w"); | ||
336 | fprintf(fout, "id:%d\n", cg_id); | ||
337 | fprintf(fout, "rate_mbps:%d\n", rate_mbps); | ||
338 | fprintf(fout, "duration:%.1f secs\n", | ||
339 | (qstats.lastPacketTime - qstats.firstPacketTime) / | ||
340 | 1000000000.0); | ||
341 | fprintf(fout, "packets:%d\n", (int)qstats.pkts_total); | ||
342 | fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total / | ||
343 | 1000000)); | ||
344 | fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped); | ||
345 | fprintf(fout, "bytes_dropped_MB:%d\n", | ||
346 | (int)(qstats.bytes_dropped / | ||
347 | 1000000)); | ||
348 | // Marked Pkts and Bytes | ||
349 | percent_pkts = (qstats.pkts_marked * 100.0) / | ||
350 | (qstats.pkts_total + 1); | ||
351 | percent_bytes = (qstats.bytes_marked * 100.0) / | ||
352 | (qstats.bytes_total + 1); | ||
353 | fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts); | ||
354 | fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes); | ||
355 | |||
356 | // Dropped Pkts and Bytes | ||
357 | percent_pkts = (qstats.pkts_dropped * 100.0) / | ||
358 | (qstats.pkts_total + 1); | ||
359 | percent_bytes = (qstats.bytes_dropped * 100.0) / | ||
360 | (qstats.bytes_total + 1); | ||
361 | fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); | ||
362 | fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); | ||
363 | |||
364 | // ECN CE markings | ||
365 | percent_pkts = (qstats.pkts_ecn_ce * 100.0) / | ||
366 | (qstats.pkts_total + 1); | ||
367 | fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts, | ||
368 | (int)qstats.pkts_ecn_ce); | ||
369 | |||
370 | // Average cwnd | ||
371 | fprintf(fout, "avg cwnd:%d\n", | ||
372 | (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1))); | ||
373 | // Average rtt | ||
374 | fprintf(fout, "avg rtt:%d\n", | ||
375 | (int)(qstats.sum_rtt / (qstats.pkts_total + 1))); | ||
376 | // Average credit | ||
377 | if (edt_flag) | ||
378 | fprintf(fout, "avg credit_ms:%.03f\n", | ||
379 | (qstats.sum_credit / | ||
380 | (qstats.pkts_total + 1.0)) / 1000000.0); | ||
381 | else | ||
382 | fprintf(fout, "avg credit:%d\n", | ||
383 | (int)(qstats.sum_credit / | ||
384 | (1500 * ((int)qstats.pkts_total ) + 1))); | ||
385 | |||
386 | // Return values stats | ||
387 | for (k = 0; k < RET_VAL_COUNT; k++) { | ||
388 | percent_pkts = (qstats.returnValCount[k] * 100.0) / | ||
389 | (qstats.pkts_total + 1); | ||
390 | fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k], | ||
391 | percent_pkts, (int)qstats.returnValCount[k]); | ||
392 | } | ||
393 | fclose(fout); | ||
394 | } | ||
395 | |||
396 | if (debugFlag) | ||
397 | read_trace_pipe2(); | ||
398 | return rc; | ||
399 | err: | ||
400 | rc = 1; | ||
401 | |||
402 | if (cg1) | ||
403 | close(cg1); | ||
404 | cleanup_cgroup_environment(); | ||
405 | |||
406 | return rc; | ||
407 | } | ||
408 | |||
409 | static void Usage(void) | ||
410 | { | ||
411 | printf("This program loads a cgroup skb BPF program to enforce\n" | ||
412 | "cgroup output (egress) bandwidth limits.\n\n" | ||
413 | "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n" | ||
414 | " [-s] [-t <secs>] [-w] [-h] [prog]\n" | ||
415 | " Where:\n" | ||
416 | " -o indicates egress direction (default)\n" | ||
417 | " -d print BPF trace debug buffer\n" | ||
418 | " --edt use fq's Earliest Departure Time\n" | ||
419 | " -l also limit flows using loopback\n" | ||
420 | " -n <#> to create cgroup \"/hbm#\" and attach prog\n" | ||
421 | " Default is /hbm1\n" | ||
422 | " --no_cn disable CN notifications\n" | ||
423 | " -r <rate> Rate in Mbps\n" | ||
424 | " -s Update HBM stats\n" | ||
425 | " -t <time> Exit after specified seconds (default is 0)\n" | ||
426 | " -w Work conserving flag. cgroup can increase\n" | ||
427 | " bandwidth beyond the rate limit specified\n" | ||
428 | " while there is available bandwidth. Current\n" | ||
429 | " implementation assumes there is only eth0\n" | ||
430 | " but can be extended to support multiple NICs\n" | ||
431 | " -h print this info\n" | ||
432 | " prog BPF program file name. Name defaults to\n" | ||
433 | " hbm_out_kern.o\n"); | ||
434 | } | ||
435 | |||
436 | int main(int argc, char **argv) | ||
437 | { | ||
438 | char *prog = "hbm_out_kern.o"; | ||
439 | int k; | ||
440 | int cg_id = 1; | ||
441 | char *optstring = "iodln:r:st:wh"; | ||
442 | struct option loptions[] = { | ||
443 | {"no_cn", 0, NULL, 1}, | ||
444 | {"edt", 0, NULL, 2}, | ||
445 | {NULL, 0, NULL, 0} | ||
446 | }; | ||
447 | |||
448 | while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) { | ||
449 | switch (k) { | ||
450 | case 1: | ||
451 | no_cn_flag = true; | ||
452 | break; | ||
453 | case 2: | ||
454 | prog = "hbm_edt_kern.o"; | ||
455 | edt_flag = true; | ||
456 | break; | ||
457 | case 'o': | ||
458 | break; | ||
459 | case 'd': | ||
460 | debugFlag = true; | ||
461 | break; | ||
462 | case 'l': | ||
463 | loopback_flag = true; | ||
464 | break; | ||
465 | case 'n': | ||
466 | cg_id = atoi(optarg); | ||
467 | break; | ||
468 | case 'r': | ||
469 | minRate = atoi(optarg) * 1.024; | ||
470 | rate = minRate; | ||
471 | break; | ||
472 | case 's': | ||
473 | stats_flag = true; | ||
474 | break; | ||
475 | case 't': | ||
476 | dur = atoi(optarg); | ||
477 | break; | ||
478 | case 'w': | ||
479 | work_conserving_flag = true; | ||
480 | break; | ||
481 | case '?': | ||
482 | if (optopt == 'n' || optopt == 'r' || optopt == 't') | ||
483 | fprintf(stderr, | ||
484 | "Option -%c requires an argument.\n\n", | ||
485 | optopt); | ||
486 | case 'h': | ||
487 | __fallthrough; | ||
488 | default: | ||
489 | Usage(); | ||
490 | return 0; | ||
491 | } | ||
492 | } | ||
493 | |||
494 | if (optind < argc) | ||
495 | prog = argv[optind]; | ||
496 | printf("HBM prog: %s\n", prog != NULL ? prog : "NULL"); | ||
497 | |||
498 | return run_bpf_prog(prog, cg_id); | ||
499 | } | ||
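The work-conserving loop above is the least obvious part of hbm.c: every 10 ms it samples both eth0's tx_bytes and the cgroup's bytes_total from the shared map, converts each delta to bits per second (delta_bytes * 8000000 / delta_time, with delta_time in microseconds), then nudges the cgroup limit up by 6.25% (rate >> 4) when the NIC is below 9.5 Gbps and the cgroup is already using at least ~97% of its current limit, or backs it off by 12.5% (rate >> 3) once the NIC saturates, never dropping below the requested minimum. The standalone sketch below only restates that decision; the constants mirror hbm.c's defaults and the function is illustrative, not part of the sample.

	/* Standalone restatement of the work-conserving adjustment above.
	 * rate_mbps is the current cgroup limit; cg_delta_bytes/delta_time_us
	 * describe the last sample window; eth_rate_bps is the NIC's rate.
	 */
	#define RATE_THRESHOLD_BPS 9500000000ULL	/* 9.5 Gbps, as in hbm.c */
	#define MIN_RATE_MBPS 1000			/* hbm.c default minRate */

	static int adjust_rate(int rate_mbps, long long cg_delta_bytes,
			       long long delta_time_us, long long eth_rate_bps)
	{
		long long cg_rate_bps = (cg_delta_bytes * 8000000) / delta_time_us;

		if (eth_rate_bps < (long long)RATE_THRESHOLD_BPS) {
			/* NIC not saturated: grow by 6.25%, but only if the
			 * cgroup is within ~3% of its current limit.
			 */
			long long limit_bps = (long long)rate_mbps * 1000000;
			long long diff100 = (limit_bps - cg_rate_bps) * 100 / limit_bps;

			if (diff100 <= 3) {
				rate_mbps += rate_mbps >> 4;
				if (rate_mbps > (int)(RATE_THRESHOLD_BPS / 1000000))
					rate_mbps = RATE_THRESHOLD_BPS / 1000000;
			}
		} else {
			/* NIC saturated: back off by 12.5%, never below the
			 * rate the user asked for.
			 */
			rate_mbps -= rate_mbps >> 3;
			if (rate_mbps < MIN_RATE_MBPS)
				rate_mbps = MIN_RATE_MBPS;
		}
		return rate_mbps;
	}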
diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h new file mode 100644 index 000000000..f0963ed6a --- /dev/null +++ b/samples/bpf/hbm.h | |||
@@ -0,0 +1,38 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 | ||
2 | * | ||
3 | * Copyright (c) 2019 Facebook | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of version 2 of the GNU General Public | ||
7 | * License as published by the Free Software Foundation. | ||
8 | * | ||
9 | * Include file for Host Bandwidth Management (HBM) programs | ||
10 | */ | ||
11 | struct hbm_vqueue { | ||
12 | struct bpf_spin_lock lock; | ||
13 | /* 4 byte hole */ | ||
14 | unsigned long long lasttime; /* In ns */ | ||
15 | int credit; /* In bytes */ | ||
16 | unsigned int rate; /* In bytes per NS << 20 */ | ||
17 | }; | ||
18 | |||
19 | struct hbm_queue_stats { | ||
20 | unsigned long rate; /* in Mbps*/ | ||
21 | unsigned long stats:1, /* get HBM stats (marked, dropped,..) */ | ||
22 | loopback:1, /* also limit flows using loopback */ | ||
23 | no_cn:1; /* do not use cn flags */ | ||
24 | unsigned long long pkts_marked; | ||
25 | unsigned long long bytes_marked; | ||
26 | unsigned long long pkts_dropped; | ||
27 | unsigned long long bytes_dropped; | ||
28 | unsigned long long pkts_total; | ||
29 | unsigned long long bytes_total; | ||
30 | unsigned long long firstPacketTime; | ||
31 | unsigned long long lastPacketTime; | ||
32 | unsigned long long pkts_ecn_ce; | ||
33 | unsigned long long returnValCount[4]; | ||
34 | unsigned long long sum_cwnd; | ||
35 | unsigned long long sum_rtt; | ||
36 | unsigned long long sum_cwnd_cnt; | ||
37 | long long sum_credit; | ||
38 | }; | ||
diff --git a/samples/bpf/hbm_edt_kern.c b/samples/bpf/hbm_edt_kern.c new file mode 100644 index 000000000..a65b677ac --- /dev/null +++ b/samples/bpf/hbm_edt_kern.c | |||
@@ -0,0 +1,168 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* Copyright (c) 2019 Facebook | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of version 2 of the GNU General Public | ||
6 | * License as published by the Free Software Foundation. | ||
7 | * | ||
8 | * Sample Host Bandwidth Manager (HBM) BPF program. | ||
9 | * | ||
10 | * A cgroup skb BPF egress program to limit cgroup output bandwidth. | ||
11 | * It uses a modified virtual token bucket queue to limit average | ||
12 | * egress bandwidth. The implementation uses credits instead of tokens. | ||
13 | * Negative credits imply that queueing would have happened (this is | ||
14 | * a virtual queue, so no queueing is done by it; however, queueing may | ||
15 | * occur at the actual qdisc, which is not used for rate limiting). | ||
16 | * | ||
17 | * This implementation uses 3 thresholds, one to start marking packets and | ||
18 | * the other two to drop packets: | ||
19 | * CREDIT | ||
20 | * - <--------------------------|------------------------> + | ||
21 | * | | | 0 | ||
22 | * | Large pkt | | ||
23 | * | drop thresh | | ||
24 | * Small pkt drop Mark threshold | ||
25 | * thresh | ||
26 | * | ||
27 | * The effect of marking depends on the type of packet: | ||
28 | * a) If the packet is ECN enabled and it is a TCP packet, then the packet | ||
29 | * is ECN marked. | ||
30 | * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr | ||
31 | * to reduce the congestion window. The current implementation uses a linear | ||
32 | * distribution (0% probability at marking threshold, 100% probability | ||
33 | * at drop threshold). | ||
34 | * c) If the packet is not a TCP packet, then it is dropped. | ||
35 | * | ||
36 | * If the credit is below the drop threshold, the packet is dropped. If it | ||
37 | * is a TCP packet, then it also calls tcp_cwr since packets dropped | ||
38 | * by a cgroup skb BPF program do not automatically trigger a call to | ||
39 | * tcp_cwr in the current kernel code. | ||
40 | * | ||
41 | * This BPF program actually uses 2 drop thresholds, one threshold | ||
42 | * for larger packets (>= 120 bytes) and another for smaller packets. This | ||
43 | * protects smaller packets such as SYNs, ACKs, etc. | ||
44 | * | ||
45 | * The default bandwidth limit is set at 1Gbps but this can be changed by | ||
46 | * a user program through a shared BPF map. In addition, by default this BPF | ||
47 | * program does not limit connections using loopback. This behavior can be | ||
48 | * overwritten by the user program. There is also an option to calculate | ||
49 | * some statistics, such as percent of packets marked or dropped, which | ||
50 | * a user program, such as hbm, can access. | ||
51 | */ | ||
52 | |||
53 | #include "hbm_kern.h" | ||
54 | |||
55 | SEC("cgroup_skb/egress") | ||
56 | int _hbm_out_cg(struct __sk_buff *skb) | ||
57 | { | ||
58 | long long delta = 0, delta_send; | ||
59 | unsigned long long curtime, sendtime; | ||
60 | struct hbm_queue_stats *qsp = NULL; | ||
61 | unsigned int queue_index = 0; | ||
62 | bool congestion_flag = false; | ||
63 | bool ecn_ce_flag = false; | ||
64 | struct hbm_pkt_info pkti = {}; | ||
65 | struct hbm_vqueue *qdp; | ||
66 | bool drop_flag = false; | ||
67 | bool cwr_flag = false; | ||
68 | int len = skb->len; | ||
69 | int rv = ALLOW_PKT; | ||
70 | |||
71 | qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); | ||
72 | |||
73 | // Check if we should ignore loopback traffic | ||
74 | if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) | ||
75 | return ALLOW_PKT; | ||
76 | |||
77 | hbm_get_pkt_info(skb, &pkti); | ||
78 | |||
79 | // We may want to account for the length of headers in the len | ||
80 | // calculation, like the ETH header + overhead, especially if it | ||
81 | // is a GSO packet, but that is not done here. | ||
82 | |||
83 | qdp = bpf_get_local_storage(&queue_state, 0); | ||
84 | if (!qdp) | ||
85 | return ALLOW_PKT; | ||
86 | if (qdp->lasttime == 0) | ||
87 | hbm_init_edt_vqueue(qdp, 1024); | ||
88 | |||
89 | curtime = bpf_ktime_get_ns(); | ||
90 | |||
91 | // Begin critical section | ||
92 | bpf_spin_lock(&qdp->lock); | ||
93 | delta = qdp->lasttime - curtime; | ||
94 | // bound bursts to 100us | ||
95 | if (delta < -BURST_SIZE_NS) { | ||
96 | // negative delta is a credit that allows bursts | ||
97 | qdp->lasttime = curtime - BURST_SIZE_NS; | ||
98 | delta = -BURST_SIZE_NS; | ||
99 | } | ||
100 | sendtime = qdp->lasttime; | ||
101 | delta_send = BYTES_TO_NS(len, qdp->rate); | ||
102 | __sync_add_and_fetch(&(qdp->lasttime), delta_send); | ||
103 | bpf_spin_unlock(&qdp->lock); | ||
104 | // End critical section | ||
105 | |||
106 | // Set EDT of packet | ||
107 | skb->tstamp = sendtime; | ||
108 | |||
109 | // Check if we should update rate | ||
110 | if (qsp != NULL && (qsp->rate * 128) != qdp->rate) | ||
111 | qdp->rate = qsp->rate * 128; | ||
112 | |||
113 | // Set flags (drop, congestion, cwr) | ||
114 | // last packet will be sent in the future, bound latency | ||
115 | if (delta > DROP_THRESH_NS || (delta > LARGE_PKT_DROP_THRESH_NS && | ||
116 | len > LARGE_PKT_THRESH)) { | ||
117 | drop_flag = true; | ||
118 | if (pkti.is_tcp && pkti.ecn == 0) | ||
119 | cwr_flag = true; | ||
120 | } else if (delta > MARK_THRESH_NS) { | ||
121 | if (pkti.is_tcp) | ||
122 | congestion_flag = true; | ||
123 | else | ||
124 | drop_flag = true; | ||
125 | } | ||
126 | |||
127 | if (congestion_flag) { | ||
128 | if (bpf_skb_ecn_set_ce(skb)) { | ||
129 | ecn_ce_flag = true; | ||
130 | } else { | ||
131 | if (pkti.is_tcp) { | ||
132 | unsigned int rand = bpf_get_prandom_u32(); | ||
133 | |||
134 | if (delta >= MARK_THRESH_NS + | ||
135 | (rand % MARK_REGION_SIZE_NS)) { | ||
136 | // Do congestion control | ||
137 | cwr_flag = true; | ||
138 | } | ||
139 | } else if (len > LARGE_PKT_THRESH) { | ||
140 | // Problem if too many small packets? | ||
141 | drop_flag = true; | ||
142 | congestion_flag = false; | ||
143 | } | ||
144 | } | ||
145 | } | ||
146 | |||
147 | if (pkti.is_tcp && drop_flag && pkti.packets_out <= 1) { | ||
148 | drop_flag = false; | ||
149 | cwr_flag = true; | ||
150 | congestion_flag = false; | ||
151 | } | ||
152 | |||
153 | if (qsp != NULL && qsp->no_cn) | ||
154 | cwr_flag = false; | ||
155 | |||
156 | hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag, | ||
157 | cwr_flag, ecn_ce_flag, &pkti, (int) delta); | ||
158 | |||
159 | if (drop_flag) { | ||
160 | __sync_add_and_fetch(&(qdp->lasttime), -delta_send); | ||
161 | rv = DROP_PKT; | ||
162 | } | ||
163 | |||
164 | if (cwr_flag) | ||
165 | rv |= CWR; | ||
166 | return rv; | ||
167 | } | ||
168 | char _license[] SEC("license") = "GPL"; | ||
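A worked example makes the EDT arithmetic above concrete. With the default setting, hbm_init_edt_vqueue(qdp, 1024) gives qdp->rate = 1024 * 128 = 131072, which is 131072 / 2^20 = 0.125 bytes per ns, exactly 1 Gbps. A full-size 1500-byte packet therefore yields delta_send = BYTES_TO_NS(1500, 131072) = (1500 << 20) / 131072 = 12000 ns, so each such packet pushes the virtual departure time (lasttime) 12 us further into the future, and skb->tstamp tells fq when to actually release it. delta = lasttime - curtime is then the backlog of the virtual queue: marking/cwr starts once it exceeds MARK_THRESH_NS (50 us), large packets are dropped beyond LARGE_PKT_DROP_THRESH_NS (480 us), everything is dropped beyond DROP_THRESH_NS (500 us), and a negative delta is clamped at -BURST_SIZE_NS, so a cgroup can run at most 100 us ahead, roughly eight full-size packets of burst at 1 Gbps.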
diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h new file mode 100644 index 000000000..e00f26f6a --- /dev/null +++ b/samples/bpf/hbm_kern.h | |||
@@ -0,0 +1,217 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 | ||
2 | * | ||
3 | * Copyright (c) 2019 Facebook | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of version 2 of the GNU General Public | ||
7 | * License as published by the Free Software Foundation. | ||
8 | * | ||
9 | * Include file for sample Host Bandwidth Manager (HBM) BPF programs | ||
10 | */ | ||
11 | #define KBUILD_MODNAME "foo" | ||
12 | #include <stddef.h> | ||
13 | #include <stdbool.h> | ||
14 | #include <uapi/linux/bpf.h> | ||
15 | #include <uapi/linux/if_ether.h> | ||
16 | #include <uapi/linux/if_packet.h> | ||
17 | #include <uapi/linux/ip.h> | ||
18 | #include <uapi/linux/ipv6.h> | ||
19 | #include <uapi/linux/in.h> | ||
20 | #include <uapi/linux/tcp.h> | ||
21 | #include <uapi/linux/filter.h> | ||
22 | #include <uapi/linux/pkt_cls.h> | ||
23 | #include <net/ipv6.h> | ||
24 | #include <net/inet_ecn.h> | ||
25 | #include <bpf/bpf_endian.h> | ||
26 | #include <bpf/bpf_helpers.h> | ||
27 | #include "hbm.h" | ||
28 | |||
29 | #define DROP_PKT 0 | ||
30 | #define ALLOW_PKT 1 | ||
31 | #define TCP_ECN_OK 1 | ||
32 | #define CWR 2 | ||
33 | |||
34 | #ifndef HBM_DEBUG // Define HBM_DEBUG to enable debugging | ||
35 | #undef bpf_printk | ||
36 | #define bpf_printk(fmt, ...) | ||
37 | #endif | ||
38 | |||
39 | #define INITIAL_CREDIT_PACKETS 100 | ||
40 | #define MAX_BYTES_PER_PACKET 1500 | ||
41 | #define MARK_THRESH (40 * MAX_BYTES_PER_PACKET) | ||
42 | #define DROP_THRESH (80 * 5 * MAX_BYTES_PER_PACKET) | ||
43 | #define LARGE_PKT_DROP_THRESH (DROP_THRESH - (15 * MAX_BYTES_PER_PACKET)) | ||
44 | #define MARK_REGION_SIZE (LARGE_PKT_DROP_THRESH - MARK_THRESH) | ||
45 | #define LARGE_PKT_THRESH 120 | ||
46 | #define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET) | ||
47 | #define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET) | ||
48 | |||
49 | // Time base accounting for fq's EDT | ||
50 | #define BURST_SIZE_NS 100000 // 100us | ||
51 | #define MARK_THRESH_NS 50000 // 50us | ||
52 | #define DROP_THRESH_NS 500000 // 500us | ||
53 | // Reserve 20us of queuing for small packets (less than 120 bytes) | ||
54 | #define LARGE_PKT_DROP_THRESH_NS (DROP_THRESH_NS - 20000) | ||
55 | #define MARK_REGION_SIZE_NS (LARGE_PKT_DROP_THRESH_NS - MARK_THRESH_NS) | ||
56 | |||
57 | // rate in bytes per ns << 20 | ||
58 | #define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) | ||
59 | #define BYTES_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) | ||
60 | #define BYTES_TO_NS(bytes, rate) div64_u64(((u64)(bytes)) << 20, (u64)(rate)) | ||
61 | |||
62 | struct { | ||
63 | __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE); | ||
64 | __type(key, struct bpf_cgroup_storage_key); | ||
65 | __type(value, struct hbm_vqueue); | ||
66 | } queue_state SEC(".maps"); | ||
67 | |||
68 | struct { | ||
69 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
70 | __uint(max_entries, 1); | ||
71 | __type(key, u32); | ||
72 | __type(value, struct hbm_queue_stats); | ||
73 | } queue_stats SEC(".maps"); | ||
74 | |||
75 | struct hbm_pkt_info { | ||
76 | int cwnd; | ||
77 | int rtt; | ||
78 | int packets_out; | ||
79 | bool is_ip; | ||
80 | bool is_tcp; | ||
81 | short ecn; | ||
82 | }; | ||
83 | |||
84 | static int get_tcp_info(struct __sk_buff *skb, struct hbm_pkt_info *pkti) | ||
85 | { | ||
86 | struct bpf_sock *sk; | ||
87 | struct bpf_tcp_sock *tp; | ||
88 | |||
89 | sk = skb->sk; | ||
90 | if (sk) { | ||
91 | sk = bpf_sk_fullsock(sk); | ||
92 | if (sk) { | ||
93 | if (sk->protocol == IPPROTO_TCP) { | ||
94 | tp = bpf_tcp_sock(sk); | ||
95 | if (tp) { | ||
96 | pkti->cwnd = tp->snd_cwnd; | ||
97 | pkti->rtt = tp->srtt_us >> 3; | ||
98 | pkti->packets_out = tp->packets_out; | ||
99 | return 0; | ||
100 | } | ||
101 | } | ||
102 | } | ||
103 | } | ||
104 | pkti->cwnd = 0; | ||
105 | pkti->rtt = 0; | ||
106 | pkti->packets_out = 0; | ||
107 | return 1; | ||
108 | } | ||
109 | |||
110 | static void hbm_get_pkt_info(struct __sk_buff *skb, | ||
111 | struct hbm_pkt_info *pkti) | ||
112 | { | ||
113 | struct iphdr iph; | ||
114 | struct ipv6hdr *ip6h; | ||
115 | |||
116 | pkti->cwnd = 0; | ||
117 | pkti->rtt = 0; | ||
118 | bpf_skb_load_bytes(skb, 0, &iph, 12); | ||
119 | if (iph.version == 6) { | ||
120 | ip6h = (struct ipv6hdr *)&iph; | ||
121 | pkti->is_ip = true; | ||
122 | pkti->is_tcp = (ip6h->nexthdr == 6); | ||
123 | pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK; | ||
124 | } else if (iph.version == 4) { | ||
125 | pkti->is_ip = true; | ||
126 | pkti->is_tcp = (iph.protocol == 6); | ||
127 | pkti->ecn = iph.tos & INET_ECN_MASK; | ||
128 | } else { | ||
129 | pkti->is_ip = false; | ||
130 | pkti->is_tcp = false; | ||
131 | pkti->ecn = 0; | ||
132 | } | ||
133 | if (pkti->is_tcp) | ||
134 | get_tcp_info(skb, pkti); | ||
135 | } | ||
136 | |||
137 | static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate) | ||
138 | { | ||
139 | bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); | ||
140 | qdp->lasttime = bpf_ktime_get_ns(); | ||
141 | qdp->credit = INIT_CREDIT; | ||
142 | qdp->rate = rate * 128; | ||
143 | } | ||
144 | |||
145 | static __always_inline void hbm_init_edt_vqueue(struct hbm_vqueue *qdp, | ||
146 | int rate) | ||
147 | { | ||
148 | unsigned long long curtime; | ||
149 | |||
150 | curtime = bpf_ktime_get_ns(); | ||
151 | bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); | ||
152 | qdp->lasttime = curtime - BURST_SIZE_NS; // support initial burst | ||
153 | qdp->credit = 0; // not used | ||
154 | qdp->rate = rate * 128; | ||
155 | } | ||
156 | |||
157 | static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, | ||
158 | int len, | ||
159 | unsigned long long curtime, | ||
160 | bool congestion_flag, | ||
161 | bool drop_flag, | ||
162 | bool cwr_flag, | ||
163 | bool ecn_ce_flag, | ||
164 | struct hbm_pkt_info *pkti, | ||
165 | int credit) | ||
166 | { | ||
167 | int rv = ALLOW_PKT; | ||
168 | |||
169 | if (qsp != NULL) { | ||
170 | // Following is needed for work conserving | ||
171 | __sync_add_and_fetch(&(qsp->bytes_total), len); | ||
172 | if (qsp->stats) { | ||
173 | // Optionally update statistics | ||
174 | if (qsp->firstPacketTime == 0) | ||
175 | qsp->firstPacketTime = curtime; | ||
176 | qsp->lastPacketTime = curtime; | ||
177 | __sync_add_and_fetch(&(qsp->pkts_total), 1); | ||
178 | if (congestion_flag) { | ||
179 | __sync_add_and_fetch(&(qsp->pkts_marked), 1); | ||
180 | __sync_add_and_fetch(&(qsp->bytes_marked), len); | ||
181 | } | ||
182 | if (drop_flag) { | ||
183 | __sync_add_and_fetch(&(qsp->pkts_dropped), 1); | ||
184 | __sync_add_and_fetch(&(qsp->bytes_dropped), | ||
185 | len); | ||
186 | } | ||
187 | if (ecn_ce_flag) | ||
188 | __sync_add_and_fetch(&(qsp->pkts_ecn_ce), 1); | ||
189 | if (pkti->cwnd) { | ||
190 | __sync_add_and_fetch(&(qsp->sum_cwnd), | ||
191 | pkti->cwnd); | ||
192 | __sync_add_and_fetch(&(qsp->sum_cwnd_cnt), 1); | ||
193 | } | ||
194 | if (pkti->rtt) | ||
195 | __sync_add_and_fetch(&(qsp->sum_rtt), | ||
196 | pkti->rtt); | ||
197 | __sync_add_and_fetch(&(qsp->sum_credit), credit); | ||
198 | |||
199 | if (drop_flag) | ||
200 | rv = DROP_PKT; | ||
201 | if (cwr_flag) | ||
202 | rv |= 2; | ||
203 | if (rv == DROP_PKT) | ||
204 | __sync_add_and_fetch(&(qsp->returnValCount[0]), | ||
205 | 1); | ||
206 | else if (rv == ALLOW_PKT) | ||
207 | __sync_add_and_fetch(&(qsp->returnValCount[1]), | ||
208 | 1); | ||
209 | else if (rv == 2) | ||
210 | __sync_add_and_fetch(&(qsp->returnValCount[2]), | ||
211 | 1); | ||
212 | else if (rv == 3) | ||
213 | __sync_add_and_fetch(&(qsp->returnValCount[3]), | ||
214 | 1); | ||
215 | } | ||
216 | } | ||
217 | } | ||
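The fixed-point rate encoding behind CREDIT_PER_NS() and BYTES_TO_NS() is worth spelling out: the user asks for a limit in Mbps, hbm.c stores minRate = Mbps * 1.024, and the BPF side multiplies by another 128, so the vqueue rate ends up as Mbps * 131.072, which is exactly that many Mbit/s expressed in (bytes per ns) << 20; multiplying by an elapsed time in ns and shifting right by 20 then gives bytes. The small userspace sketch below is not a BPF program and only checks the numbers for a 1000 Mbps request.

	/* Userspace sketch verifying the fixed-point rate encoding above;
	 * a numerical check only, not part of the samples.
	 */
	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		int mbps = 1000;			/* requested limit in Mbps */
		int rate_param = mbps * 1.024;		/* what hbm.c stores in minRate */
		uint64_t rate = (uint64_t)rate_param * 128; /* what the BPF side keeps */

		assert(rate == 131072);	/* 131072 / 2^20 = 0.125 bytes/ns = 1 Gbps */

		/* CREDIT_PER_NS(delta, rate): credit earned over 1 ms at 1 Gbps */
		assert(((1000000ULL * rate) >> 20) == 125000);	/* 125 KB per ms */

		/* BYTES_TO_NS(bytes, rate): time to send one 1500-byte packet */
		assert(((1500ULL << 20) / rate) == 12000);	/* 12 us at 1 Gbps */
		return 0;
	}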
diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c new file mode 100644 index 000000000..829934bd4 --- /dev/null +++ b/samples/bpf/hbm_out_kern.c | |||
@@ -0,0 +1,179 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* Copyright (c) 2019 Facebook | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of version 2 of the GNU General Public | ||
6 | * License as published by the Free Software Foundation. | ||
7 | * | ||
8 | * Sample Host Bandwidth Manager (HBM) BPF program. | ||
9 | * | ||
10 | * A cgroup skb BPF egress program to limit cgroup output bandwidth. | ||
11 | * It uses a modified virtual token bucket queue to limit average | ||
12 | * egress bandwidth. The implementation uses credits instead of tokens. | ||
13 | * Negative credits imply that queueing would have happened (this is | ||
14 | * a virtual queue, so no queueing is done by it; however, queueing may | ||
15 | * occur at the actual qdisc, which is not used for rate limiting). | ||
16 | * | ||
17 | * This implementation uses 3 thresholds, one to start marking packets and | ||
18 | * the other two to drop packets: | ||
19 | * CREDIT | ||
20 | * - <--------------------------|------------------------> + | ||
21 | * | | | 0 | ||
22 | * | Large pkt | | ||
23 | * | drop thresh | | ||
24 | * Small pkt drop Mark threshold | ||
25 | * thresh | ||
26 | * | ||
27 | * The effect of marking depends on the type of packet: | ||
28 | * a) If the packet is ECN enabled and it is a TCP packet, then the packet | ||
29 | * is ECN marked. | ||
30 | * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr | ||
31 | * to reduce the congestion window. The current implementation uses a linear | ||
32 | * distribution (0% probability at marking threshold, 100% probability | ||
33 | * at drop threshold). | ||
34 | * c) If the packet is not a TCP packet, then it is dropped. | ||
35 | * | ||
36 | * If the credit is below the drop threshold, the packet is dropped. If it | ||
37 | * is a TCP packet, then it also calls tcp_cwr since packets dropped | ||
38 | * by a cgroup skb BPF program do not automatically trigger a call to | ||
39 | * tcp_cwr in the current kernel code. | ||
40 | * | ||
41 | * This BPF program actually uses 2 drop thresholds, one threshold | ||
42 | * for larger packets (>= 120 bytes) and another for smaller packets. This | ||
43 | * protects smaller packets such as SYNs, ACKs, etc. | ||
44 | * | ||
45 | * The default bandwidth limit is set at 1Gbps but this can be changed by | ||
46 | * a user program through a shared BPF map. In addition, by default this BPF | ||
47 | * program does not limit connections using loopback. This behavior can be | ||
48 | * overridden by the user program. There is also an option to calculate | ||
49 | * some statistics, such as percent of packets marked or dropped, which | ||
50 | * the user program can access. | ||
51 | * | ||
52 | * A later patch provides such a program (hbm.c). | ||
53 | */ | ||
54 | |||
55 | #include "hbm_kern.h" | ||
56 | |||
57 | SEC("cgroup_skb/egress") | ||
58 | int _hbm_out_cg(struct __sk_buff *skb) | ||
59 | { | ||
60 | struct hbm_pkt_info pkti; | ||
61 | int len = skb->len; | ||
62 | unsigned int queue_index = 0; | ||
63 | unsigned long long curtime; | ||
64 | int credit; | ||
65 | signed long long delta = 0, new_credit; | ||
66 | int max_credit = MAX_CREDIT; | ||
67 | bool congestion_flag = false; | ||
68 | bool drop_flag = false; | ||
69 | bool cwr_flag = false; | ||
70 | bool ecn_ce_flag = false; | ||
71 | struct hbm_vqueue *qdp; | ||
72 | struct hbm_queue_stats *qsp = NULL; | ||
73 | int rv = ALLOW_PKT; | ||
74 | |||
75 | qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); | ||
76 | if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) | ||
77 | return ALLOW_PKT; | ||
78 | |||
79 | hbm_get_pkt_info(skb, &pkti); | ||
80 | |||
81 | // We may want to account for the length of headers in the len | ||
82 | // calculation, like ETH header + overhead, especially if it | ||
83 | // is a GSO packet, but that is not done here. | ||
84 | |||
85 | qdp = bpf_get_local_storage(&queue_state, 0); | ||
86 | if (!qdp) | ||
87 | return ALLOW_PKT; | ||
88 | else if (qdp->lasttime == 0) | ||
89 | hbm_init_vqueue(qdp, 1024); | ||
90 | |||
91 | curtime = bpf_ktime_get_ns(); | ||
92 | |||
93 | // Begin critical section | ||
94 | bpf_spin_lock(&qdp->lock); | ||
95 | credit = qdp->credit; | ||
96 | delta = curtime - qdp->lasttime; | ||
97 | /* delta < 0 implies that another process with a curtime greater | ||
98 | * than ours beat us to the critical section and already added | ||
99 | * the new credit, so we should not add it ourselves | ||
100 | */ | ||
101 | if (delta > 0) { | ||
102 | qdp->lasttime = curtime; | ||
103 | new_credit = credit + CREDIT_PER_NS(delta, qdp->rate); | ||
104 | if (new_credit > MAX_CREDIT) | ||
105 | credit = MAX_CREDIT; | ||
106 | else | ||
107 | credit = new_credit; | ||
108 | } | ||
109 | credit -= len; | ||
110 | qdp->credit = credit; | ||
111 | bpf_spin_unlock(&qdp->lock); | ||
112 | // End critical section | ||
113 | |||
114 | // Check if we should update rate | ||
115 | if (qsp != NULL && (qsp->rate * 128) != qdp->rate) { | ||
116 | qdp->rate = qsp->rate * 128; | ||
117 | bpf_printk("Updating rate: %d (1sec:%llu bits)\n", | ||
118 | (int)qdp->rate, | ||
119 | CREDIT_PER_NS(1000000000, qdp->rate) * 8); | ||
120 | } | ||
121 | |||
122 | // Set flags (drop, congestion, cwr) | ||
123 | // Dropping => we are congested, so ignore congestion flag | ||
124 | if (credit < -DROP_THRESH || | ||
125 | (len > LARGE_PKT_THRESH && credit < -LARGE_PKT_DROP_THRESH)) { | ||
126 | // Very congested, set drop packet | ||
127 | drop_flag = true; | ||
128 | if (pkti.ecn) | ||
129 | congestion_flag = true; | ||
130 | else if (pkti.is_tcp) | ||
131 | cwr_flag = true; | ||
132 | } else if (credit < 0) { | ||
133 | // Congested, set congestion flag | ||
134 | if (pkti.ecn || pkti.is_tcp) { | ||
135 | if (credit < -MARK_THRESH) | ||
136 | congestion_flag = true; | ||
137 | else | ||
138 | congestion_flag = false; | ||
139 | } else { | ||
140 | congestion_flag = true; | ||
141 | } | ||
142 | } | ||
143 | |||
144 | if (congestion_flag) { | ||
145 | if (bpf_skb_ecn_set_ce(skb)) { | ||
146 | ecn_ce_flag = true; | ||
147 | } else { | ||
148 | if (pkti.is_tcp) { | ||
149 | unsigned int rand = bpf_get_prandom_u32(); | ||
150 | |||
151 | if (-credit >= MARK_THRESH + | ||
152 | (rand % MARK_REGION_SIZE)) { | ||
153 | // Do congestion control | ||
154 | cwr_flag = true; | ||
155 | } | ||
156 | } else if (len > LARGE_PKT_THRESH) { | ||
157 | // Problem if too many small packets? | ||
158 | drop_flag = true; | ||
159 | } | ||
160 | } | ||
161 | } | ||
162 | |||
163 | if (qsp != NULL) | ||
164 | if (qsp->no_cn) | ||
165 | cwr_flag = false; | ||
166 | |||
167 | hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag, | ||
168 | cwr_flag, ecn_ce_flag, &pkti, credit); | ||
169 | |||
170 | if (drop_flag) { | ||
171 | __sync_add_and_fetch(&(qdp->credit), len); | ||
172 | rv = DROP_PKT; | ||
173 | } | ||
174 | |||
175 | if (cwr_flag) | ||
176 | rv |= 2; | ||
177 | return rv; | ||
178 | } | ||
179 | char _license[] SEC("license") = "GPL"; | ||
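The credit update in _hbm_out_cg() is easiest to follow outside the BPF context. The sketch below replays the same virtual token-bucket arithmetic in plain user-space C; every macro value is an assumed stand-in for the real definitions in hbm_kern.h (not shown in this hunk), so only the shape of the logic mirrors the program above.

```c
/* User-space replay of the virtual token-bucket logic in _hbm_out_cg().
 * All macro values are assumed stand-ins; the real ones live in
 * hbm_kern.h, which is not part of this hunk.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_CREDIT            (100 * 1000 * 1000)   /* assumed */
#define MARK_THRESH           (80 * 1024)           /* assumed */
#define LARGE_PKT_DROP_THRESH (4 * 80 * 1024)       /* assumed, < DROP_THRESH */
#define DROP_THRESH           (5 * 80 * 1024)       /* assumed */
#define LARGE_PKT_THRESH      120                   /* from the comment above */

/* bytes of credit earned over delta_ns at "rate"; the >> 20 scaling is an
 * assumption standing in for the CREDIT_PER_NS() macro */
static long long credit_per_ns(long long delta_ns, long long rate)
{
	return (delta_ns * rate) >> 20;
}

static bool hbm_would_drop(long long *credit, long long delta_ns,
			   long long rate, int len)
{
	long long new_credit = *credit + credit_per_ns(delta_ns, rate);

	*credit = new_credit > MAX_CREDIT ? MAX_CREDIT : new_credit;
	*credit -= len;				/* charge this packet */

	/* same two-threshold drop test as the BPF program */
	return *credit < -DROP_THRESH ||
	       (len > LARGE_PKT_THRESH && *credit < -LARGE_PKT_DROP_THRESH);
}

int main(void)
{
	long long credit = 0;
	bool drop;

	/* a 1500-byte packet arriving 10us after the previous one */
	drop = hbm_would_drop(&credit, 10000, 128 * 1024, 1500);
	printf("drop=%d credit=%lld\n", drop, credit);
	return 0;
}
```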
diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c new file mode 100644 index 000000000..3a91b4c19 --- /dev/null +++ b/samples/bpf/ibumad_kern.c | |||
@@ -0,0 +1,138 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB | ||
2 | |||
3 | /** | ||
4 | * ibumad BPF sample kernel side | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of version 2 of the GNU General Public | ||
8 | * License as published by the Free Software Foundation. | ||
9 | * | ||
10 | * Copyright(c) 2018 Ira Weiny, Intel Corporation | ||
11 | */ | ||
12 | |||
13 | #define KBUILD_MODNAME "ibumad_count_pkts_by_class" | ||
14 | #include <uapi/linux/bpf.h> | ||
15 | |||
16 | #include <bpf/bpf_helpers.h> | ||
17 | |||
18 | |||
19 | struct bpf_map_def SEC("maps") read_count = { | ||
20 | .type = BPF_MAP_TYPE_ARRAY, | ||
21 | .key_size = sizeof(u32), /* class; u32 required */ | ||
22 | .value_size = sizeof(u64), /* count of mads read */ | ||
23 | .max_entries = 256, /* Room for all Classes */ | ||
24 | }; | ||
25 | |||
26 | struct bpf_map_def SEC("maps") write_count = { | ||
27 | .type = BPF_MAP_TYPE_ARRAY, | ||
28 | .key_size = sizeof(u32), /* class; u32 required */ | ||
29 | .value_size = sizeof(u64), /* count of mads written */ | ||
30 | .max_entries = 256, /* Room for all Classes */ | ||
31 | }; | ||
32 | |||
33 | #undef DEBUG | ||
34 | #ifndef DEBUG | ||
35 | #undef bpf_printk | ||
36 | #define bpf_printk(fmt, ...) | ||
37 | #endif | ||
38 | |||
39 | /* Taken from the current format defined in | ||
40 | * include/trace/events/ib_umad.h | ||
41 | * and | ||
42 | * /sys/kernel/debug/tracing/events/ib_umad/ib_umad_read/format | ||
43 | * /sys/kernel/debug/tracing/events/ib_umad/ib_umad_write/format | ||
44 | */ | ||
45 | struct ib_umad_rw_args { | ||
46 | u64 pad; | ||
47 | u8 port_num; | ||
48 | u8 sl; | ||
49 | u8 path_bits; | ||
50 | u8 grh_present; | ||
51 | u32 id; | ||
52 | u32 status; | ||
53 | u32 timeout_ms; | ||
54 | u32 retires; | ||
55 | u32 length; | ||
56 | u32 qpn; | ||
57 | u32 qkey; | ||
58 | u8 gid_index; | ||
59 | u8 hop_limit; | ||
60 | u16 lid; | ||
61 | u16 attr_id; | ||
62 | u16 pkey_index; | ||
63 | u8 base_version; | ||
64 | u8 mgmt_class; | ||
65 | u8 class_version; | ||
66 | u8 method; | ||
67 | u32 flow_label; | ||
68 | u16 mad_status; | ||
69 | u16 class_specific; | ||
70 | u32 attr_mod; | ||
71 | u64 tid; | ||
72 | u8 gid[16]; | ||
73 | u32 dev_index; | ||
74 | u8 traffic_class; | ||
75 | }; | ||
76 | |||
77 | SEC("tracepoint/ib_umad/ib_umad_read_recv") | ||
78 | int on_ib_umad_read_recv(struct ib_umad_rw_args *ctx) | ||
79 | { | ||
80 | u64 zero = 0, *val; | ||
81 | u8 class = ctx->mgmt_class; | ||
82 | |||
83 | bpf_printk("ib_umad read recv : class 0x%x\n", class); | ||
84 | |||
85 | val = bpf_map_lookup_elem(&read_count, &class); | ||
86 | if (!val) { | ||
87 | bpf_map_update_elem(&read_count, &class, &zero, BPF_NOEXIST); | ||
88 | val = bpf_map_lookup_elem(&read_count, &class); | ||
89 | if (!val) | ||
90 | return 0; | ||
91 | } | ||
92 | |||
93 | (*val) += 1; | ||
94 | |||
95 | return 0; | ||
96 | } | ||
97 | SEC("tracepoint/ib_umad/ib_umad_read_send") | ||
98 | int on_ib_umad_read_send(struct ib_umad_rw_args *ctx) | ||
99 | { | ||
100 | u64 zero = 0, *val; | ||
101 | u8 class = ctx->mgmt_class; | ||
102 | |||
103 | bpf_printk("ib_umad read send : class 0x%x\n", class); | ||
104 | |||
105 | val = bpf_map_lookup_elem(&read_count, &class); | ||
106 | if (!val) { | ||
107 | bpf_map_update_elem(&read_count, &class, &zero, BPF_NOEXIST); | ||
108 | val = bpf_map_lookup_elem(&read_count, &class); | ||
109 | if (!val) | ||
110 | return 0; | ||
111 | } | ||
112 | |||
113 | (*val) += 1; | ||
114 | |||
115 | return 0; | ||
116 | } | ||
117 | SEC("tracepoint/ib_umad/ib_umad_write") | ||
118 | int on_ib_umad_write(struct ib_umad_rw_args *ctx) | ||
119 | { | ||
120 | u64 zero = 0, *val; | ||
121 | u8 class = ctx->mgmt_class; | ||
122 | |||
123 | bpf_printk("ib_umad write : class 0x%x\n", class); | ||
124 | |||
125 | val = bpf_map_lookup_elem(&write_count, &class); | ||
126 | if (!val) { | ||
127 | bpf_map_update_elem(&write_count, &class, &zero, BPF_NOEXIST); | ||
128 | val = bpf_map_lookup_elem(&write_count, &class); | ||
129 | if (!val) | ||
130 | return 0; | ||
131 | } | ||
132 | |||
133 | (*val) += 1; | ||
134 | |||
135 | return 0; | ||
136 | } | ||
137 | |||
138 | char _license[] SEC("license") = "GPL"; | ||
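The two maps above use the older `struct bpf_map_def SEC("maps")` declaration style. For comparison only, this is what the same 256-entry per-class counter arrays would look like in the BTF-style `SEC(".maps")` syntax that other samples in this patch (for example lathist_kern.c below) already use; the sample itself keeps the legacy form.

```c
/* Illustrative BTF-style equivalents of the legacy read_count/write_count
 * definitions above (not part of the sample).
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, u32);	/* management class */
	__type(value, u64);	/* count of MADs read */
	__uint(max_entries, 256);
} read_count SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, u32);	/* management class */
	__type(value, u64);	/* count of MADs written */
	__uint(max_entries, 256);
} write_count SEC(".maps");
```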
diff --git a/samples/bpf/ibumad_user.c b/samples/bpf/ibumad_user.c new file mode 100644 index 000000000..fa06eef31 --- /dev/null +++ b/samples/bpf/ibumad_user.c | |||
@@ -0,0 +1,122 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB | ||
2 | |||
3 | /** | ||
4 | * ibumad BPF sample user side | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of version 2 of the GNU General Public | ||
8 | * License as published by the Free Software Foundation. | ||
9 | * | ||
10 | * Copyright(c) 2018 Ira Weiny, Intel Corporation | ||
11 | */ | ||
12 | |||
13 | #include <linux/bpf.h> | ||
14 | #include <signal.h> | ||
15 | #include <stdio.h> | ||
16 | #include <stdlib.h> | ||
17 | #include <string.h> | ||
18 | #include <unistd.h> | ||
19 | #include <sys/types.h> | ||
20 | #include <limits.h> | ||
21 | |||
22 | #include <sys/resource.h> | ||
23 | #include <getopt.h> | ||
24 | #include <net/if.h> | ||
25 | |||
26 | #include "bpf_load.h" | ||
27 | #include "bpf_util.h" | ||
28 | #include <bpf/libbpf.h> | ||
29 | |||
30 | static void dump_counts(int fd) | ||
31 | { | ||
32 | __u32 key; | ||
33 | __u64 value; | ||
34 | |||
35 | for (key = 0; key < 256; key++) { | ||
36 | if (bpf_map_lookup_elem(fd, &key, &value)) { | ||
37 | printf("failed to read key %u\n", key); | ||
38 | continue; | ||
39 | } | ||
40 | if (value) | ||
41 | printf("0x%02x : %llu\n", key, value); | ||
42 | } | ||
43 | } | ||
44 | |||
45 | static void dump_all_counts(void) | ||
46 | { | ||
47 | printf("Read 'Class : count'\n"); | ||
48 | dump_counts(map_fd[0]); | ||
49 | printf("Write 'Class : count'\n"); | ||
50 | dump_counts(map_fd[1]); | ||
51 | } | ||
52 | |||
53 | static void dump_exit(int sig) | ||
54 | { | ||
55 | dump_all_counts(); | ||
56 | exit(0); | ||
57 | } | ||
58 | |||
59 | static const struct option long_options[] = { | ||
60 | {"help", no_argument, NULL, 'h'}, | ||
61 | {"delay", required_argument, NULL, 'd'}, | ||
62 | }; | ||
63 | |||
64 | static void usage(char *cmd) | ||
65 | { | ||
66 | printf("eBPF test program to count packets from various IP addresses\n" | ||
67 | "Usage: %s <options>\n" | ||
68 | " --help, -h this menu\n" | ||
69 | " --delay, -d <delay> wait <delay> sec between prints [1 - 1000000]\n" | ||
70 | , cmd | ||
71 | ); | ||
72 | } | ||
73 | |||
74 | int main(int argc, char **argv) | ||
75 | { | ||
76 | unsigned long delay = 5; | ||
77 | int longindex = 0; | ||
78 | int opt; | ||
79 | char bpf_file[256]; | ||
80 | |||
81 | /* Create the eBPF kernel code path name. | ||
82 | * This follows the pattern of all of the other bpf samples | ||
83 | */ | ||
84 | snprintf(bpf_file, sizeof(bpf_file), "%s_kern.o", argv[0]); | ||
85 | |||
86 | /* Do one final dump when exiting */ | ||
87 | signal(SIGINT, dump_exit); | ||
88 | signal(SIGTERM, dump_exit); | ||
89 | |||
90 | while ((opt = getopt_long(argc, argv, "hd:rSw", | ||
91 | long_options, &longindex)) != -1) { | ||
92 | switch (opt) { | ||
93 | case 'd': | ||
94 | delay = strtoul(optarg, NULL, 0); | ||
95 | if (delay == ULONG_MAX || delay == 0 || | ||
96 | delay > 1000000) { | ||
97 | fprintf(stderr, "ERROR: invalid delay : %s\n", | ||
98 | optarg); | ||
99 | usage(argv[0]); | ||
100 | return 1; | ||
101 | } | ||
102 | break; | ||
103 | default: | ||
104 | case 'h': | ||
105 | usage(argv[0]); | ||
106 | return 1; | ||
107 | } | ||
108 | } | ||
109 | |||
110 | if (load_bpf_file(bpf_file)) { | ||
111 | fprintf(stderr, "ERROR: failed to load eBPF from file : %s\n", | ||
112 | bpf_file); | ||
113 | return 1; | ||
114 | } | ||
115 | |||
116 | while (1) { | ||
117 | sleep(delay); | ||
118 | dump_all_counts(); | ||
119 | } | ||
120 | |||
121 | return 0; | ||
122 | } | ||
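ibumad_user.c still loads its object through the legacy bpf_load.h helper (load_bpf_file() plus the implicit global map_fd[]). A hedged sketch of how the same loader could be written against the libbpf bpf_object API instead, following the pattern lathist_user.c uses later in this patch; the map and program names come from ibumad_kern.c above, and error-path cleanup is omitted for brevity.

```c
/* Sketch: loading ibumad_kern.o with the libbpf object API instead of
 * bpf_load.h, mirroring the approach of lathist_user.c below.
 */
#include <bpf/libbpf.h>
#include <bpf/bpf.h>

static int load_ibumad(int map_fd[2])
{
	struct bpf_program *prog;
	struct bpf_object *obj;

	obj = bpf_object__open_file("ibumad_kern.o", NULL);
	if (libbpf_get_error(obj))
		return -1;

	if (bpf_object__load(obj))
		return -1;

	map_fd[0] = bpf_object__find_map_fd_by_name(obj, "read_count");
	map_fd[1] = bpf_object__find_map_fd_by_name(obj, "write_count");
	if (map_fd[0] < 0 || map_fd[1] < 0)
		return -1;

	/* attach every tracepoint program found in the object */
	bpf_object__for_each_program(prog, obj) {
		struct bpf_link *link = bpf_program__attach(prog);

		if (libbpf_get_error(link))
			return -1;
		/* links are intentionally leaked in this sketch; a real
		 * program would keep and destroy them as lathist_user.c does */
	}
	return 0;
}
```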
diff --git a/samples/bpf/lathist_kern.c b/samples/bpf/lathist_kern.c new file mode 100644 index 000000000..4adfcbbe6 --- /dev/null +++ b/samples/bpf/lathist_kern.c | |||
@@ -0,0 +1,99 @@ | |||
1 | /* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com | ||
2 | * Copyright (c) 2015 BMW Car IT GmbH | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of version 2 of the GNU General Public | ||
6 | * License as published by the Free Software Foundation. | ||
7 | */ | ||
8 | #include <linux/version.h> | ||
9 | #include <linux/ptrace.h> | ||
10 | #include <uapi/linux/bpf.h> | ||
11 | #include <bpf/bpf_helpers.h> | ||
12 | |||
13 | #define MAX_ENTRIES 20 | ||
14 | #define MAX_CPU 4 | ||
15 | |||
16 | /* We need to stick to statically allocated memory (an array instead of | ||
17 | * a hash table) because managing dynamic memory from the | ||
18 | * trace_preempt_[on|off] tracepoint hooks is not supported. | ||
19 | */ | ||
20 | |||
21 | struct { | ||
22 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
23 | __type(key, int); | ||
24 | __type(value, u64); | ||
25 | __uint(max_entries, MAX_CPU); | ||
26 | } my_map SEC(".maps"); | ||
27 | |||
28 | SEC("kprobe/trace_preempt_off") | ||
29 | int bpf_prog1(struct pt_regs *ctx) | ||
30 | { | ||
31 | int cpu = bpf_get_smp_processor_id(); | ||
32 | u64 *ts = bpf_map_lookup_elem(&my_map, &cpu); | ||
33 | |||
34 | if (ts) | ||
35 | *ts = bpf_ktime_get_ns(); | ||
36 | |||
37 | return 0; | ||
38 | } | ||
39 | |||
40 | static unsigned int log2(unsigned int v) | ||
41 | { | ||
42 | unsigned int r; | ||
43 | unsigned int shift; | ||
44 | |||
45 | r = (v > 0xFFFF) << 4; v >>= r; | ||
46 | shift = (v > 0xFF) << 3; v >>= shift; r |= shift; | ||
47 | shift = (v > 0xF) << 2; v >>= shift; r |= shift; | ||
48 | shift = (v > 0x3) << 1; v >>= shift; r |= shift; | ||
49 | r |= (v >> 1); | ||
50 | |||
51 | return r; | ||
52 | } | ||
53 | |||
54 | static unsigned int log2l(unsigned long v) | ||
55 | { | ||
56 | unsigned int hi = v >> 32; | ||
57 | |||
58 | if (hi) | ||
59 | return log2(hi) + 32; | ||
60 | else | ||
61 | return log2(v); | ||
62 | } | ||
63 | |||
64 | struct { | ||
65 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
66 | __type(key, int); | ||
67 | __type(value, long); | ||
68 | __uint(max_entries, MAX_CPU * MAX_ENTRIES); | ||
69 | } my_lat SEC(".maps"); | ||
70 | |||
71 | SEC("kprobe/trace_preempt_on") | ||
72 | int bpf_prog2(struct pt_regs *ctx) | ||
73 | { | ||
74 | u64 *ts, cur_ts, delta; | ||
75 | int key, cpu; | ||
76 | long *val; | ||
77 | |||
78 | cpu = bpf_get_smp_processor_id(); | ||
79 | ts = bpf_map_lookup_elem(&my_map, &cpu); | ||
80 | if (!ts) | ||
81 | return 0; | ||
82 | |||
83 | cur_ts = bpf_ktime_get_ns(); | ||
84 | delta = log2l(cur_ts - *ts); | ||
85 | |||
86 | if (delta > MAX_ENTRIES - 1) | ||
87 | delta = MAX_ENTRIES - 1; | ||
88 | |||
89 | key = cpu * MAX_ENTRIES + delta; | ||
90 | val = bpf_map_lookup_elem(&my_lat, &key); | ||
91 | if (val) | ||
92 | __sync_fetch_and_add((long *)val, 1); | ||
93 | |||
94 | return 0; | ||
95 | |||
96 | } | ||
97 | |||
98 | char _license[] SEC("license") = "GPL"; | ||
99 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
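The log2l() bucketing in bpf_prog2() turns a nanosecond preempt-disabled interval into one of MAX_ENTRIES power-of-two buckets per CPU; lathist_user.c later prints bucket d as the range [1 << d, (1 << (d + 1)) - 1]. A small standalone worked example of where a given latency lands (using a plain loop instead of the branchless log2 above):

```c
/* Worked example of the bucketing done by bpf_prog2() in lathist_kern.c. */
#include <stdio.h>

#define MAX_ENTRIES 20	/* must match lathist_kern.c */

static unsigned int floor_log2(unsigned long long v)
{
	unsigned int r = 0;

	while (v >>= 1)		/* plain loop; the BPF side uses a branchless version */
		r++;
	return r;
}

int main(void)
{
	unsigned long long delta_ns = 3000;
	unsigned int bucket = floor_log2(delta_ns);

	if (bucket > MAX_ENTRIES - 1)
		bucket = MAX_ENTRIES - 1;	/* clamp, same as bpf_prog2() */

	/* prints: 3000ns -> bucket 11 (range 2048..4095) */
	printf("%lluns -> bucket %u (range %llu..%llu)\n",
	       delta_ns, bucket, 1ULL << bucket, (1ULL << (bucket + 1)) - 1);
	return 0;
}
```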
diff --git a/samples/bpf/lathist_user.c b/samples/bpf/lathist_user.c new file mode 100644 index 000000000..7d8ff2418 --- /dev/null +++ b/samples/bpf/lathist_user.c | |||
@@ -0,0 +1,130 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com | ||
3 | * Copyright (c) 2015 BMW Car IT GmbH | ||
4 | */ | ||
5 | #include <stdio.h> | ||
6 | #include <unistd.h> | ||
7 | #include <stdlib.h> | ||
8 | #include <signal.h> | ||
9 | #include <bpf/libbpf.h> | ||
10 | #include <bpf/bpf.h> | ||
11 | |||
12 | #define MAX_ENTRIES 20 | ||
13 | #define MAX_CPU 4 | ||
14 | #define MAX_STARS 40 | ||
15 | |||
16 | struct cpu_hist { | ||
17 | long data[MAX_ENTRIES]; | ||
18 | long max; | ||
19 | }; | ||
20 | |||
21 | static struct cpu_hist cpu_hist[MAX_CPU]; | ||
22 | |||
23 | static void stars(char *str, long val, long max, int width) | ||
24 | { | ||
25 | int i; | ||
26 | |||
27 | for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++) | ||
28 | str[i] = '*'; | ||
29 | if (val > max) | ||
30 | str[i - 1] = '+'; | ||
31 | str[i] = '\0'; | ||
32 | } | ||
33 | |||
34 | static void print_hist(void) | ||
35 | { | ||
36 | char starstr[MAX_STARS]; | ||
37 | struct cpu_hist *hist; | ||
38 | int i, j; | ||
39 | |||
40 | /* clear screen */ | ||
41 | printf("\033[2J"); | ||
42 | |||
43 | for (j = 0; j < MAX_CPU; j++) { | ||
44 | hist = &cpu_hist[j]; | ||
45 | |||
46 | /* ignore CPUs without data (maybe offline?) */ | ||
47 | if (hist->max == 0) | ||
48 | continue; | ||
49 | |||
50 | printf("CPU %d\n", j); | ||
51 | printf(" latency : count distribution\n"); | ||
52 | for (i = 1; i <= MAX_ENTRIES; i++) { | ||
53 | stars(starstr, hist->data[i - 1], hist->max, MAX_STARS); | ||
54 | printf("%8ld -> %-8ld : %-8ld |%-*s|\n", | ||
55 | (1l << i) >> 1, (1l << i) - 1, | ||
56 | hist->data[i - 1], MAX_STARS, starstr); | ||
57 | } | ||
58 | } | ||
59 | } | ||
60 | |||
61 | static void get_data(int fd) | ||
62 | { | ||
63 | long key, value; | ||
64 | int c, i; | ||
65 | |||
66 | for (i = 0; i < MAX_CPU; i++) | ||
67 | cpu_hist[i].max = 0; | ||
68 | |||
69 | for (c = 0; c < MAX_CPU; c++) { | ||
70 | for (i = 0; i < MAX_ENTRIES; i++) { | ||
71 | key = c * MAX_ENTRIES + i; | ||
72 | bpf_map_lookup_elem(fd, &key, &value); | ||
73 | |||
74 | cpu_hist[c].data[i] = value; | ||
75 | if (value > cpu_hist[c].max) | ||
76 | cpu_hist[c].max = value; | ||
77 | } | ||
78 | } | ||
79 | } | ||
80 | |||
81 | int main(int argc, char **argv) | ||
82 | { | ||
83 | struct bpf_link *links[2]; | ||
84 | struct bpf_program *prog; | ||
85 | struct bpf_object *obj; | ||
86 | char filename[256]; | ||
87 | int map_fd, i = 0; | ||
88 | |||
89 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
90 | obj = bpf_object__open_file(filename, NULL); | ||
91 | if (libbpf_get_error(obj)) { | ||
92 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
93 | return 0; | ||
94 | } | ||
95 | |||
96 | /* load BPF program */ | ||
97 | if (bpf_object__load(obj)) { | ||
98 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
99 | goto cleanup; | ||
100 | } | ||
101 | |||
102 | map_fd = bpf_object__find_map_fd_by_name(obj, "my_lat"); | ||
103 | if (map_fd < 0) { | ||
104 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
105 | goto cleanup; | ||
106 | } | ||
107 | |||
108 | bpf_object__for_each_program(prog, obj) { | ||
109 | links[i] = bpf_program__attach(prog); | ||
110 | if (libbpf_get_error(links[i])) { | ||
111 | fprintf(stderr, "ERROR: bpf_program__attach failed\n"); | ||
112 | links[i] = NULL; | ||
113 | goto cleanup; | ||
114 | } | ||
115 | i++; | ||
116 | } | ||
117 | |||
118 | while (1) { | ||
119 | get_data(map_fd); | ||
120 | print_hist(); | ||
121 | sleep(5); | ||
122 | } | ||
123 | |||
124 | cleanup: | ||
125 | for (i--; i >= 0; i--) | ||
126 | bpf_link__destroy(links[i]); | ||
127 | |||
128 | bpf_object__close(obj); | ||
129 | return 0; | ||
130 | } | ||
diff --git a/samples/bpf/lwt_len_hist.sh b/samples/bpf/lwt_len_hist.sh new file mode 100755 index 000000000..0eda9754f --- /dev/null +++ b/samples/bpf/lwt_len_hist.sh | |||
@@ -0,0 +1,40 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | |||
4 | NS1=lwt_ns1 | ||
5 | VETH0=tst_lwt1a | ||
6 | VETH1=tst_lwt1b | ||
7 | |||
8 | TRACE_ROOT=/sys/kernel/debug/tracing | ||
9 | |||
10 | function cleanup { | ||
11 | # To reset saved histogram, remove pinned map | ||
12 | rm /sys/fs/bpf/tc/globals/lwt_len_hist_map | ||
13 | ip route del 192.168.253.2/32 dev $VETH0 2> /dev/null | ||
14 | ip link del $VETH0 2> /dev/null | ||
15 | ip link del $VETH1 2> /dev/null | ||
16 | ip netns exec $NS1 killall netserver | ||
17 | ip netns delete $NS1 2> /dev/null | ||
18 | } | ||
19 | |||
20 | cleanup | ||
21 | |||
22 | ip netns add $NS1 | ||
23 | ip link add $VETH0 type veth peer name $VETH1 | ||
24 | ip link set dev $VETH0 up | ||
25 | ip addr add 192.168.253.1/24 dev $VETH0 | ||
26 | ip link set $VETH1 netns $NS1 | ||
27 | ip netns exec $NS1 ip link set dev $VETH1 up | ||
28 | ip netns exec $NS1 ip addr add 192.168.253.2/24 dev $VETH1 | ||
29 | ip netns exec $NS1 netserver | ||
30 | |||
31 | echo 1 > ${TRACE_ROOT}/tracing_on | ||
32 | cp /dev/null ${TRACE_ROOT}/trace | ||
33 | ip route add 192.168.253.2/32 encap bpf out obj lwt_len_hist_kern.o section len_hist dev $VETH0 | ||
34 | netperf -H 192.168.253.2 -t TCP_STREAM | ||
35 | cat ${TRACE_ROOT}/trace | grep -v '^#' | ||
36 | ./lwt_len_hist | ||
37 | cleanup | ||
38 | echo 0 > ${TRACE_ROOT}/tracing_on | ||
39 | |||
40 | exit 0 | ||
diff --git a/samples/bpf/lwt_len_hist_kern.c b/samples/bpf/lwt_len_hist_kern.c new file mode 100644 index 000000000..9ed63e10e --- /dev/null +++ b/samples/bpf/lwt_len_hist_kern.c | |||
@@ -0,0 +1,82 @@ | |||
1 | /* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | * | ||
7 | * This program is distributed in the hope that it will be useful, but | ||
8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
10 | * General Public License for more details. | ||
11 | */ | ||
12 | |||
13 | #include <uapi/linux/bpf.h> | ||
14 | #include <uapi/linux/if_ether.h> | ||
15 | #include <uapi/linux/ip.h> | ||
16 | #include <uapi/linux/in.h> | ||
17 | #include <bpf/bpf_helpers.h> | ||
18 | |||
19 | # define printk(fmt, ...) \ | ||
20 | ({ \ | ||
21 | char ____fmt[] = fmt; \ | ||
22 | bpf_trace_printk(____fmt, sizeof(____fmt), \ | ||
23 | ##__VA_ARGS__); \ | ||
24 | }) | ||
25 | |||
26 | struct bpf_elf_map { | ||
27 | __u32 type; | ||
28 | __u32 size_key; | ||
29 | __u32 size_value; | ||
30 | __u32 max_elem; | ||
31 | __u32 flags; | ||
32 | __u32 id; | ||
33 | __u32 pinning; | ||
34 | }; | ||
35 | |||
36 | struct bpf_elf_map SEC("maps") lwt_len_hist_map = { | ||
37 | .type = BPF_MAP_TYPE_PERCPU_HASH, | ||
38 | .size_key = sizeof(__u64), | ||
39 | .size_value = sizeof(__u64), | ||
40 | .pinning = 2, | ||
41 | .max_elem = 1024, | ||
42 | }; | ||
43 | |||
44 | static unsigned int log2(unsigned int v) | ||
45 | { | ||
46 | unsigned int r; | ||
47 | unsigned int shift; | ||
48 | |||
49 | r = (v > 0xFFFF) << 4; v >>= r; | ||
50 | shift = (v > 0xFF) << 3; v >>= shift; r |= shift; | ||
51 | shift = (v > 0xF) << 2; v >>= shift; r |= shift; | ||
52 | shift = (v > 0x3) << 1; v >>= shift; r |= shift; | ||
53 | r |= (v >> 1); | ||
54 | return r; | ||
55 | } | ||
56 | |||
57 | static unsigned int log2l(unsigned long v) | ||
58 | { | ||
59 | unsigned int hi = v >> 32; | ||
60 | if (hi) | ||
61 | return log2(hi) + 32; | ||
62 | else | ||
63 | return log2(v); | ||
64 | } | ||
65 | |||
66 | SEC("len_hist") | ||
67 | int do_len_hist(struct __sk_buff *skb) | ||
68 | { | ||
69 | __u64 *value, key, init_val = 1; | ||
70 | |||
71 | key = log2l(skb->len); | ||
72 | |||
73 | value = bpf_map_lookup_elem(&lwt_len_hist_map, &key); | ||
74 | if (value) | ||
75 | __sync_fetch_and_add(value, 1); | ||
76 | else | ||
77 | bpf_map_update_elem(&lwt_len_hist_map, &key, &init_val, BPF_ANY); | ||
78 | |||
79 | return BPF_OK; | ||
80 | } | ||
81 | |||
82 | char _license[] SEC("license") = "GPL"; | ||
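This file declares its map through the iproute2-style `struct bpf_elf_map` with `pinning = 2`, which is how it ends up under /sys/fs/bpf/tc/globals where lwt_len_hist.sh and lwt_len_hist_user.c expect it. For comparison, a hedged sketch of a BTF-style declaration that pins a map by name under libbpf's convention; this is only an illustration of the equivalent idea, not a drop-in replacement, since libbpf's default pin root differs from iproute2's tc/globals directory.

```c
/* Illustrative libbpf-style counterpart of lwt_len_hist_map above.
 * LIBBPF_PIN_BY_NAME pins under the object's pin root (by default
 * /sys/fs/bpf), not /sys/fs/bpf/tc/globals, so this is a sketch of the
 * idea rather than a drop-in replacement for the bpf_elf_map form.
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
	__type(key, __u64);	/* log2 of skb->len */
	__type(value, __u64);	/* packet count for that bucket */
	__uint(max_entries, 1024);
	__uint(pinning, LIBBPF_PIN_BY_NAME);
} lwt_len_hist_map SEC(".maps");
```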
diff --git a/samples/bpf/lwt_len_hist_user.c b/samples/bpf/lwt_len_hist_user.c new file mode 100644 index 000000000..430a4b7e3 --- /dev/null +++ b/samples/bpf/lwt_len_hist_user.c | |||
@@ -0,0 +1,77 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <linux/unistd.h> | ||
3 | #include <linux/bpf.h> | ||
4 | |||
5 | #include <stdlib.h> | ||
6 | #include <stdio.h> | ||
7 | #include <unistd.h> | ||
8 | #include <string.h> | ||
9 | #include <errno.h> | ||
10 | #include <arpa/inet.h> | ||
11 | |||
12 | #include <bpf/bpf.h> | ||
13 | #include "bpf_util.h" | ||
14 | |||
15 | #define MAX_INDEX 64 | ||
16 | #define MAX_STARS 38 | ||
17 | |||
18 | static void stars(char *str, long val, long max, int width) | ||
19 | { | ||
20 | int i; | ||
21 | |||
22 | for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++) | ||
23 | str[i] = '*'; | ||
24 | if (val > max) | ||
25 | str[i - 1] = '+'; | ||
26 | str[i] = '\0'; | ||
27 | } | ||
28 | |||
29 | int main(int argc, char **argv) | ||
30 | { | ||
31 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
32 | const char *map_filename = "/sys/fs/bpf/tc/globals/lwt_len_hist_map"; | ||
33 | uint64_t values[nr_cpus], sum, max_value = 0, data[MAX_INDEX] = {}; | ||
34 | uint64_t key = 0, next_key, max_key = 0; | ||
35 | char starstr[MAX_STARS]; | ||
36 | int i, map_fd; | ||
37 | |||
38 | map_fd = bpf_obj_get(map_filename); | ||
39 | if (map_fd < 0) { | ||
40 | fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n", | ||
41 | map_filename, strerror(errno), errno); | ||
42 | return -1; | ||
43 | } | ||
44 | |||
45 | while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) { | ||
46 | if (next_key >= MAX_INDEX) { | ||
47 | fprintf(stderr, "Key %lu out of bounds\n", next_key); | ||
48 | continue; | ||
49 | } | ||
50 | |||
51 | bpf_map_lookup_elem(map_fd, &next_key, values); | ||
52 | |||
53 | sum = 0; | ||
54 | for (i = 0; i < nr_cpus; i++) | ||
55 | sum += values[i]; | ||
56 | |||
57 | data[next_key] = sum; | ||
58 | if (sum && next_key > max_key) | ||
59 | max_key = next_key; | ||
60 | |||
61 | if (sum > max_value) | ||
62 | max_value = sum; | ||
63 | |||
64 | key = next_key; | ||
65 | } | ||
66 | |||
67 | for (i = 1; i <= max_key + 1; i++) { | ||
68 | stars(starstr, data[i - 1], max_value, MAX_STARS); | ||
69 | printf("%8ld -> %-8ld : %-8ld |%-*s|\n", | ||
70 | (1l << i) >> 1, (1l << i) - 1, data[i - 1], | ||
71 | MAX_STARS, starstr); | ||
72 | } | ||
73 | |||
74 | close(map_fd); | ||
75 | |||
76 | return 0; | ||
77 | } | ||
diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c new file mode 100644 index 000000000..8773f22b6 --- /dev/null +++ b/samples/bpf/map_perf_test_kern.c | |||
@@ -0,0 +1,291 @@ | |||
1 | /* Copyright (c) 2016 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <linux/skbuff.h> | ||
8 | #include <linux/netdevice.h> | ||
9 | #include <linux/version.h> | ||
10 | #include <uapi/linux/bpf.h> | ||
11 | #include <bpf/bpf_helpers.h> | ||
12 | #include <bpf/bpf_tracing.h> | ||
13 | #include <bpf/bpf_core_read.h> | ||
14 | #include "trace_common.h" | ||
15 | |||
16 | #define MAX_ENTRIES 1000 | ||
17 | #define MAX_NR_CPUS 1024 | ||
18 | |||
19 | struct { | ||
20 | __uint(type, BPF_MAP_TYPE_HASH); | ||
21 | __type(key, u32); | ||
22 | __type(value, long); | ||
23 | __uint(max_entries, MAX_ENTRIES); | ||
24 | } hash_map SEC(".maps"); | ||
25 | |||
26 | struct { | ||
27 | __uint(type, BPF_MAP_TYPE_LRU_HASH); | ||
28 | __type(key, u32); | ||
29 | __type(value, long); | ||
30 | __uint(max_entries, 10000); | ||
31 | } lru_hash_map SEC(".maps"); | ||
32 | |||
33 | struct { | ||
34 | __uint(type, BPF_MAP_TYPE_LRU_HASH); | ||
35 | __type(key, u32); | ||
36 | __type(value, long); | ||
37 | __uint(max_entries, 10000); | ||
38 | __uint(map_flags, BPF_F_NO_COMMON_LRU); | ||
39 | } nocommon_lru_hash_map SEC(".maps"); | ||
40 | |||
41 | struct inner_lru { | ||
42 | __uint(type, BPF_MAP_TYPE_LRU_HASH); | ||
43 | __type(key, u32); | ||
44 | __type(value, long); | ||
45 | __uint(max_entries, MAX_ENTRIES); | ||
46 | __uint(map_flags, BPF_F_NUMA_NODE); | ||
47 | __uint(numa_node, 0); | ||
48 | } inner_lru_hash_map SEC(".maps"); | ||
49 | |||
50 | struct { | ||
51 | __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); | ||
52 | __uint(max_entries, MAX_NR_CPUS); | ||
53 | __uint(key_size, sizeof(u32)); | ||
54 | __array(values, struct inner_lru); /* use inner_lru as inner map */ | ||
55 | } array_of_lru_hashs SEC(".maps") = { | ||
56 | /* statically initialize the first element */ | ||
57 | .values = { &inner_lru_hash_map }, | ||
58 | }; | ||
59 | |||
60 | struct { | ||
61 | __uint(type, BPF_MAP_TYPE_PERCPU_HASH); | ||
62 | __uint(key_size, sizeof(u32)); | ||
63 | __uint(value_size, sizeof(long)); | ||
64 | __uint(max_entries, MAX_ENTRIES); | ||
65 | } percpu_hash_map SEC(".maps"); | ||
66 | |||
67 | struct { | ||
68 | __uint(type, BPF_MAP_TYPE_HASH); | ||
69 | __type(key, u32); | ||
70 | __type(value, long); | ||
71 | __uint(max_entries, MAX_ENTRIES); | ||
72 | __uint(map_flags, BPF_F_NO_PREALLOC); | ||
73 | } hash_map_alloc SEC(".maps"); | ||
74 | |||
75 | struct { | ||
76 | __uint(type, BPF_MAP_TYPE_PERCPU_HASH); | ||
77 | __uint(key_size, sizeof(u32)); | ||
78 | __uint(value_size, sizeof(long)); | ||
79 | __uint(max_entries, MAX_ENTRIES); | ||
80 | __uint(map_flags, BPF_F_NO_PREALLOC); | ||
81 | } percpu_hash_map_alloc SEC(".maps"); | ||
82 | |||
83 | struct { | ||
84 | __uint(type, BPF_MAP_TYPE_LPM_TRIE); | ||
85 | __uint(key_size, 8); | ||
86 | __uint(value_size, sizeof(long)); | ||
87 | __uint(max_entries, 10000); | ||
88 | __uint(map_flags, BPF_F_NO_PREALLOC); | ||
89 | } lpm_trie_map_alloc SEC(".maps"); | ||
90 | |||
91 | struct { | ||
92 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
93 | __type(key, u32); | ||
94 | __type(value, long); | ||
95 | __uint(max_entries, MAX_ENTRIES); | ||
96 | } array_map SEC(".maps"); | ||
97 | |||
98 | struct { | ||
99 | __uint(type, BPF_MAP_TYPE_LRU_HASH); | ||
100 | __type(key, u32); | ||
101 | __type(value, long); | ||
102 | __uint(max_entries, MAX_ENTRIES); | ||
103 | } lru_hash_lookup_map SEC(".maps"); | ||
104 | |||
105 | SEC("kprobe/" SYSCALL(sys_getuid)) | ||
106 | int stress_hmap(struct pt_regs *ctx) | ||
107 | { | ||
108 | u32 key = bpf_get_current_pid_tgid(); | ||
109 | long init_val = 1; | ||
110 | long *value; | ||
111 | |||
112 | bpf_map_update_elem(&hash_map, &key, &init_val, BPF_ANY); | ||
113 | value = bpf_map_lookup_elem(&hash_map, &key); | ||
114 | if (value) | ||
115 | bpf_map_delete_elem(&hash_map, &key); | ||
116 | |||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | SEC("kprobe/" SYSCALL(sys_geteuid)) | ||
121 | int stress_percpu_hmap(struct pt_regs *ctx) | ||
122 | { | ||
123 | u32 key = bpf_get_current_pid_tgid(); | ||
124 | long init_val = 1; | ||
125 | long *value; | ||
126 | |||
127 | bpf_map_update_elem(&percpu_hash_map, &key, &init_val, BPF_ANY); | ||
128 | value = bpf_map_lookup_elem(&percpu_hash_map, &key); | ||
129 | if (value) | ||
130 | bpf_map_delete_elem(&percpu_hash_map, &key); | ||
131 | return 0; | ||
132 | } | ||
133 | |||
134 | SEC("kprobe/" SYSCALL(sys_getgid)) | ||
135 | int stress_hmap_alloc(struct pt_regs *ctx) | ||
136 | { | ||
137 | u32 key = bpf_get_current_pid_tgid(); | ||
138 | long init_val = 1; | ||
139 | long *value; | ||
140 | |||
141 | bpf_map_update_elem(&hash_map_alloc, &key, &init_val, BPF_ANY); | ||
142 | value = bpf_map_lookup_elem(&hash_map_alloc, &key); | ||
143 | if (value) | ||
144 | bpf_map_delete_elem(&hash_map_alloc, &key); | ||
145 | return 0; | ||
146 | } | ||
147 | |||
148 | SEC("kprobe/" SYSCALL(sys_getegid)) | ||
149 | int stress_percpu_hmap_alloc(struct pt_regs *ctx) | ||
150 | { | ||
151 | u32 key = bpf_get_current_pid_tgid(); | ||
152 | long init_val = 1; | ||
153 | long *value; | ||
154 | |||
155 | bpf_map_update_elem(&percpu_hash_map_alloc, &key, &init_val, BPF_ANY); | ||
156 | value = bpf_map_lookup_elem(&percpu_hash_map_alloc, &key); | ||
157 | if (value) | ||
158 | bpf_map_delete_elem(&percpu_hash_map_alloc, &key); | ||
159 | return 0; | ||
160 | } | ||
161 | |||
162 | SEC("kprobe/" SYSCALL(sys_connect)) | ||
163 | int stress_lru_hmap_alloc(struct pt_regs *ctx) | ||
164 | { | ||
165 | struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx); | ||
166 | char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%dn"; | ||
167 | union { | ||
168 | u16 dst6[8]; | ||
169 | struct { | ||
170 | u16 magic0; | ||
171 | u16 magic1; | ||
172 | u16 tcase; | ||
173 | u16 unused16; | ||
174 | u32 unused32; | ||
175 | u32 key; | ||
176 | }; | ||
177 | } test_params; | ||
178 | struct sockaddr_in6 *in6; | ||
179 | u16 test_case; | ||
180 | int addrlen, ret; | ||
181 | long val = 1; | ||
182 | u32 key = 0; | ||
183 | |||
184 | in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(real_regs); | ||
185 | addrlen = (int)PT_REGS_PARM3_CORE(real_regs); | ||
186 | |||
187 | if (addrlen != sizeof(*in6)) | ||
188 | return 0; | ||
189 | |||
190 | ret = bpf_probe_read_user(test_params.dst6, sizeof(test_params.dst6), | ||
191 | &in6->sin6_addr); | ||
192 | if (ret) | ||
193 | goto done; | ||
194 | |||
195 | if (test_params.magic0 != 0xdead || | ||
196 | test_params.magic1 != 0xbeef) | ||
197 | return 0; | ||
198 | |||
199 | test_case = test_params.tcase; | ||
200 | if (test_case != 3) | ||
201 | key = bpf_get_prandom_u32(); | ||
202 | |||
203 | if (test_case == 0) { | ||
204 | ret = bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY); | ||
205 | } else if (test_case == 1) { | ||
206 | ret = bpf_map_update_elem(&nocommon_lru_hash_map, &key, &val, | ||
207 | BPF_ANY); | ||
208 | } else if (test_case == 2) { | ||
209 | void *nolocal_lru_map; | ||
210 | int cpu = bpf_get_smp_processor_id(); | ||
211 | |||
212 | nolocal_lru_map = bpf_map_lookup_elem(&array_of_lru_hashs, | ||
213 | &cpu); | ||
214 | if (!nolocal_lru_map) { | ||
215 | ret = -ENOENT; | ||
216 | goto done; | ||
217 | } | ||
218 | |||
219 | ret = bpf_map_update_elem(nolocal_lru_map, &key, &val, | ||
220 | BPF_ANY); | ||
221 | } else if (test_case == 3) { | ||
222 | u32 i; | ||
223 | |||
224 | key = test_params.key; | ||
225 | |||
226 | #pragma clang loop unroll(full) | ||
227 | for (i = 0; i < 32; i++) { | ||
228 | bpf_map_lookup_elem(&lru_hash_lookup_map, &key); | ||
229 | key++; | ||
230 | } | ||
231 | } else { | ||
232 | ret = -EINVAL; | ||
233 | } | ||
234 | |||
235 | done: | ||
236 | if (ret) | ||
237 | bpf_trace_printk(fmt, sizeof(fmt), ret); | ||
238 | |||
239 | return 0; | ||
240 | } | ||
241 | |||
242 | SEC("kprobe/" SYSCALL(sys_gettid)) | ||
243 | int stress_lpm_trie_map_alloc(struct pt_regs *ctx) | ||
244 | { | ||
245 | union { | ||
246 | u32 b32[2]; | ||
247 | u8 b8[8]; | ||
248 | } key; | ||
249 | unsigned int i; | ||
250 | |||
251 | key.b32[0] = 32; | ||
252 | key.b8[4] = 192; | ||
253 | key.b8[5] = 168; | ||
254 | key.b8[6] = 0; | ||
255 | key.b8[7] = 1; | ||
256 | |||
257 | #pragma clang loop unroll(full) | ||
258 | for (i = 0; i < 32; ++i) | ||
259 | bpf_map_lookup_elem(&lpm_trie_map_alloc, &key); | ||
260 | |||
261 | return 0; | ||
262 | } | ||
263 | |||
264 | SEC("kprobe/" SYSCALL(sys_getpgid)) | ||
265 | int stress_hash_map_lookup(struct pt_regs *ctx) | ||
266 | { | ||
267 | u32 key = 1, i; | ||
268 | long *value; | ||
269 | |||
270 | #pragma clang loop unroll(full) | ||
271 | for (i = 0; i < 64; ++i) | ||
272 | value = bpf_map_lookup_elem(&hash_map, &key); | ||
273 | |||
274 | return 0; | ||
275 | } | ||
276 | |||
277 | SEC("kprobe/" SYSCALL(sys_getppid)) | ||
278 | int stress_array_map_lookup(struct pt_regs *ctx) | ||
279 | { | ||
280 | u32 key = 1, i; | ||
281 | long *value; | ||
282 | |||
283 | #pragma clang loop unroll(full) | ||
284 | for (i = 0; i < 64; ++i) | ||
285 | value = bpf_map_lookup_elem(&array_map, &key); | ||
286 | |||
287 | return 0; | ||
288 | } | ||
289 | |||
290 | char _license[] SEC("license") = "GPL"; | ||
291 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c new file mode 100644 index 000000000..8b13230b4 --- /dev/null +++ b/samples/bpf/map_perf_test_user.c | |||
@@ -0,0 +1,507 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2016 Facebook | ||
3 | */ | ||
4 | #define _GNU_SOURCE | ||
5 | #include <sched.h> | ||
6 | #include <stdio.h> | ||
7 | #include <sys/types.h> | ||
8 | #include <asm/unistd.h> | ||
9 | #include <unistd.h> | ||
10 | #include <assert.h> | ||
11 | #include <sys/wait.h> | ||
12 | #include <stdlib.h> | ||
13 | #include <signal.h> | ||
14 | #include <string.h> | ||
15 | #include <time.h> | ||
16 | #include <sys/resource.h> | ||
17 | #include <arpa/inet.h> | ||
18 | #include <errno.h> | ||
19 | |||
20 | #include <bpf/bpf.h> | ||
21 | #include <bpf/libbpf.h> | ||
22 | |||
23 | #define TEST_BIT(t) (1U << (t)) | ||
24 | #define MAX_NR_CPUS 1024 | ||
25 | |||
26 | static __u64 time_get_ns(void) | ||
27 | { | ||
28 | struct timespec ts; | ||
29 | |||
30 | clock_gettime(CLOCK_MONOTONIC, &ts); | ||
31 | return ts.tv_sec * 1000000000ull + ts.tv_nsec; | ||
32 | } | ||
33 | |||
34 | enum test_type { | ||
35 | HASH_PREALLOC, | ||
36 | PERCPU_HASH_PREALLOC, | ||
37 | HASH_KMALLOC, | ||
38 | PERCPU_HASH_KMALLOC, | ||
39 | LRU_HASH_PREALLOC, | ||
40 | NOCOMMON_LRU_HASH_PREALLOC, | ||
41 | LPM_KMALLOC, | ||
42 | HASH_LOOKUP, | ||
43 | ARRAY_LOOKUP, | ||
44 | INNER_LRU_HASH_PREALLOC, | ||
45 | LRU_HASH_LOOKUP, | ||
46 | NR_TESTS, | ||
47 | }; | ||
48 | |||
49 | const char *test_map_names[NR_TESTS] = { | ||
50 | [HASH_PREALLOC] = "hash_map", | ||
51 | [PERCPU_HASH_PREALLOC] = "percpu_hash_map", | ||
52 | [HASH_KMALLOC] = "hash_map_alloc", | ||
53 | [PERCPU_HASH_KMALLOC] = "percpu_hash_map_alloc", | ||
54 | [LRU_HASH_PREALLOC] = "lru_hash_map", | ||
55 | [NOCOMMON_LRU_HASH_PREALLOC] = "nocommon_lru_hash_map", | ||
56 | [LPM_KMALLOC] = "lpm_trie_map_alloc", | ||
57 | [HASH_LOOKUP] = "hash_map", | ||
58 | [ARRAY_LOOKUP] = "array_map", | ||
59 | [INNER_LRU_HASH_PREALLOC] = "inner_lru_hash_map", | ||
60 | [LRU_HASH_LOOKUP] = "lru_hash_lookup_map", | ||
61 | }; | ||
62 | |||
63 | enum map_idx { | ||
64 | array_of_lru_hashs_idx, | ||
65 | hash_map_alloc_idx, | ||
66 | lru_hash_lookup_idx, | ||
67 | NR_IDXES, | ||
68 | }; | ||
69 | |||
70 | static int map_fd[NR_IDXES]; | ||
71 | |||
72 | static int test_flags = ~0; | ||
73 | static uint32_t num_map_entries; | ||
74 | static uint32_t inner_lru_hash_size; | ||
75 | static int lru_hash_lookup_test_entries = 32; | ||
76 | static uint32_t max_cnt = 1000000; | ||
77 | |||
78 | static int check_test_flags(enum test_type t) | ||
79 | { | ||
80 | return test_flags & TEST_BIT(t); | ||
81 | } | ||
82 | |||
83 | static void test_hash_prealloc(int cpu) | ||
84 | { | ||
85 | __u64 start_time; | ||
86 | int i; | ||
87 | |||
88 | start_time = time_get_ns(); | ||
89 | for (i = 0; i < max_cnt; i++) | ||
90 | syscall(__NR_getuid); | ||
91 | printf("%d:hash_map_perf pre-alloc %lld events per sec\n", | ||
92 | cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); | ||
93 | } | ||
94 | |||
95 | static int pre_test_lru_hash_lookup(int tasks) | ||
96 | { | ||
97 | int fd = map_fd[lru_hash_lookup_idx]; | ||
98 | uint32_t key; | ||
99 | long val = 1; | ||
100 | int ret; | ||
101 | |||
102 | if (num_map_entries > lru_hash_lookup_test_entries) | ||
103 | lru_hash_lookup_test_entries = num_map_entries; | ||
104 | |||
105 | /* Populate the lru_hash_lookup_map for the LRU_HASH_LOOKUP perf test. | ||
106 | * | ||
107 | * It is fine if the user requests a map with | ||
108 | * num_map_entries < 32 and some of the later LRU hash lookups | ||
109 | * return not found. For an LRU map, we are not interested | ||
110 | * in the performance of such a small map. | ||
111 | */ | ||
112 | for (key = 0; key < lru_hash_lookup_test_entries; key++) { | ||
113 | ret = bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST); | ||
114 | if (ret) | ||
115 | return ret; | ||
116 | } | ||
117 | |||
118 | return 0; | ||
119 | } | ||
120 | |||
121 | static void do_test_lru(enum test_type test, int cpu) | ||
122 | { | ||
123 | static int inner_lru_map_fds[MAX_NR_CPUS]; | ||
124 | |||
125 | struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 }; | ||
126 | const char *test_name; | ||
127 | __u64 start_time; | ||
128 | int i, ret; | ||
129 | |||
130 | if (test == INNER_LRU_HASH_PREALLOC && cpu) { | ||
131 | /* If CPU is not 0, create an inner_lru hash map and insert its fd | ||
132 | * into the array_of_lru_hashs map. For CPU 0, | ||
133 | * 'inner_lru_hash_map' was statically inserted at map init time. | ||
134 | */ | ||
135 | int outer_fd = map_fd[array_of_lru_hashs_idx]; | ||
136 | unsigned int mycpu, mynode; | ||
137 | |||
138 | assert(cpu < MAX_NR_CPUS); | ||
139 | |||
140 | ret = syscall(__NR_getcpu, &mycpu, &mynode, NULL); | ||
141 | assert(!ret); | ||
142 | |||
143 | inner_lru_map_fds[cpu] = | ||
144 | bpf_create_map_node(BPF_MAP_TYPE_LRU_HASH, | ||
145 | test_map_names[INNER_LRU_HASH_PREALLOC], | ||
146 | sizeof(uint32_t), | ||
147 | sizeof(long), | ||
148 | inner_lru_hash_size, 0, | ||
149 | mynode); | ||
150 | if (inner_lru_map_fds[cpu] == -1) { | ||
151 | printf("cannot create BPF_MAP_TYPE_LRU_HASH %s(%d)\n", | ||
152 | strerror(errno), errno); | ||
153 | exit(1); | ||
154 | } | ||
155 | |||
156 | ret = bpf_map_update_elem(outer_fd, &cpu, | ||
157 | &inner_lru_map_fds[cpu], | ||
158 | BPF_ANY); | ||
159 | if (ret) { | ||
160 | printf("cannot update ARRAY_OF_LRU_HASHS with key:%u. %s(%d)\n", | ||
161 | cpu, strerror(errno), errno); | ||
162 | exit(1); | ||
163 | } | ||
164 | } | ||
165 | |||
166 | in6.sin6_addr.s6_addr16[0] = 0xdead; | ||
167 | in6.sin6_addr.s6_addr16[1] = 0xbeef; | ||
168 | |||
169 | if (test == LRU_HASH_PREALLOC) { | ||
170 | test_name = "lru_hash_map_perf"; | ||
171 | in6.sin6_addr.s6_addr16[2] = 0; | ||
172 | } else if (test == NOCOMMON_LRU_HASH_PREALLOC) { | ||
173 | test_name = "nocommon_lru_hash_map_perf"; | ||
174 | in6.sin6_addr.s6_addr16[2] = 1; | ||
175 | } else if (test == INNER_LRU_HASH_PREALLOC) { | ||
176 | test_name = "inner_lru_hash_map_perf"; | ||
177 | in6.sin6_addr.s6_addr16[2] = 2; | ||
178 | } else if (test == LRU_HASH_LOOKUP) { | ||
179 | test_name = "lru_hash_lookup_perf"; | ||
180 | in6.sin6_addr.s6_addr16[2] = 3; | ||
181 | in6.sin6_addr.s6_addr32[3] = 0; | ||
182 | } else { | ||
183 | assert(0); | ||
184 | } | ||
185 | |||
186 | start_time = time_get_ns(); | ||
187 | for (i = 0; i < max_cnt; i++) { | ||
188 | ret = connect(-1, (const struct sockaddr *)&in6, sizeof(in6)); | ||
189 | assert(ret == -1 && errno == EBADF); | ||
190 | if (in6.sin6_addr.s6_addr32[3] < | ||
191 | lru_hash_lookup_test_entries - 32) | ||
192 | in6.sin6_addr.s6_addr32[3] += 32; | ||
193 | else | ||
194 | in6.sin6_addr.s6_addr32[3] = 0; | ||
195 | } | ||
196 | printf("%d:%s pre-alloc %lld events per sec\n", | ||
197 | cpu, test_name, | ||
198 | max_cnt * 1000000000ll / (time_get_ns() - start_time)); | ||
199 | } | ||
200 | |||
201 | static void test_lru_hash_prealloc(int cpu) | ||
202 | { | ||
203 | do_test_lru(LRU_HASH_PREALLOC, cpu); | ||
204 | } | ||
205 | |||
206 | static void test_nocommon_lru_hash_prealloc(int cpu) | ||
207 | { | ||
208 | do_test_lru(NOCOMMON_LRU_HASH_PREALLOC, cpu); | ||
209 | } | ||
210 | |||
211 | static void test_inner_lru_hash_prealloc(int cpu) | ||
212 | { | ||
213 | do_test_lru(INNER_LRU_HASH_PREALLOC, cpu); | ||
214 | } | ||
215 | |||
216 | static void test_lru_hash_lookup(int cpu) | ||
217 | { | ||
218 | do_test_lru(LRU_HASH_LOOKUP, cpu); | ||
219 | } | ||
220 | |||
221 | static void test_percpu_hash_prealloc(int cpu) | ||
222 | { | ||
223 | __u64 start_time; | ||
224 | int i; | ||
225 | |||
226 | start_time = time_get_ns(); | ||
227 | for (i = 0; i < max_cnt; i++) | ||
228 | syscall(__NR_geteuid); | ||
229 | printf("%d:percpu_hash_map_perf pre-alloc %lld events per sec\n", | ||
230 | cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); | ||
231 | } | ||
232 | |||
233 | static void test_hash_kmalloc(int cpu) | ||
234 | { | ||
235 | __u64 start_time; | ||
236 | int i; | ||
237 | |||
238 | start_time = time_get_ns(); | ||
239 | for (i = 0; i < max_cnt; i++) | ||
240 | syscall(__NR_getgid); | ||
241 | printf("%d:hash_map_perf kmalloc %lld events per sec\n", | ||
242 | cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); | ||
243 | } | ||
244 | |||
245 | static void test_percpu_hash_kmalloc(int cpu) | ||
246 | { | ||
247 | __u64 start_time; | ||
248 | int i; | ||
249 | |||
250 | start_time = time_get_ns(); | ||
251 | for (i = 0; i < max_cnt; i++) | ||
252 | syscall(__NR_getegid); | ||
253 | printf("%d:percpu_hash_map_perf kmalloc %lld events per sec\n", | ||
254 | cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); | ||
255 | } | ||
256 | |||
257 | static void test_lpm_kmalloc(int cpu) | ||
258 | { | ||
259 | __u64 start_time; | ||
260 | int i; | ||
261 | |||
262 | start_time = time_get_ns(); | ||
263 | for (i = 0; i < max_cnt; i++) | ||
264 | syscall(__NR_gettid); | ||
265 | printf("%d:lpm_perf kmalloc %lld events per sec\n", | ||
266 | cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); | ||
267 | } | ||
268 | |||
269 | static void test_hash_lookup(int cpu) | ||
270 | { | ||
271 | __u64 start_time; | ||
272 | int i; | ||
273 | |||
274 | start_time = time_get_ns(); | ||
275 | for (i = 0; i < max_cnt; i++) | ||
276 | syscall(__NR_getpgid, 0); | ||
277 | printf("%d:hash_lookup %lld lookups per sec\n", | ||
278 | cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time)); | ||
279 | } | ||
280 | |||
281 | static void test_array_lookup(int cpu) | ||
282 | { | ||
283 | __u64 start_time; | ||
284 | int i; | ||
285 | |||
286 | start_time = time_get_ns(); | ||
287 | for (i = 0; i < max_cnt; i++) | ||
288 | syscall(__NR_getppid, 0); | ||
289 | printf("%d:array_lookup %lld lookups per sec\n", | ||
290 | cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time)); | ||
291 | } | ||
292 | |||
293 | typedef int (*pre_test_func)(int tasks); | ||
294 | const pre_test_func pre_test_funcs[] = { | ||
295 | [LRU_HASH_LOOKUP] = pre_test_lru_hash_lookup, | ||
296 | }; | ||
297 | |||
298 | typedef void (*test_func)(int cpu); | ||
299 | const test_func test_funcs[] = { | ||
300 | [HASH_PREALLOC] = test_hash_prealloc, | ||
301 | [PERCPU_HASH_PREALLOC] = test_percpu_hash_prealloc, | ||
302 | [HASH_KMALLOC] = test_hash_kmalloc, | ||
303 | [PERCPU_HASH_KMALLOC] = test_percpu_hash_kmalloc, | ||
304 | [LRU_HASH_PREALLOC] = test_lru_hash_prealloc, | ||
305 | [NOCOMMON_LRU_HASH_PREALLOC] = test_nocommon_lru_hash_prealloc, | ||
306 | [LPM_KMALLOC] = test_lpm_kmalloc, | ||
307 | [HASH_LOOKUP] = test_hash_lookup, | ||
308 | [ARRAY_LOOKUP] = test_array_lookup, | ||
309 | [INNER_LRU_HASH_PREALLOC] = test_inner_lru_hash_prealloc, | ||
310 | [LRU_HASH_LOOKUP] = test_lru_hash_lookup, | ||
311 | }; | ||
312 | |||
313 | static int pre_test(int tasks) | ||
314 | { | ||
315 | int i; | ||
316 | |||
317 | for (i = 0; i < NR_TESTS; i++) { | ||
318 | if (pre_test_funcs[i] && check_test_flags(i)) { | ||
319 | int ret = pre_test_funcs[i](tasks); | ||
320 | |||
321 | if (ret) | ||
322 | return ret; | ||
323 | } | ||
324 | } | ||
325 | |||
326 | return 0; | ||
327 | } | ||
328 | |||
329 | static void loop(int cpu) | ||
330 | { | ||
331 | cpu_set_t cpuset; | ||
332 | int i; | ||
333 | |||
334 | CPU_ZERO(&cpuset); | ||
335 | CPU_SET(cpu, &cpuset); | ||
336 | sched_setaffinity(0, sizeof(cpuset), &cpuset); | ||
337 | |||
338 | for (i = 0; i < NR_TESTS; i++) { | ||
339 | if (check_test_flags(i)) | ||
340 | test_funcs[i](cpu); | ||
341 | } | ||
342 | } | ||
343 | |||
344 | static void run_perf_test(int tasks) | ||
345 | { | ||
346 | pid_t pid[tasks]; | ||
347 | int i; | ||
348 | |||
349 | assert(!pre_test(tasks)); | ||
350 | |||
351 | for (i = 0; i < tasks; i++) { | ||
352 | pid[i] = fork(); | ||
353 | if (pid[i] == 0) { | ||
354 | loop(i); | ||
355 | exit(0); | ||
356 | } else if (pid[i] == -1) { | ||
357 | printf("couldn't spawn #%d process\n", i); | ||
358 | exit(1); | ||
359 | } | ||
360 | } | ||
361 | for (i = 0; i < tasks; i++) { | ||
362 | int status; | ||
363 | |||
364 | assert(waitpid(pid[i], &status, 0) == pid[i]); | ||
365 | assert(status == 0); | ||
366 | } | ||
367 | } | ||
368 | |||
369 | static void fill_lpm_trie(void) | ||
370 | { | ||
371 | struct bpf_lpm_trie_key *key; | ||
372 | unsigned long value = 0; | ||
373 | unsigned int i; | ||
374 | int r; | ||
375 | |||
376 | key = alloca(sizeof(*key) + 4); | ||
377 | key->prefixlen = 32; | ||
378 | |||
379 | for (i = 0; i < 512; ++i) { | ||
380 | key->prefixlen = rand() % 33; | ||
381 | key->data[0] = rand() & 0xff; | ||
382 | key->data[1] = rand() & 0xff; | ||
383 | key->data[2] = rand() & 0xff; | ||
384 | key->data[3] = rand() & 0xff; | ||
385 | r = bpf_map_update_elem(map_fd[hash_map_alloc_idx], | ||
386 | key, &value, 0); | ||
387 | assert(!r); | ||
388 | } | ||
389 | |||
390 | key->prefixlen = 32; | ||
391 | key->data[0] = 192; | ||
392 | key->data[1] = 168; | ||
393 | key->data[2] = 0; | ||
394 | key->data[3] = 1; | ||
395 | value = 128; | ||
396 | |||
397 | r = bpf_map_update_elem(map_fd[hash_map_alloc_idx], key, &value, 0); | ||
398 | assert(!r); | ||
399 | } | ||
400 | |||
401 | static void fixup_map(struct bpf_object *obj) | ||
402 | { | ||
403 | struct bpf_map *map; | ||
404 | int i; | ||
405 | |||
406 | bpf_object__for_each_map(map, obj) { | ||
407 | const char *name = bpf_map__name(map); | ||
408 | |||
409 | /* Only change the max_entries for the enabled test(s) */ | ||
410 | for (i = 0; i < NR_TESTS; i++) { | ||
411 | if (!strcmp(test_map_names[i], name) && | ||
412 | (check_test_flags(i))) { | ||
413 | bpf_map__resize(map, num_map_entries); | ||
414 | continue; | ||
415 | } | ||
416 | } | ||
417 | } | ||
418 | |||
419 | inner_lru_hash_size = num_map_entries; | ||
420 | } | ||
421 | |||
422 | int main(int argc, char **argv) | ||
423 | { | ||
424 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
425 | int nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); | ||
426 | struct bpf_link *links[8]; | ||
427 | struct bpf_program *prog; | ||
428 | struct bpf_object *obj; | ||
429 | struct bpf_map *map; | ||
430 | char filename[256]; | ||
431 | int i = 0; | ||
432 | |||
433 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
434 | perror("setrlimit(RLIMIT_MEMLOCK)"); | ||
435 | return 1; | ||
436 | } | ||
437 | |||
438 | if (argc > 1) | ||
439 | test_flags = atoi(argv[1]) ? : test_flags; | ||
440 | |||
441 | if (argc > 2) | ||
442 | nr_cpus = atoi(argv[2]) ? : nr_cpus; | ||
443 | |||
444 | if (argc > 3) | ||
445 | num_map_entries = atoi(argv[3]); | ||
446 | |||
447 | if (argc > 4) | ||
448 | max_cnt = atoi(argv[4]); | ||
449 | |||
450 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
451 | obj = bpf_object__open_file(filename, NULL); | ||
452 | if (libbpf_get_error(obj)) { | ||
453 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
454 | return 0; | ||
455 | } | ||
456 | |||
457 | map = bpf_object__find_map_by_name(obj, "inner_lru_hash_map"); | ||
458 | if (libbpf_get_error(map)) { | ||
459 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
460 | goto cleanup; | ||
461 | } | ||
462 | |||
463 | inner_lru_hash_size = bpf_map__max_entries(map); | ||
464 | if (!inner_lru_hash_size) { | ||
465 | fprintf(stderr, "ERROR: failed to get map attribute\n"); | ||
466 | goto cleanup; | ||
467 | } | ||
468 | |||
469 | /* resize BPF map prior to loading */ | ||
470 | if (num_map_entries > 0) | ||
471 | fixup_map(obj); | ||
472 | |||
473 | /* load BPF program */ | ||
474 | if (bpf_object__load(obj)) { | ||
475 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
476 | goto cleanup; | ||
477 | } | ||
478 | |||
479 | map_fd[0] = bpf_object__find_map_fd_by_name(obj, "array_of_lru_hashs"); | ||
480 | map_fd[1] = bpf_object__find_map_fd_by_name(obj, "hash_map_alloc"); | ||
481 | map_fd[2] = bpf_object__find_map_fd_by_name(obj, "lru_hash_lookup_map"); | ||
482 | if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) { | ||
483 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
484 | goto cleanup; | ||
485 | } | ||
486 | |||
487 | bpf_object__for_each_program(prog, obj) { | ||
488 | links[i] = bpf_program__attach(prog); | ||
489 | if (libbpf_get_error(links[i])) { | ||
490 | fprintf(stderr, "ERROR: bpf_program__attach failed\n"); | ||
491 | links[i] = NULL; | ||
492 | goto cleanup; | ||
493 | } | ||
494 | i++; | ||
495 | } | ||
496 | |||
497 | fill_lpm_trie(); | ||
498 | |||
499 | run_perf_test(nr_cpus); | ||
500 | |||
501 | cleanup: | ||
502 | for (i--; i >= 0; i--) | ||
503 | bpf_link__destroy(links[i]); | ||
504 | |||
505 | bpf_object__close(obj); | ||
506 | return 0; | ||
507 | } | ||
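Test selection is driven by argv[1], interpreted as a bitmask over enum test_type via TEST_BIT(). A short worked example of building that mask for a subset of tests, using the enum ordering defined at the top of this file:

```c
/* Worked example: argv[1] of map_perf_test is a bitmask over enum
 * test_type.  With the ordering above, HASH_PREALLOC is bit 0 and
 * LRU_HASH_PREALLOC is bit 4, so this prints 17 and
 * "./map_perf_test 17" would run exactly those two tests.  Passing no
 * argument keeps the default test_flags = ~0, i.e. all tests.
 */
#include <stdio.h>

#define TEST_BIT(t) (1U << (t))

enum test_type {		/* must stay in sync with map_perf_test_user.c */
	HASH_PREALLOC,
	PERCPU_HASH_PREALLOC,
	HASH_KMALLOC,
	PERCPU_HASH_KMALLOC,
	LRU_HASH_PREALLOC,
	/* ... remaining entries elided ... */
};

int main(void)
{
	printf("%u\n", TEST_BIT(HASH_PREALLOC) | TEST_BIT(LRU_HASH_PREALLOC));
	return 0;
}
```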
diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c new file mode 100644 index 000000000..14b792915 --- /dev/null +++ b/samples/bpf/offwaketime_kern.c | |||
@@ -0,0 +1,157 @@ | |||
1 | /* Copyright (c) 2016 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <uapi/linux/bpf.h> | ||
8 | #include <uapi/linux/ptrace.h> | ||
9 | #include <uapi/linux/perf_event.h> | ||
10 | #include <linux/version.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <bpf/bpf_helpers.h> | ||
13 | #include <bpf/bpf_tracing.h> | ||
14 | |||
15 | #define _(P) \ | ||
16 | ({ \ | ||
17 | typeof(P) val; \ | ||
18 | bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ | ||
19 | val; \ | ||
20 | }) | ||
21 | |||
22 | #define MINBLOCK_US 1 | ||
23 | |||
24 | struct key_t { | ||
25 | char waker[TASK_COMM_LEN]; | ||
26 | char target[TASK_COMM_LEN]; | ||
27 | u32 wret; | ||
28 | u32 tret; | ||
29 | }; | ||
30 | |||
31 | struct { | ||
32 | __uint(type, BPF_MAP_TYPE_HASH); | ||
33 | __type(key, struct key_t); | ||
34 | __type(value, u64); | ||
35 | __uint(max_entries, 10000); | ||
36 | } counts SEC(".maps"); | ||
37 | |||
38 | struct { | ||
39 | __uint(type, BPF_MAP_TYPE_HASH); | ||
40 | __type(key, u32); | ||
41 | __type(value, u64); | ||
42 | __uint(max_entries, 10000); | ||
43 | } start SEC(".maps"); | ||
44 | |||
45 | struct wokeby_t { | ||
46 | char name[TASK_COMM_LEN]; | ||
47 | u32 ret; | ||
48 | }; | ||
49 | |||
50 | struct { | ||
51 | __uint(type, BPF_MAP_TYPE_HASH); | ||
52 | __type(key, u32); | ||
53 | __type(value, struct wokeby_t); | ||
54 | __uint(max_entries, 10000); | ||
55 | } wokeby SEC(".maps"); | ||
56 | |||
57 | struct { | ||
58 | __uint(type, BPF_MAP_TYPE_STACK_TRACE); | ||
59 | __uint(key_size, sizeof(u32)); | ||
60 | __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); | ||
61 | __uint(max_entries, 10000); | ||
62 | } stackmap SEC(".maps"); | ||
63 | |||
64 | #define STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) | ||
65 | |||
66 | SEC("kprobe/try_to_wake_up") | ||
67 | int waker(struct pt_regs *ctx) | ||
68 | { | ||
69 | struct task_struct *p = (void *) PT_REGS_PARM1(ctx); | ||
70 | struct wokeby_t woke; | ||
71 | u32 pid; | ||
72 | |||
73 | pid = _(p->pid); | ||
74 | |||
75 | bpf_get_current_comm(&woke.name, sizeof(woke.name)); | ||
76 | woke.ret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS); | ||
77 | |||
78 | bpf_map_update_elem(&wokeby, &pid, &woke, BPF_ANY); | ||
79 | return 0; | ||
80 | } | ||
81 | |||
82 | static inline int update_counts(void *ctx, u32 pid, u64 delta) | ||
83 | { | ||
84 | struct wokeby_t *woke; | ||
85 | u64 zero = 0, *val; | ||
86 | struct key_t key; | ||
87 | |||
88 | __builtin_memset(&key.waker, 0, sizeof(key.waker)); | ||
89 | bpf_get_current_comm(&key.target, sizeof(key.target)); | ||
90 | key.tret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS); | ||
91 | key.wret = 0; | ||
92 | |||
93 | woke = bpf_map_lookup_elem(&wokeby, &pid); | ||
94 | if (woke) { | ||
95 | key.wret = woke->ret; | ||
96 | __builtin_memcpy(&key.waker, woke->name, sizeof(key.waker)); | ||
97 | bpf_map_delete_elem(&wokeby, &pid); | ||
98 | } | ||
99 | |||
100 | val = bpf_map_lookup_elem(&counts, &key); | ||
101 | if (!val) { | ||
102 | bpf_map_update_elem(&counts, &key, &zero, BPF_NOEXIST); | ||
103 | val = bpf_map_lookup_elem(&counts, &key); | ||
104 | if (!val) | ||
105 | return 0; | ||
106 | } | ||
107 | (*val) += delta; | ||
108 | return 0; | ||
109 | } | ||
110 | |||
111 | #if 1 | ||
112 | /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */ | ||
113 | struct sched_switch_args { | ||
114 | unsigned long long pad; | ||
115 | char prev_comm[16]; | ||
116 | int prev_pid; | ||
117 | int prev_prio; | ||
118 | long long prev_state; | ||
119 | char next_comm[16]; | ||
120 | int next_pid; | ||
121 | int next_prio; | ||
122 | }; | ||
123 | SEC("tracepoint/sched/sched_switch") | ||
124 | int oncpu(struct sched_switch_args *ctx) | ||
125 | { | ||
126 | /* record previous thread sleep time */ | ||
127 | u32 pid = ctx->prev_pid; | ||
128 | #else | ||
129 | SEC("kprobe/finish_task_switch") | ||
130 | int oncpu(struct pt_regs *ctx) | ||
131 | { | ||
132 | struct task_struct *p = (void *) PT_REGS_PARM1(ctx); | ||
133 | /* record previous thread sleep time */ | ||
134 | u32 pid = _(p->pid); | ||
135 | #endif | ||
136 | u64 delta, ts, *tsp; | ||
137 | |||
138 | ts = bpf_ktime_get_ns(); | ||
139 | bpf_map_update_elem(&start, &pid, &ts, BPF_ANY); | ||
140 | |||
141 | /* calculate current thread's delta time */ | ||
142 | pid = bpf_get_current_pid_tgid(); | ||
143 | tsp = bpf_map_lookup_elem(&start, &pid); | ||
144 | if (!tsp) | ||
145 | /* missed start or filtered */ | ||
146 | return 0; | ||
147 | |||
148 | delta = bpf_ktime_get_ns() - *tsp; | ||
149 | bpf_map_delete_elem(&start, &pid); | ||
150 | delta = delta / 1000; | ||
151 | if (delta < MINBLOCK_US) | ||
152 | return 0; | ||
153 | |||
154 | return update_counts(ctx, pid, delta); | ||
155 | } | ||
156 | char _license[] SEC("license") = "GPL"; | ||
157 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
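A note on the _() macro used above: it wraps bpf_probe_read_kernel() so that fields of kernel structures (here task_struct::pid in waker() and the kprobe variant of oncpu()) are copied out explicitly instead of being dereferenced, which the verifier would reject in a plain kprobe program. A minimal sketch of the same pattern written out by hand, as a drop-in helper for the file above using its includes; the name read_task_pid is illustrative and not part of the sample:

    /* Expanded form of _(p->pid): copy the field out of kernel memory
     * with bpf_probe_read_kernel() rather than dereferencing p directly.
     */
    static __always_inline u32 read_task_pid(struct task_struct *p)
    {
        u32 pid = 0;

        bpf_probe_read_kernel(&pid, sizeof(pid), &p->pid);
        return pid;
    }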
diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c new file mode 100644 index 000000000..5734cfdaa --- /dev/null +++ b/samples/bpf/offwaketime_user.c | |||
@@ -0,0 +1,160 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2016 Facebook | ||
3 | */ | ||
4 | #include <stdio.h> | ||
5 | #include <unistd.h> | ||
6 | #include <stdlib.h> | ||
7 | #include <signal.h> | ||
8 | #include <linux/perf_event.h> | ||
9 | #include <errno.h> | ||
10 | #include <stdbool.h> | ||
11 | #include <sys/resource.h> | ||
12 | #include <bpf/libbpf.h> | ||
13 | #include <bpf/bpf.h> | ||
14 | #include "trace_helpers.h" | ||
15 | |||
16 | #define PRINT_RAW_ADDR 0 | ||
17 | |||
18 | /* counts, stackmap */ | ||
19 | static int map_fd[2]; | ||
20 | |||
21 | static void print_ksym(__u64 addr) | ||
22 | { | ||
23 | struct ksym *sym; | ||
24 | |||
25 | if (!addr) | ||
26 | return; | ||
27 | sym = ksym_search(addr); | ||
28 | if (!sym) { | ||
29 | printf("ksym not found. Is kallsyms loaded?\n"); | ||
30 | return; | ||
31 | } | ||
32 | |||
33 | if (PRINT_RAW_ADDR) | ||
34 | printf("%s/%llx;", sym->name, addr); | ||
35 | else | ||
36 | printf("%s;", sym->name); | ||
37 | } | ||
38 | |||
39 | #define TASK_COMM_LEN 16 | ||
40 | |||
41 | struct key_t { | ||
42 | char waker[TASK_COMM_LEN]; | ||
43 | char target[TASK_COMM_LEN]; | ||
44 | __u32 wret; | ||
45 | __u32 tret; | ||
46 | }; | ||
47 | |||
48 | static void print_stack(struct key_t *key, __u64 count) | ||
49 | { | ||
50 | __u64 ip[PERF_MAX_STACK_DEPTH] = {}; | ||
51 | static bool warned; | ||
52 | int i; | ||
53 | |||
54 | printf("%s;", key->target); | ||
55 | if (bpf_map_lookup_elem(map_fd[1], &key->tret, ip) != 0) { | ||
56 | printf("---;"); | ||
57 | } else { | ||
58 | for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--) | ||
59 | print_ksym(ip[i]); | ||
60 | } | ||
61 | printf("-;"); | ||
62 | if (bpf_map_lookup_elem(map_fd[1], &key->wret, ip) != 0) { | ||
63 | printf("---;"); | ||
64 | } else { | ||
65 | for (i = 0; i < PERF_MAX_STACK_DEPTH; i++) | ||
66 | print_ksym(ip[i]); | ||
67 | } | ||
68 | printf(";%s %lld\n", key->waker, count); | ||
69 | |||
70 | if ((key->tret == -EEXIST || key->wret == -EEXIST) && !warned) { | ||
71 | printf("stackmap collisions seen. Consider increasing size\n"); | ||
72 | warned = true; | ||
73 | } else if (((int)(key->tret) < 0 || (int)(key->wret) < 0)) { | ||
74 | printf("err stackid %d %d\n", key->tret, key->wret); | ||
75 | } | ||
76 | } | ||
77 | |||
78 | static void print_stacks(int fd) | ||
79 | { | ||
80 | struct key_t key = {}, next_key; | ||
81 | __u64 value; | ||
82 | |||
83 | while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { | ||
84 | bpf_map_lookup_elem(fd, &next_key, &value); | ||
85 | print_stack(&next_key, value); | ||
86 | key = next_key; | ||
87 | } | ||
88 | } | ||
89 | |||
90 | static void int_exit(int sig) | ||
91 | { | ||
92 | print_stacks(map_fd[0]); | ||
93 | exit(0); | ||
94 | } | ||
95 | |||
96 | int main(int argc, char **argv) | ||
97 | { | ||
98 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
99 | struct bpf_object *obj = NULL; | ||
100 | struct bpf_link *links[2]; | ||
101 | struct bpf_program *prog; | ||
102 | int delay = 1, i = 0; | ||
103 | char filename[256]; | ||
104 | |||
105 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
106 | perror("setrlimit(RLIMIT_MEMLOCK)"); | ||
107 | return 1; | ||
108 | } | ||
109 | |||
110 | if (load_kallsyms()) { | ||
111 | printf("failed to process /proc/kallsyms\n"); | ||
112 | return 2; | ||
113 | } | ||
114 | |||
115 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
116 | obj = bpf_object__open_file(filename, NULL); | ||
117 | if (libbpf_get_error(obj)) { | ||
118 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
119 | obj = NULL; | ||
120 | goto cleanup; | ||
121 | } | ||
122 | |||
123 | /* load BPF program */ | ||
124 | if (bpf_object__load(obj)) { | ||
125 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
126 | goto cleanup; | ||
127 | } | ||
128 | |||
129 | map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counts"); | ||
130 | map_fd[1] = bpf_object__find_map_fd_by_name(obj, "stackmap"); | ||
131 | if (map_fd[0] < 0 || map_fd[1] < 0) { | ||
132 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
133 | goto cleanup; | ||
134 | } | ||
135 | |||
136 | signal(SIGINT, int_exit); | ||
137 | signal(SIGTERM, int_exit); | ||
138 | |||
139 | bpf_object__for_each_program(prog, obj) { | ||
140 | links[i] = bpf_program__attach(prog); | ||
141 | if (libbpf_get_error(links[i])) { | ||
142 | fprintf(stderr, "ERROR: bpf_program__attach failed\n"); | ||
143 | links[i] = NULL; | ||
144 | goto cleanup; | ||
145 | } | ||
146 | i++; | ||
147 | } | ||
148 | |||
149 | if (argc > 1) | ||
150 | delay = atoi(argv[1]); | ||
151 | sleep(delay); | ||
152 | print_stacks(map_fd[0]); | ||
153 | |||
154 | cleanup: | ||
155 | for (i--; i >= 0; i--) | ||
156 | bpf_link__destroy(links[i]); | ||
157 | |||
158 | bpf_object__close(obj); | ||
159 | return 0; | ||
160 | } | ||
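Each line printed by print_stack() above is a folded stack: target comm, target kernel stack, a "-" separator, the waker stack and comm, then the accumulated blocked time in microseconds, which flame graph tooling can consume as-is. One detail worth noting is that print_stacks() ignores the return value of bpf_map_lookup_elem(); a hedged variant that simply skips entries whose lookup fails (for instance if the map changes while being walked), reusing struct key_t and print_stack() from the file above, could look like this:

    static void print_stacks_checked(int fd)
    {
        struct key_t key = {}, next_key;
        __u64 value;

        while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
            /* skip entries that vanish between get_next_key()
             * and lookup_elem()
             */
            if (bpf_map_lookup_elem(fd, &next_key, &value) == 0)
                print_stack(&next_key, value);
            key = next_key;
        }
    }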
diff --git a/samples/bpf/parse_ldabs.c b/samples/bpf/parse_ldabs.c new file mode 100644 index 000000000..c6f65f90a --- /dev/null +++ b/samples/bpf/parse_ldabs.c | |||
@@ -0,0 +1,43 @@ | |||
1 | /* Copyright (c) 2016 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #define KBUILD_MODNAME "foo" | ||
8 | #include <linux/ip.h> | ||
9 | #include <linux/ipv6.h> | ||
10 | #include <linux/in.h> | ||
11 | #include <linux/tcp.h> | ||
12 | #include <linux/udp.h> | ||
13 | #include <uapi/linux/bpf.h> | ||
14 | #include <bpf/bpf_helpers.h> | ||
15 | #include "bpf_legacy.h" | ||
16 | |||
17 | #define DEFAULT_PKTGEN_UDP_PORT 9 | ||
18 | #define IP_MF 0x2000 | ||
19 | #define IP_OFFSET 0x1FFF | ||
20 | |||
21 | static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff) | ||
22 | { | ||
23 | return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off)) | ||
24 | & (IP_MF | IP_OFFSET); | ||
25 | } | ||
26 | |||
27 | SEC("ldabs") | ||
28 | int handle_ingress(struct __sk_buff *skb) | ||
29 | { | ||
30 | __u64 troff = ETH_HLEN + sizeof(struct iphdr); | ||
31 | |||
32 | if (load_half(skb, offsetof(struct ethhdr, h_proto)) != ETH_P_IP) | ||
33 | return 0; | ||
34 | if (load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)) != IPPROTO_UDP || | ||
35 | load_byte(skb, ETH_HLEN) != 0x45) | ||
36 | return 0; | ||
37 | if (ip_is_fragment(skb, ETH_HLEN)) | ||
38 | return 0; | ||
39 | if (load_half(skb, troff + offsetof(struct udphdr, dest)) == DEFAULT_PKTGEN_UDP_PORT) | ||
40 | return TC_ACT_SHOT; | ||
41 | return 0; | ||
42 | } | ||
43 | char _license[] SEC("license") = "GPL"; | ||
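parse_ldabs.c uses the legacy LD_ABS/LD_IND style accessors (load_byte()/load_half() from bpf_legacy.h), which fetch packet bytes at an absolute offset instead of going through direct packet pointers, and drops pktgen UDP port 9 traffic with TC_ACT_SHOT. The program is normally attached as a tc classifier; the sketch below is a hedged libbpf loading example. Because "ldabs" is not a section prefix libbpf can infer a program type from, the type has to be set explicitly; file and section names follow the sample above, and attaching the returned fd to a qdisc is left to the tc tooling:

    #include <stdio.h>
    #include <linux/bpf.h>
    #include <bpf/libbpf.h>

    int main(void)
    {
        struct bpf_object *obj;
        struct bpf_program *prog;

        obj = bpf_object__open_file("parse_ldabs.o", NULL);
        if (libbpf_get_error(obj))
            return 1;

        prog = bpf_object__find_program_by_title(obj, "ldabs");
        if (!prog)
            goto err;
        /* "ldabs" does not encode a type; mark it as a classifier */
        bpf_program__set_type(prog, BPF_PROG_TYPE_SCHED_CLS);

        if (bpf_object__load(obj))
            goto err;

        printf("prog fd: %d\n", bpf_program__fd(prog));
        bpf_object__close(obj);
        return 0;
    err:
        bpf_object__close(obj);
        return 1;
    }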
diff --git a/samples/bpf/parse_simple.c b/samples/bpf/parse_simple.c new file mode 100644 index 000000000..4a486cb1e --- /dev/null +++ b/samples/bpf/parse_simple.c | |||
@@ -0,0 +1,49 @@ | |||
1 | /* Copyright (c) 2016 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #define KBUILD_MODNAME "foo" | ||
8 | #include <linux/ip.h> | ||
9 | #include <linux/ipv6.h> | ||
10 | #include <linux/in.h> | ||
11 | #include <linux/tcp.h> | ||
12 | #include <linux/udp.h> | ||
13 | #include <uapi/linux/bpf.h> | ||
14 | #include <net/ip.h> | ||
15 | #include <bpf/bpf_helpers.h> | ||
16 | |||
17 | #define DEFAULT_PKTGEN_UDP_PORT 9 | ||
18 | |||
19 | /* copy of 'struct ethhdr' without __packed */ | ||
20 | struct eth_hdr { | ||
21 | unsigned char h_dest[ETH_ALEN]; | ||
22 | unsigned char h_source[ETH_ALEN]; | ||
23 | unsigned short h_proto; | ||
24 | }; | ||
25 | |||
26 | SEC("simple") | ||
27 | int handle_ingress(struct __sk_buff *skb) | ||
28 | { | ||
29 | void *data = (void *)(long)skb->data; | ||
30 | struct eth_hdr *eth = data; | ||
31 | struct iphdr *iph = data + sizeof(*eth); | ||
32 | struct udphdr *udp = data + sizeof(*eth) + sizeof(*iph); | ||
33 | void *data_end = (void *)(long)skb->data_end; | ||
34 | |||
35 | /* single length check */ | ||
36 | if (data + sizeof(*eth) + sizeof(*iph) + sizeof(*udp) > data_end) | ||
37 | return 0; | ||
38 | |||
39 | if (eth->h_proto != htons(ETH_P_IP)) | ||
40 | return 0; | ||
41 | if (iph->protocol != IPPROTO_UDP || iph->ihl != 5) | ||
42 | return 0; | ||
43 | if (ip_is_fragment(iph)) | ||
44 | return 0; | ||
45 | if (udp->dest == htons(DEFAULT_PKTGEN_UDP_PORT)) | ||
46 | return TC_ACT_SHOT; | ||
47 | return 0; | ||
48 | } | ||
49 | char _license[] SEC("license") = "GPL"; | ||
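The "single length check" comment above is the key point of parse_simple.c: because the Ethernet, IPv4 (with ihl fixed at 5) and UDP headers sit at fixed offsets, one comparison against data_end proves all three accesses safe to the verifier. A minimal sketch of that idea factored into a helper, assuming the same includes as the sample; pkt_at() is an illustrative name, not an existing helper:

    /* Return a pointer to len bytes at offset off, or NULL if the range
     * is not inside the packet; a single comparison covers the whole
     * range, which is what lets handle_ingress() above validate
     * eth + iph + udp in one check.
     */
    static __always_inline void *pkt_at(struct __sk_buff *skb,
                                        __u64 off, __u64 len)
    {
        void *data = (void *)(long)skb->data;
        void *data_end = (void *)(long)skb->data_end;

        if (data + off + len > data_end)
            return NULL;
        return data + off;
    }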
diff --git a/samples/bpf/parse_varlen.c b/samples/bpf/parse_varlen.c new file mode 100644 index 000000000..d8623846e --- /dev/null +++ b/samples/bpf/parse_varlen.c | |||
@@ -0,0 +1,150 @@ | |||
1 | /* Copyright (c) 2016 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #define KBUILD_MODNAME "foo" | ||
8 | #include <linux/if_ether.h> | ||
9 | #include <linux/if_vlan.h> | ||
10 | #include <linux/ip.h> | ||
11 | #include <linux/ipv6.h> | ||
12 | #include <linux/in.h> | ||
13 | #include <linux/tcp.h> | ||
14 | #include <linux/udp.h> | ||
15 | #include <uapi/linux/bpf.h> | ||
16 | #include <net/ip.h> | ||
17 | #include <bpf/bpf_helpers.h> | ||
18 | |||
19 | #define DEFAULT_PKTGEN_UDP_PORT 9 | ||
20 | #define DEBUG 0 | ||
21 | |||
22 | static int tcp(void *data, uint64_t tp_off, void *data_end) | ||
23 | { | ||
24 | struct tcphdr *tcp = data + tp_off; | ||
25 | |||
26 | if (tcp + 1 > data_end) | ||
27 | return 0; | ||
28 | if (tcp->dest == htons(80) || tcp->source == htons(80)) | ||
29 | return TC_ACT_SHOT; | ||
30 | return 0; | ||
31 | } | ||
32 | |||
33 | static int udp(void *data, uint64_t tp_off, void *data_end) | ||
34 | { | ||
35 | struct udphdr *udp = data + tp_off; | ||
36 | |||
37 | if (udp + 1 > data_end) | ||
38 | return 0; | ||
39 | if (udp->dest == htons(DEFAULT_PKTGEN_UDP_PORT) || | ||
40 | udp->source == htons(DEFAULT_PKTGEN_UDP_PORT)) { | ||
41 | if (DEBUG) { | ||
42 | char fmt[] = "udp port 9 indeed\n"; | ||
43 | |||
44 | bpf_trace_printk(fmt, sizeof(fmt)); | ||
45 | } | ||
46 | return TC_ACT_SHOT; | ||
47 | } | ||
48 | return 0; | ||
49 | } | ||
50 | |||
51 | static int parse_ipv4(void *data, uint64_t nh_off, void *data_end) | ||
52 | { | ||
53 | struct iphdr *iph; | ||
54 | uint64_t ihl_len; | ||
55 | |||
56 | iph = data + nh_off; | ||
57 | if (iph + 1 > data_end) | ||
58 | return 0; | ||
59 | |||
60 | if (ip_is_fragment(iph)) | ||
61 | return 0; | ||
62 | ihl_len = iph->ihl * 4; | ||
63 | |||
64 | if (iph->protocol == IPPROTO_IPIP) { | ||
65 | iph = data + nh_off + ihl_len; | ||
66 | if (iph + 1 > data_end) | ||
67 | return 0; | ||
68 | ihl_len += iph->ihl * 4; | ||
69 | } | ||
70 | |||
71 | if (iph->protocol == IPPROTO_TCP) | ||
72 | return tcp(data, nh_off + ihl_len, data_end); | ||
73 | else if (iph->protocol == IPPROTO_UDP) | ||
74 | return udp(data, nh_off + ihl_len, data_end); | ||
75 | return 0; | ||
76 | } | ||
77 | |||
78 | static int parse_ipv6(void *data, uint64_t nh_off, void *data_end) | ||
79 | { | ||
80 | struct ipv6hdr *ip6h; | ||
81 | struct iphdr *iph; | ||
82 | uint64_t ihl_len = sizeof(struct ipv6hdr); | ||
83 | uint64_t nexthdr; | ||
84 | |||
85 | ip6h = data + nh_off; | ||
86 | if (ip6h + 1 > data_end) | ||
87 | return 0; | ||
88 | |||
89 | nexthdr = ip6h->nexthdr; | ||
90 | |||
91 | if (nexthdr == IPPROTO_IPIP) { | ||
92 | iph = data + nh_off + ihl_len; | ||
93 | if (iph + 1 > data_end) | ||
94 | return 0; | ||
95 | ihl_len += iph->ihl * 4; | ||
96 | nexthdr = iph->protocol; | ||
97 | } else if (nexthdr == IPPROTO_IPV6) { | ||
98 | ip6h = data + nh_off + ihl_len; | ||
99 | if (ip6h + 1 > data_end) | ||
100 | return 0; | ||
101 | ihl_len += sizeof(struct ipv6hdr); | ||
102 | nexthdr = ip6h->nexthdr; | ||
103 | } | ||
104 | |||
105 | if (nexthdr == IPPROTO_TCP) | ||
106 | return tcp(data, nh_off + ihl_len, data_end); | ||
107 | else if (nexthdr == IPPROTO_UDP) | ||
108 | return udp(data, nh_off + ihl_len, data_end); | ||
109 | return 0; | ||
110 | } | ||
111 | |||
112 | SEC("varlen") | ||
113 | int handle_ingress(struct __sk_buff *skb) | ||
114 | { | ||
115 | void *data = (void *)(long)skb->data; | ||
116 | struct ethhdr *eth = data; | ||
117 | void *data_end = (void *)(long)skb->data_end; | ||
118 | uint64_t h_proto, nh_off; | ||
119 | |||
120 | nh_off = sizeof(*eth); | ||
121 | if (data + nh_off > data_end) | ||
122 | return 0; | ||
123 | |||
124 | h_proto = eth->h_proto; | ||
125 | |||
126 | if (h_proto == ETH_P_8021Q || h_proto == ETH_P_8021AD) { | ||
127 | struct vlan_hdr *vhdr; | ||
128 | |||
129 | vhdr = data + nh_off; | ||
130 | nh_off += sizeof(struct vlan_hdr); | ||
131 | if (data + nh_off > data_end) | ||
132 | return 0; | ||
133 | h_proto = vhdr->h_vlan_encapsulated_proto; | ||
134 | } | ||
135 | if (h_proto == ETH_P_8021Q || h_proto == ETH_P_8021AD) { | ||
136 | struct vlan_hdr *vhdr; | ||
137 | |||
138 | vhdr = data + nh_off; | ||
139 | nh_off += sizeof(struct vlan_hdr); | ||
140 | if (data + nh_off > data_end) | ||
141 | return 0; | ||
142 | h_proto = vhdr->h_vlan_encapsulated_proto; | ||
143 | } | ||
144 | if (h_proto == htons(ETH_P_IP)) | ||
145 | return parse_ipv4(data, nh_off, data_end); | ||
146 | else if (h_proto == htons(ETH_P_IPV6)) | ||
147 | return parse_ipv6(data, nh_off, data_end); | ||
148 | return 0; | ||
149 | } | ||
150 | char _license[] SEC("license") = "GPL"; | ||
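Unlike parse_simple.c, parse_varlen.c re-validates against data_end after every variable-length step (IPv4 IHL, nested IPIP/IPv6 headers, VLAN tags), because the verifier only trusts bounds checks that dominate the access in question. The VLAN block is copied twice by hand since these samples predate verifier support for bounded loops; a hedged alternative for the body of handle_ingress(), assuming an extra loop counter i and otherwise the same local variables, is a fully unrolled loop:

    /* Drop-in replacement for the two copied VLAN blocks above; the
     * #pragma unroll keeps the generated code loop-free, which older
     * verifiers require.
     */
    #pragma unroll
    for (i = 0; i < 2; i++) {
        struct vlan_hdr *vhdr;

        if (h_proto != ETH_P_8021Q && h_proto != ETH_P_8021AD)
            break;
        vhdr = data + nh_off;
        nh_off += sizeof(struct vlan_hdr);
        if (data + nh_off > data_end)
            return 0;
        h_proto = vhdr->h_vlan_encapsulated_proto;
    }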
diff --git a/samples/bpf/run_cookie_uid_helper_example.sh b/samples/bpf/run_cookie_uid_helper_example.sh new file mode 100755 index 000000000..fc6bc0451 --- /dev/null +++ b/samples/bpf/run_cookie_uid_helper_example.sh | |||
@@ -0,0 +1,15 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | local_dir="$(pwd)" | ||
4 | root_dir=$local_dir/../.. | ||
5 | mnt_dir=$(mktemp -d --tmp) | ||
6 | |||
7 | on_exit() { | ||
8 | iptables -D OUTPUT -m bpf --object-pinned ${mnt_dir}/bpf_prog -j ACCEPT | ||
9 | umount ${mnt_dir} | ||
10 | rm -r ${mnt_dir} | ||
11 | } | ||
12 | |||
13 | trap on_exit EXIT | ||
14 | mount -t bpf bpf ${mnt_dir} | ||
15 | ./per_socket_stats_example ${mnt_dir}/bpf_prog $1 | ||
diff --git a/samples/bpf/sampleip_kern.c b/samples/bpf/sampleip_kern.c new file mode 100644 index 000000000..f24806ac2 --- /dev/null +++ b/samples/bpf/sampleip_kern.c | |||
@@ -0,0 +1,39 @@ | |||
1 | /* Copyright 2016 Netflix, Inc. | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <linux/version.h> | ||
8 | #include <linux/ptrace.h> | ||
9 | #include <uapi/linux/bpf.h> | ||
10 | #include <uapi/linux/bpf_perf_event.h> | ||
11 | #include <bpf/bpf_helpers.h> | ||
12 | #include <bpf/bpf_tracing.h> | ||
13 | |||
14 | #define MAX_IPS 8192 | ||
15 | |||
16 | struct { | ||
17 | __uint(type, BPF_MAP_TYPE_HASH); | ||
18 | __type(key, u64); | ||
19 | __type(value, u32); | ||
20 | __uint(max_entries, MAX_IPS); | ||
21 | } ip_map SEC(".maps"); | ||
22 | |||
23 | SEC("perf_event") | ||
24 | int do_sample(struct bpf_perf_event_data *ctx) | ||
25 | { | ||
26 | u64 ip; | ||
27 | u32 *value, init_val = 1; | ||
28 | |||
29 | ip = PT_REGS_IP(&ctx->regs); | ||
30 | value = bpf_map_lookup_elem(&ip_map, &ip); | ||
31 | if (value) | ||
32 | *value += 1; | ||
33 | else | ||
34 | /* E2BIG (map full) is not handled; acceptable for this example only */ | ||
35 | bpf_map_update_elem(&ip_map, &ip, &init_val, BPF_NOEXIST); | ||
36 | |||
37 | return 0; | ||
38 | } | ||
39 | char _license[] SEC("license") = "GPL"; | ||
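do_sample() runs once per PERF_COUNT_SW_CPU_CLOCK sample and counts how often each instruction pointer was seen; on a lookup miss it inserts with BPF_NOEXIST, and the comment notes that a failed insert (for instance when the map is full) is simply ignored. A hedged drop-in variant for the file above that also tolerates two CPUs racing to insert the same IP, using the retry-then-atomic-add pattern from update_counts() in offwaketime_kern.c; count_ip is an illustrative name:

    static __always_inline void count_ip(u64 ip)
    {
        u32 init_val = 1, *value;

        value = bpf_map_lookup_elem(&ip_map, &ip);
        if (!value) {
            bpf_map_update_elem(&ip_map, &ip, &init_val, BPF_NOEXIST);
            value = bpf_map_lookup_elem(&ip_map, &ip);
            if (!value)
                return; /* map full (E2BIG) */
        }
        __sync_fetch_and_add(value, 1);
    }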
diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c new file mode 100644 index 000000000..921c505bb --- /dev/null +++ b/samples/bpf/sampleip_user.c | |||
@@ -0,0 +1,227 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * sampleip: sample instruction pointer and frequency count in a BPF map. | ||
4 | * | ||
5 | * Copyright 2016 Netflix, Inc. | ||
6 | */ | ||
7 | #include <stdio.h> | ||
8 | #include <stdlib.h> | ||
9 | #include <unistd.h> | ||
10 | #include <errno.h> | ||
11 | #include <signal.h> | ||
12 | #include <string.h> | ||
13 | #include <linux/perf_event.h> | ||
14 | #include <linux/ptrace.h> | ||
15 | #include <linux/bpf.h> | ||
16 | #include <bpf/bpf.h> | ||
17 | #include <bpf/libbpf.h> | ||
18 | #include "perf-sys.h" | ||
19 | #include "trace_helpers.h" | ||
20 | |||
21 | #define DEFAULT_FREQ 99 | ||
22 | #define DEFAULT_SECS 5 | ||
23 | #define MAX_IPS 8192 | ||
24 | #define PAGE_OFFSET 0xffff880000000000 | ||
25 | |||
26 | static int map_fd; | ||
27 | static int nr_cpus; | ||
28 | |||
29 | static void usage(void) | ||
30 | { | ||
31 | printf("USAGE: sampleip [-F freq] [duration]\n"); | ||
32 | printf(" -F freq # sample frequency (Hertz), default 99\n"); | ||
33 | printf(" duration # sampling duration (seconds), default 5\n"); | ||
34 | } | ||
35 | |||
36 | static int sampling_start(int freq, struct bpf_program *prog, | ||
37 | struct bpf_link *links[]) | ||
38 | { | ||
39 | int i, pmu_fd; | ||
40 | |||
41 | struct perf_event_attr pe_sample_attr = { | ||
42 | .type = PERF_TYPE_SOFTWARE, | ||
43 | .freq = 1, | ||
44 | .sample_period = freq, | ||
45 | .config = PERF_COUNT_SW_CPU_CLOCK, | ||
46 | .inherit = 1, | ||
47 | }; | ||
48 | |||
49 | for (i = 0; i < nr_cpus; i++) { | ||
50 | pmu_fd = sys_perf_event_open(&pe_sample_attr, -1 /* pid */, i, | ||
51 | -1 /* group_fd */, 0 /* flags */); | ||
52 | if (pmu_fd < 0) { | ||
53 | fprintf(stderr, "ERROR: Initializing perf sampling\n"); | ||
54 | return 1; | ||
55 | } | ||
56 | links[i] = bpf_program__attach_perf_event(prog, pmu_fd); | ||
57 | if (libbpf_get_error(links[i])) { | ||
58 | fprintf(stderr, "ERROR: Attach perf event\n"); | ||
59 | links[i] = NULL; | ||
60 | close(pmu_fd); | ||
61 | return 1; | ||
62 | } | ||
63 | } | ||
64 | |||
65 | return 0; | ||
66 | } | ||
67 | |||
68 | static void sampling_end(struct bpf_link *links[]) | ||
69 | { | ||
70 | int i; | ||
71 | |||
72 | for (i = 0; i < nr_cpus; i++) | ||
73 | bpf_link__destroy(links[i]); | ||
74 | } | ||
75 | |||
76 | struct ipcount { | ||
77 | __u64 ip; | ||
78 | __u32 count; | ||
79 | }; | ||
80 | |||
81 | /* used for sorting */ | ||
82 | struct ipcount counts[MAX_IPS]; | ||
83 | |||
84 | static int count_cmp(const void *p1, const void *p2) | ||
85 | { | ||
86 | return ((struct ipcount *)p1)->count - ((struct ipcount *)p2)->count; | ||
87 | } | ||
88 | |||
89 | static void print_ip_map(int fd) | ||
90 | { | ||
91 | struct ksym *sym; | ||
92 | __u64 key, next_key; | ||
93 | __u32 value; | ||
94 | int i, max; | ||
95 | |||
96 | printf("%-19s %-32s %s\n", "ADDR", "KSYM", "COUNT"); | ||
97 | |||
98 | /* fetch IPs and counts */ | ||
99 | key = 0, i = 0; | ||
100 | while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { | ||
101 | bpf_map_lookup_elem(fd, &next_key, &value); | ||
102 | counts[i].ip = next_key; | ||
103 | counts[i++].count = value; | ||
104 | key = next_key; | ||
105 | } | ||
106 | max = i; | ||
107 | |||
108 | /* sort and print */ | ||
109 | qsort(counts, max, sizeof(struct ipcount), count_cmp); | ||
110 | for (i = 0; i < max; i++) { | ||
111 | if (counts[i].ip > PAGE_OFFSET) { | ||
112 | sym = ksym_search(counts[i].ip); | ||
113 | if (!sym) { | ||
114 | printf("ksym not found. Is kallsyms loaded?\n"); | ||
115 | continue; | ||
116 | } | ||
117 | |||
118 | printf("0x%-17llx %-32s %u\n", counts[i].ip, sym->name, | ||
119 | counts[i].count); | ||
120 | } else { | ||
121 | printf("0x%-17llx %-32s %u\n", counts[i].ip, "(user)", | ||
122 | counts[i].count); | ||
123 | } | ||
124 | } | ||
125 | |||
126 | if (max == MAX_IPS) { | ||
127 | printf("WARNING: IP hash was full (max %d entries); ", max); | ||
128 | printf("may have dropped samples\n"); | ||
129 | } | ||
130 | } | ||
131 | |||
132 | static void int_exit(int sig) | ||
133 | { | ||
134 | printf("\n"); | ||
135 | print_ip_map(map_fd); | ||
136 | exit(0); | ||
137 | } | ||
138 | |||
139 | int main(int argc, char **argv) | ||
140 | { | ||
141 | int opt, freq = DEFAULT_FREQ, secs = DEFAULT_SECS, error = 1; | ||
142 | struct bpf_object *obj = NULL; | ||
143 | struct bpf_program *prog; | ||
144 | struct bpf_link **links; | ||
145 | char filename[256]; | ||
146 | |||
147 | /* process arguments */ | ||
148 | while ((opt = getopt(argc, argv, "F:h")) != -1) { | ||
149 | switch (opt) { | ||
150 | case 'F': | ||
151 | freq = atoi(optarg); | ||
152 | break; | ||
153 | case 'h': | ||
154 | default: | ||
155 | usage(); | ||
156 | return 0; | ||
157 | } | ||
158 | } | ||
159 | if (argc - optind == 1) | ||
160 | secs = atoi(argv[optind]); | ||
161 | if (freq == 0 || secs == 0) { | ||
162 | usage(); | ||
163 | return 1; | ||
164 | } | ||
165 | |||
166 | /* initialize kernel symbol translation */ | ||
167 | if (load_kallsyms()) { | ||
168 | fprintf(stderr, "ERROR: loading /proc/kallsyms\n"); | ||
169 | return 2; | ||
170 | } | ||
171 | |||
172 | /* create perf FDs for each CPU */ | ||
173 | nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); | ||
174 | links = calloc(nr_cpus, sizeof(struct bpf_link *)); | ||
175 | if (!links) { | ||
176 | fprintf(stderr, "ERROR: malloc of links\n"); | ||
177 | goto cleanup; | ||
178 | } | ||
179 | |||
180 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
181 | obj = bpf_object__open_file(filename, NULL); | ||
182 | if (libbpf_get_error(obj)) { | ||
183 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
184 | obj = NULL; | ||
185 | goto cleanup; | ||
186 | } | ||
187 | |||
188 | prog = bpf_object__find_program_by_name(obj, "do_sample"); | ||
189 | if (!prog) { | ||
190 | fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); | ||
191 | goto cleanup; | ||
192 | } | ||
193 | |||
194 | /* load BPF program */ | ||
195 | if (bpf_object__load(obj)) { | ||
196 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
197 | goto cleanup; | ||
198 | } | ||
199 | |||
200 | map_fd = bpf_object__find_map_fd_by_name(obj, "ip_map"); | ||
201 | if (map_fd < 0) { | ||
202 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
203 | goto cleanup; | ||
204 | } | ||
205 | |||
206 | signal(SIGINT, int_exit); | ||
207 | signal(SIGTERM, int_exit); | ||
208 | |||
209 | /* do sampling */ | ||
210 | printf("Sampling at %d Hertz for %d seconds. Ctrl-C also ends.\n", | ||
211 | freq, secs); | ||
212 | if (sampling_start(freq, prog, links) != 0) | ||
213 | goto cleanup; | ||
214 | |||
215 | sleep(secs); | ||
216 | error = 0; | ||
217 | |||
218 | cleanup: | ||
219 | sampling_end(links); | ||
220 | /* output sample counts */ | ||
221 | if (!error) | ||
222 | print_ip_map(map_fd); | ||
223 | |||
224 | free(links); | ||
225 | bpf_object__close(obj); | ||
226 | return error; | ||
227 | } | ||
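In sampling_start() above, .freq = 1 makes perf interpret .sample_period as a frequency in Hertz, and one PERF_COUNT_SW_CPU_CLOCK event is opened per online CPU with the BPF program attached to each fd; the hard-coded PAGE_OFFSET is the traditional x86_64 direct-map base, used only to guess whether a sampled address belongs to the kernel or to user space. For contrast, a hedged sketch of the same event configured in period mode (the software CPU clock counts nanoseconds, so a period of 1000000 means roughly one sample per millisecond of CPU time); init_period_attr() is illustrative and not part of the sample:

    #include <string.h>
    #include <linux/perf_event.h>

    static void init_period_attr(struct perf_event_attr *attr)
    {
        memset(attr, 0, sizeof(*attr));
        attr->type = PERF_TYPE_SOFTWARE;
        attr->config = PERF_COUNT_SW_CPU_CLOCK;
        attr->freq = 0;                /* sample_period is a period, not Hz */
        attr->sample_period = 1000000; /* ~1 sample per ms of CPU time */
        attr->inherit = 1;
    }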
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c new file mode 100644 index 000000000..00aae1d33 --- /dev/null +++ b/samples/bpf/sock_example.c | |||
@@ -0,0 +1,106 @@ | |||
1 | /* eBPF example program: | ||
2 | * - creates arraymap in kernel with key 4 bytes and value 8 bytes | ||
3 | * | ||
4 | * - loads eBPF program: | ||
5 | * r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)]; | ||
6 | * *(u32*)(fp - 4) = r0; | ||
7 | * // assuming packet is IPv4, lookup ip->proto in a map | ||
8 | * value = bpf_map_lookup_elem(map_fd, fp - 4); | ||
9 | * if (value) | ||
10 | * (*(u64*)value) += 1; | ||
11 | * | ||
12 | * - attaches this program to loopback interface "lo" raw socket | ||
13 | * | ||
14 | * - every second user space reads map[tcp], map[udp], map[icmp] to see | ||
15 | * how many packets of given protocol were seen on "lo" | ||
16 | */ | ||
17 | #include <stdio.h> | ||
18 | #include <unistd.h> | ||
19 | #include <assert.h> | ||
20 | #include <linux/bpf.h> | ||
21 | #include <string.h> | ||
22 | #include <stdlib.h> | ||
23 | #include <errno.h> | ||
24 | #include <sys/socket.h> | ||
25 | #include <arpa/inet.h> | ||
26 | #include <linux/if_ether.h> | ||
27 | #include <linux/ip.h> | ||
28 | #include <stddef.h> | ||
29 | #include <bpf/bpf.h> | ||
30 | #include "bpf_insn.h" | ||
31 | #include "sock_example.h" | ||
32 | |||
33 | char bpf_log_buf[BPF_LOG_BUF_SIZE]; | ||
34 | |||
35 | static int test_sock(void) | ||
36 | { | ||
37 | int sock = -1, map_fd, prog_fd, i, key; | ||
38 | long long value = 0, tcp_cnt, udp_cnt, icmp_cnt; | ||
39 | |||
40 | map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), | ||
41 | 256, 0); | ||
42 | if (map_fd < 0) { | ||
43 | printf("failed to create map '%s'\n", strerror(errno)); | ||
44 | goto cleanup; | ||
45 | } | ||
46 | |||
47 | struct bpf_insn prog[] = { | ||
48 | BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), | ||
49 | BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol) /* R0 = ip->proto */), | ||
50 | BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ | ||
51 | BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), | ||
52 | BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ | ||
53 | BPF_LD_MAP_FD(BPF_REG_1, map_fd), | ||
54 | BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), | ||
55 | BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), | ||
56 | BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ | ||
57 | BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ | ||
58 | BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */ | ||
59 | BPF_EXIT_INSN(), | ||
60 | }; | ||
61 | size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); | ||
62 | |||
63 | prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, insns_cnt, | ||
64 | "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE); | ||
65 | if (prog_fd < 0) { | ||
66 | printf("failed to load prog '%s'\n", strerror(errno)); | ||
67 | goto cleanup; | ||
68 | } | ||
69 | |||
70 | sock = open_raw_sock("lo"); | ||
71 | |||
72 | if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, | ||
73 | sizeof(prog_fd)) < 0) { | ||
74 | printf("setsockopt %s\n", strerror(errno)); | ||
75 | goto cleanup; | ||
76 | } | ||
77 | |||
78 | for (i = 0; i < 10; i++) { | ||
79 | key = IPPROTO_TCP; | ||
80 | assert(bpf_map_lookup_elem(map_fd, &key, &tcp_cnt) == 0); | ||
81 | |||
82 | key = IPPROTO_UDP; | ||
83 | assert(bpf_map_lookup_elem(map_fd, &key, &udp_cnt) == 0); | ||
84 | |||
85 | key = IPPROTO_ICMP; | ||
86 | assert(bpf_map_lookup_elem(map_fd, &key, &icmp_cnt) == 0); | ||
87 | |||
88 | printf("TCP %lld UDP %lld ICMP %lld packets\n", | ||
89 | tcp_cnt, udp_cnt, icmp_cnt); | ||
90 | sleep(1); | ||
91 | } | ||
92 | |||
93 | cleanup: | ||
94 | /* maps, programs and raw sockets are cleaned up automatically on process exit */ | ||
95 | return 0; | ||
96 | } | ||
97 | |||
98 | int main(void) | ||
99 | { | ||
100 | FILE *f; | ||
101 | |||
102 | f = popen("ping -4 -c5 localhost", "r"); | ||
103 | (void)f; | ||
104 | |||
105 | return test_sock(); | ||
106 | } | ||
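sock_example.c builds the filter out of raw eBPF instructions (the BPF_* macros from bpf_insn.h) rather than compiling restricted C: LD_ABS loads ip->protocol, the value is spilled to the stack to form a map key, bpf_map_lookup_elem() is called, and the per-protocol counter is bumped with an atomic XADD. For comparison, a hedged sketch of the same sequence written as restricted C (essentially what sockex1_kern.c further down does, except it adds skb->len instead of 1); the map and function names here are illustrative:

    #include <uapi/linux/bpf.h>
    #include <uapi/linux/if_ether.h>
    #include <uapi/linux/ip.h>
    #include <bpf/bpf_helpers.h>
    #include "bpf_legacy.h"

    struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __type(key, u32);
        __type(value, long);
        __uint(max_entries, 256);
    } proto_map SEC(".maps");

    SEC("socket")
    int count_protocols(struct __sk_buff *skb)
    {
        int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
        long *value;

        value = bpf_map_lookup_elem(&proto_map, &index);
        if (value)
            __sync_fetch_and_add(value, 1);
        return 0;
    }
    char _license[] SEC("license") = "GPL";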
diff --git a/samples/bpf/sock_example.h b/samples/bpf/sock_example.h new file mode 100644 index 000000000..a27d7579b --- /dev/null +++ b/samples/bpf/sock_example.h | |||
@@ -0,0 +1,35 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | #include <stdlib.h> | ||
3 | #include <stdio.h> | ||
4 | #include <linux/unistd.h> | ||
5 | #include <unistd.h> | ||
6 | #include <string.h> | ||
7 | #include <errno.h> | ||
8 | #include <linux/if_ether.h> | ||
9 | #include <net/if.h> | ||
10 | #include <linux/if_packet.h> | ||
11 | #include <arpa/inet.h> | ||
12 | |||
13 | static inline int open_raw_sock(const char *name) | ||
14 | { | ||
15 | struct sockaddr_ll sll; | ||
16 | int sock; | ||
17 | |||
18 | sock = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC, htons(ETH_P_ALL)); | ||
19 | if (sock < 0) { | ||
20 | printf("cannot create raw socket\n"); | ||
21 | return -1; | ||
22 | } | ||
23 | |||
24 | memset(&sll, 0, sizeof(sll)); | ||
25 | sll.sll_family = AF_PACKET; | ||
26 | sll.sll_ifindex = if_nametoindex(name); | ||
27 | sll.sll_protocol = htons(ETH_P_ALL); | ||
28 | if (bind(sock, (struct sockaddr *)&sll, sizeof(sll)) < 0) { | ||
29 | printf("bind to %s: %s\n", name, strerror(errno)); | ||
30 | close(sock); | ||
31 | return -1; | ||
32 | } | ||
33 | |||
34 | return sock; | ||
35 | } | ||
diff --git a/samples/bpf/sock_flags_kern.c b/samples/bpf/sock_flags_kern.c new file mode 100644 index 000000000..6d0ac7569 --- /dev/null +++ b/samples/bpf/sock_flags_kern.c | |||
@@ -0,0 +1,49 @@ | |||
1 | #include <uapi/linux/bpf.h> | ||
2 | #include <linux/socket.h> | ||
3 | #include <linux/net.h> | ||
4 | #include <uapi/linux/in.h> | ||
5 | #include <uapi/linux/in6.h> | ||
6 | #include <bpf/bpf_helpers.h> | ||
7 | |||
8 | SEC("cgroup/sock1") | ||
9 | int bpf_prog1(struct bpf_sock *sk) | ||
10 | { | ||
11 | char fmt[] = "socket: family %d type %d protocol %d\n"; | ||
12 | char fmt2[] = "socket: uid %u gid %u\n"; | ||
13 | __u64 gid_uid = bpf_get_current_uid_gid(); | ||
14 | __u32 uid = gid_uid & 0xffffffff; | ||
15 | __u32 gid = gid_uid >> 32; | ||
16 | |||
17 | bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol); | ||
18 | bpf_trace_printk(fmt2, sizeof(fmt2), uid, gid); | ||
19 | |||
20 | /* block PF_INET6, SOCK_RAW, IPPROTO_ICMPV6 sockets | ||
21 | * i.e., make ping6 fail | ||
22 | */ | ||
23 | if (sk->family == PF_INET6 && | ||
24 | sk->type == SOCK_RAW && | ||
25 | sk->protocol == IPPROTO_ICMPV6) | ||
26 | return 0; | ||
27 | |||
28 | return 1; | ||
29 | } | ||
30 | |||
31 | SEC("cgroup/sock2") | ||
32 | int bpf_prog2(struct bpf_sock *sk) | ||
33 | { | ||
34 | char fmt[] = "socket: family %d type %d protocol %d\n"; | ||
35 | |||
36 | bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol); | ||
37 | |||
38 | /* block PF_INET, SOCK_RAW, IPPROTO_ICMP sockets | ||
39 | * i.e., make ping fail | ||
40 | */ | ||
41 | if (sk->family == PF_INET && | ||
42 | sk->type == SOCK_RAW && | ||
43 | sk->protocol == IPPROTO_ICMP) | ||
44 | return 0; | ||
45 | |||
46 | return 1; | ||
47 | } | ||
48 | |||
49 | char _license[] SEC("license") = "GPL"; | ||
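Both programs above are cgroup sock programs: returning 1 lets the socket() call proceed, returning 0 makes it fail, so bpf_prog1 blocks ICMPv6 raw sockets (ping6) and bpf_prog2 blocks ICMPv4 raw sockets (ping) for every task in the attached cgroup. Attachment happens from user space; a hedged sketch using bpf_prog_attach() with BPF_CGROUP_INET_SOCK_CREATE, assuming prog_fd refers to one of these programs already loaded and cgroup_path is a cgroup v2 directory (this mirrors what the test_cgrp2_sock sample does, but is not part of this file):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <linux/bpf.h>
    #include <bpf/bpf.h>

    static int attach_sock_prog(int prog_fd, const char *cgroup_path)
    {
        int cg_fd = open(cgroup_path, O_RDONLY | O_DIRECTORY);

        if (cg_fd < 0) {
            perror("open(cgroup)");
            return -1;
        }
        if (bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE, 0)) {
            perror("bpf_prog_attach");
            close(cg_fd);
            return -1;
        }
        close(cg_fd);
        return 0;
    }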
diff --git a/samples/bpf/sockex1_kern.c b/samples/bpf/sockex1_kern.c new file mode 100644 index 000000000..431c95646 --- /dev/null +++ b/samples/bpf/sockex1_kern.c | |||
@@ -0,0 +1,30 @@ | |||
1 | #include <uapi/linux/bpf.h> | ||
2 | #include <uapi/linux/if_ether.h> | ||
3 | #include <uapi/linux/if_packet.h> | ||
4 | #include <uapi/linux/ip.h> | ||
5 | #include <bpf/bpf_helpers.h> | ||
6 | #include "bpf_legacy.h" | ||
7 | |||
8 | struct { | ||
9 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
10 | __type(key, u32); | ||
11 | __type(value, long); | ||
12 | __uint(max_entries, 256); | ||
13 | } my_map SEC(".maps"); | ||
14 | |||
15 | SEC("socket1") | ||
16 | int bpf_prog1(struct __sk_buff *skb) | ||
17 | { | ||
18 | int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); | ||
19 | long *value; | ||
20 | |||
21 | if (skb->pkt_type != PACKET_OUTGOING) | ||
22 | return 0; | ||
23 | |||
24 | value = bpf_map_lookup_elem(&my_map, &index); | ||
25 | if (value) | ||
26 | __sync_fetch_and_add(value, skb->len); | ||
27 | |||
28 | return 0; | ||
29 | } | ||
30 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c new file mode 100644 index 000000000..3c8372287 --- /dev/null +++ b/samples/bpf/sockex1_user.c | |||
@@ -0,0 +1,54 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <stdio.h> | ||
3 | #include <assert.h> | ||
4 | #include <linux/bpf.h> | ||
5 | #include <bpf/bpf.h> | ||
6 | #include <bpf/libbpf.h> | ||
7 | #include "sock_example.h" | ||
8 | #include <unistd.h> | ||
9 | #include <arpa/inet.h> | ||
10 | |||
11 | int main(int ac, char **argv) | ||
12 | { | ||
13 | struct bpf_object *obj; | ||
14 | int map_fd, prog_fd; | ||
15 | char filename[256]; | ||
16 | int i, sock; | ||
17 | FILE *f; | ||
18 | |||
19 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
20 | |||
21 | if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER, | ||
22 | &obj, &prog_fd)) | ||
23 | return 1; | ||
24 | |||
25 | map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); | ||
26 | |||
27 | sock = open_raw_sock("lo"); | ||
28 | |||
29 | assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, | ||
30 | sizeof(prog_fd)) == 0); | ||
31 | |||
32 | f = popen("ping -4 -c5 localhost", "r"); | ||
33 | (void) f; | ||
34 | |||
35 | for (i = 0; i < 5; i++) { | ||
36 | long long tcp_cnt, udp_cnt, icmp_cnt; | ||
37 | int key; | ||
38 | |||
39 | key = IPPROTO_TCP; | ||
40 | assert(bpf_map_lookup_elem(map_fd, &key, &tcp_cnt) == 0); | ||
41 | |||
42 | key = IPPROTO_UDP; | ||
43 | assert(bpf_map_lookup_elem(map_fd, &key, &udp_cnt) == 0); | ||
44 | |||
45 | key = IPPROTO_ICMP; | ||
46 | assert(bpf_map_lookup_elem(map_fd, &key, &icmp_cnt) == 0); | ||
47 | |||
48 | printf("TCP %lld UDP %lld ICMP %lld bytes\n", | ||
49 | tcp_cnt, udp_cnt, icmp_cnt); | ||
50 | sleep(1); | ||
51 | } | ||
52 | |||
53 | return 0; | ||
54 | } | ||
diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c new file mode 100644 index 000000000..b7997541f --- /dev/null +++ b/samples/bpf/sockex2_kern.c | |||
@@ -0,0 +1,223 @@ | |||
1 | #include <uapi/linux/bpf.h> | ||
2 | #include <uapi/linux/in.h> | ||
3 | #include <uapi/linux/if.h> | ||
4 | #include <uapi/linux/if_ether.h> | ||
5 | #include <uapi/linux/ip.h> | ||
6 | #include <uapi/linux/ipv6.h> | ||
7 | #include <uapi/linux/if_tunnel.h> | ||
8 | #include <bpf/bpf_helpers.h> | ||
9 | #include "bpf_legacy.h" | ||
10 | #define IP_MF 0x2000 | ||
11 | #define IP_OFFSET 0x1FFF | ||
12 | |||
13 | struct vlan_hdr { | ||
14 | __be16 h_vlan_TCI; | ||
15 | __be16 h_vlan_encapsulated_proto; | ||
16 | }; | ||
17 | |||
18 | struct flow_key_record { | ||
19 | __be32 src; | ||
20 | __be32 dst; | ||
21 | union { | ||
22 | __be32 ports; | ||
23 | __be16 port16[2]; | ||
24 | }; | ||
25 | __u16 thoff; | ||
26 | __u8 ip_proto; | ||
27 | }; | ||
28 | |||
29 | static inline int proto_ports_offset(__u64 proto) | ||
30 | { | ||
31 | switch (proto) { | ||
32 | case IPPROTO_TCP: | ||
33 | case IPPROTO_UDP: | ||
34 | case IPPROTO_DCCP: | ||
35 | case IPPROTO_ESP: | ||
36 | case IPPROTO_SCTP: | ||
37 | case IPPROTO_UDPLITE: | ||
38 | return 0; | ||
39 | case IPPROTO_AH: | ||
40 | return 4; | ||
41 | default: | ||
42 | return 0; | ||
43 | } | ||
44 | } | ||
45 | |||
46 | static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff) | ||
47 | { | ||
48 | return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off)) | ||
49 | & (IP_MF | IP_OFFSET); | ||
50 | } | ||
51 | |||
52 | static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) | ||
53 | { | ||
54 | __u64 w0 = load_word(ctx, off); | ||
55 | __u64 w1 = load_word(ctx, off + 4); | ||
56 | __u64 w2 = load_word(ctx, off + 8); | ||
57 | __u64 w3 = load_word(ctx, off + 12); | ||
58 | |||
59 | return (__u32)(w0 ^ w1 ^ w2 ^ w3); | ||
60 | } | ||
61 | |||
62 | static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, | ||
63 | struct flow_key_record *flow) | ||
64 | { | ||
65 | __u64 verlen; | ||
66 | |||
67 | if (unlikely(ip_is_fragment(skb, nhoff))) | ||
68 | *ip_proto = 0; | ||
69 | else | ||
70 | *ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol)); | ||
71 | |||
72 | if (*ip_proto != IPPROTO_GRE) { | ||
73 | flow->src = load_word(skb, nhoff + offsetof(struct iphdr, saddr)); | ||
74 | flow->dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr)); | ||
75 | } | ||
76 | |||
77 | verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/); | ||
78 | if (likely(verlen == 0x45)) | ||
79 | nhoff += 20; | ||
80 | else | ||
81 | nhoff += (verlen & 0xF) << 2; | ||
82 | |||
83 | return nhoff; | ||
84 | } | ||
85 | |||
86 | static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, | ||
87 | struct flow_key_record *flow) | ||
88 | { | ||
89 | *ip_proto = load_byte(skb, | ||
90 | nhoff + offsetof(struct ipv6hdr, nexthdr)); | ||
91 | flow->src = ipv6_addr_hash(skb, | ||
92 | nhoff + offsetof(struct ipv6hdr, saddr)); | ||
93 | flow->dst = ipv6_addr_hash(skb, | ||
94 | nhoff + offsetof(struct ipv6hdr, daddr)); | ||
95 | nhoff += sizeof(struct ipv6hdr); | ||
96 | |||
97 | return nhoff; | ||
98 | } | ||
99 | |||
100 | static inline bool flow_dissector(struct __sk_buff *skb, | ||
101 | struct flow_key_record *flow) | ||
102 | { | ||
103 | __u64 nhoff = ETH_HLEN; | ||
104 | __u64 ip_proto; | ||
105 | __u64 proto = load_half(skb, 12); | ||
106 | int poff; | ||
107 | |||
108 | if (proto == ETH_P_8021AD) { | ||
109 | proto = load_half(skb, nhoff + offsetof(struct vlan_hdr, | ||
110 | h_vlan_encapsulated_proto)); | ||
111 | nhoff += sizeof(struct vlan_hdr); | ||
112 | } | ||
113 | |||
114 | if (proto == ETH_P_8021Q) { | ||
115 | proto = load_half(skb, nhoff + offsetof(struct vlan_hdr, | ||
116 | h_vlan_encapsulated_proto)); | ||
117 | nhoff += sizeof(struct vlan_hdr); | ||
118 | } | ||
119 | |||
120 | if (likely(proto == ETH_P_IP)) | ||
121 | nhoff = parse_ip(skb, nhoff, &ip_proto, flow); | ||
122 | else if (proto == ETH_P_IPV6) | ||
123 | nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow); | ||
124 | else | ||
125 | return false; | ||
126 | |||
127 | switch (ip_proto) { | ||
128 | case IPPROTO_GRE: { | ||
129 | struct gre_hdr { | ||
130 | __be16 flags; | ||
131 | __be16 proto; | ||
132 | }; | ||
133 | |||
134 | __u64 gre_flags = load_half(skb, | ||
135 | nhoff + offsetof(struct gre_hdr, flags)); | ||
136 | __u64 gre_proto = load_half(skb, | ||
137 | nhoff + offsetof(struct gre_hdr, proto)); | ||
138 | |||
139 | if (gre_flags & (GRE_VERSION|GRE_ROUTING)) | ||
140 | break; | ||
141 | |||
142 | proto = gre_proto; | ||
143 | nhoff += 4; | ||
144 | if (gre_flags & GRE_CSUM) | ||
145 | nhoff += 4; | ||
146 | if (gre_flags & GRE_KEY) | ||
147 | nhoff += 4; | ||
148 | if (gre_flags & GRE_SEQ) | ||
149 | nhoff += 4; | ||
150 | |||
151 | if (proto == ETH_P_8021Q) { | ||
152 | proto = load_half(skb, | ||
153 | nhoff + offsetof(struct vlan_hdr, | ||
154 | h_vlan_encapsulated_proto)); | ||
155 | nhoff += sizeof(struct vlan_hdr); | ||
156 | } | ||
157 | |||
158 | if (proto == ETH_P_IP) | ||
159 | nhoff = parse_ip(skb, nhoff, &ip_proto, flow); | ||
160 | else if (proto == ETH_P_IPV6) | ||
161 | nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow); | ||
162 | else | ||
163 | return false; | ||
164 | break; | ||
165 | } | ||
166 | case IPPROTO_IPIP: | ||
167 | nhoff = parse_ip(skb, nhoff, &ip_proto, flow); | ||
168 | break; | ||
169 | case IPPROTO_IPV6: | ||
170 | nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow); | ||
171 | break; | ||
172 | default: | ||
173 | break; | ||
174 | } | ||
175 | |||
176 | flow->ip_proto = ip_proto; | ||
177 | poff = proto_ports_offset(ip_proto); | ||
178 | if (poff >= 0) { | ||
179 | nhoff += poff; | ||
180 | flow->ports = load_word(skb, nhoff); | ||
181 | } | ||
182 | |||
183 | flow->thoff = (__u16) nhoff; | ||
184 | |||
185 | return true; | ||
186 | } | ||
187 | |||
188 | struct pair { | ||
189 | long packets; | ||
190 | long bytes; | ||
191 | }; | ||
192 | |||
193 | struct { | ||
194 | __uint(type, BPF_MAP_TYPE_HASH); | ||
195 | __type(key, __be32); | ||
196 | __type(value, struct pair); | ||
197 | __uint(max_entries, 1024); | ||
198 | } hash_map SEC(".maps"); | ||
199 | |||
200 | SEC("socket2") | ||
201 | int bpf_prog2(struct __sk_buff *skb) | ||
202 | { | ||
203 | struct flow_key_record flow = {}; | ||
204 | struct pair *value; | ||
205 | u32 key; | ||
206 | |||
207 | if (!flow_dissector(skb, &flow)) | ||
208 | return 0; | ||
209 | |||
210 | key = flow.dst; | ||
211 | value = bpf_map_lookup_elem(&hash_map, &key); | ||
212 | if (value) { | ||
213 | __sync_fetch_and_add(&value->packets, 1); | ||
214 | __sync_fetch_and_add(&value->bytes, skb->len); | ||
215 | } else { | ||
216 | struct pair val = {1, skb->len}; | ||
217 | |||
218 | bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY); | ||
219 | } | ||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | char _license[] SEC("license") = "GPL"; | ||
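flow_dissector() above walks VLAN tags and GRE/IPIP/IPv6 encapsulation with the legacy load_half()/load_word() accessors, then records source, destination, ports and protocol; bpf_prog2 keys hash_map by the destination address and bumps the packet/byte counters with atomic adds when the entry already exists. Note that LD_ABS-style loads return values converted to host byte order, which is why the user-space side (sockex2_user.c, next) re-applies htonl() before handing the key to inet_ntoa(). A hedged user-space fragment that looks up a single destination in this map, showing the same byte-order handling; dump_one_dst() is illustrative:

    #include <stdio.h>
    #include <arpa/inet.h>
    #include <linux/types.h>
    #include <bpf/bpf.h>

    struct pair {
        long packets;
        long bytes;
    };

    static int dump_one_dst(int map_fd, const char *dotted_quad)
    {
        struct in_addr addr;
        struct pair value;
        __u32 key;

        if (inet_pton(AF_INET, dotted_quad, &addr) != 1)
            return -1;
        /* the BPF side stored the key via load_word(), i.e. in host
         * byte order, so flip the network-order address from inet_pton()
         */
        key = ntohl(addr.s_addr);
        if (bpf_map_lookup_elem(map_fd, &key, &value))
            return -1;
        printf("%s: %ld packets, %ld bytes\n", dotted_quad,
               value.packets, value.bytes);
        return 0;
    }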
diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c new file mode 100644 index 000000000..af925a5af --- /dev/null +++ b/samples/bpf/sockex2_user.c | |||
@@ -0,0 +1,57 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <stdio.h> | ||
3 | #include <assert.h> | ||
4 | #include <linux/bpf.h> | ||
5 | #include <bpf/bpf.h> | ||
6 | #include <bpf/libbpf.h> | ||
7 | #include "sock_example.h" | ||
8 | #include <unistd.h> | ||
9 | #include <arpa/inet.h> | ||
10 | #include <sys/resource.h> | ||
11 | |||
12 | struct pair { | ||
13 | __u64 packets; | ||
14 | __u64 bytes; | ||
15 | }; | ||
16 | |||
17 | int main(int ac, char **argv) | ||
18 | { | ||
19 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
20 | struct bpf_object *obj; | ||
21 | int map_fd, prog_fd; | ||
22 | char filename[256]; | ||
23 | int i, sock; | ||
24 | FILE *f; | ||
25 | |||
26 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
27 | setrlimit(RLIMIT_MEMLOCK, &r); | ||
28 | |||
29 | if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER, | ||
30 | &obj, &prog_fd)) | ||
31 | return 1; | ||
32 | |||
33 | map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map"); | ||
34 | |||
35 | sock = open_raw_sock("lo"); | ||
36 | |||
37 | assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, | ||
38 | sizeof(prog_fd)) == 0); | ||
39 | |||
40 | f = popen("ping -4 -c5 localhost", "r"); | ||
41 | (void) f; | ||
42 | |||
43 | for (i = 0; i < 5; i++) { | ||
44 | int key = 0, next_key; | ||
45 | struct pair value; | ||
46 | |||
47 | while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) { | ||
48 | bpf_map_lookup_elem(map_fd, &next_key, &value); | ||
49 | printf("ip %s bytes %lld packets %lld\n", | ||
50 | inet_ntoa((struct in_addr){htonl(next_key)}), | ||
51 | value.bytes, value.packets); | ||
52 | key = next_key; | ||
53 | } | ||
54 | sleep(1); | ||
55 | } | ||
56 | return 0; | ||
57 | } | ||
diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c new file mode 100644 index 000000000..b36350335 --- /dev/null +++ b/samples/bpf/sockex3_kern.c | |||
@@ -0,0 +1,293 @@ | |||
1 | /* Copyright (c) 2015 PLUMgrid, http://plumgrid.com | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <uapi/linux/bpf.h> | ||
8 | #include <uapi/linux/in.h> | ||
9 | #include <uapi/linux/if.h> | ||
10 | #include <uapi/linux/if_ether.h> | ||
11 | #include <uapi/linux/ip.h> | ||
12 | #include <uapi/linux/ipv6.h> | ||
13 | #include <uapi/linux/if_tunnel.h> | ||
14 | #include <uapi/linux/mpls.h> | ||
15 | #include <bpf/bpf_helpers.h> | ||
16 | #include "bpf_legacy.h" | ||
17 | #define IP_MF 0x2000 | ||
18 | #define IP_OFFSET 0x1FFF | ||
19 | |||
20 | #define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F | ||
21 | |||
22 | struct { | ||
23 | __uint(type, BPF_MAP_TYPE_PROG_ARRAY); | ||
24 | __uint(key_size, sizeof(u32)); | ||
25 | __uint(value_size, sizeof(u32)); | ||
26 | __uint(max_entries, 8); | ||
27 | } jmp_table SEC(".maps"); | ||
28 | |||
29 | #define PARSE_VLAN 1 | ||
30 | #define PARSE_MPLS 2 | ||
31 | #define PARSE_IP 3 | ||
32 | #define PARSE_IPV6 4 | ||
33 | |||
34 | /* Protocol dispatch routine. It tail-calls next BPF program depending | ||
35 | * on eth proto. Note, we could have used ... | ||
36 | * | ||
37 | * bpf_tail_call(skb, &jmp_table, proto); | ||
38 | * | ||
39 | * ... but it would need large prog_array and cannot be optimised given | ||
40 | * the map key is not static. | ||
41 | */ | ||
42 | static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto) | ||
43 | { | ||
44 | switch (proto) { | ||
45 | case ETH_P_8021Q: | ||
46 | case ETH_P_8021AD: | ||
47 | bpf_tail_call(skb, &jmp_table, PARSE_VLAN); | ||
48 | break; | ||
49 | case ETH_P_MPLS_UC: | ||
50 | case ETH_P_MPLS_MC: | ||
51 | bpf_tail_call(skb, &jmp_table, PARSE_MPLS); | ||
52 | break; | ||
53 | case ETH_P_IP: | ||
54 | bpf_tail_call(skb, &jmp_table, PARSE_IP); | ||
55 | break; | ||
56 | case ETH_P_IPV6: | ||
57 | bpf_tail_call(skb, &jmp_table, PARSE_IPV6); | ||
58 | break; | ||
59 | } | ||
60 | } | ||
61 | |||
62 | struct vlan_hdr { | ||
63 | __be16 h_vlan_TCI; | ||
64 | __be16 h_vlan_encapsulated_proto; | ||
65 | }; | ||
66 | |||
67 | struct flow_key_record { | ||
68 | __be32 src; | ||
69 | __be32 dst; | ||
70 | union { | ||
71 | __be32 ports; | ||
72 | __be16 port16[2]; | ||
73 | }; | ||
74 | __u32 ip_proto; | ||
75 | }; | ||
76 | |||
77 | static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff) | ||
78 | { | ||
79 | return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off)) | ||
80 | & (IP_MF | IP_OFFSET); | ||
81 | } | ||
82 | |||
83 | static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) | ||
84 | { | ||
85 | __u64 w0 = load_word(ctx, off); | ||
86 | __u64 w1 = load_word(ctx, off + 4); | ||
87 | __u64 w2 = load_word(ctx, off + 8); | ||
88 | __u64 w3 = load_word(ctx, off + 12); | ||
89 | |||
90 | return (__u32)(w0 ^ w1 ^ w2 ^ w3); | ||
91 | } | ||
92 | |||
93 | struct globals { | ||
94 | struct flow_key_record flow; | ||
95 | }; | ||
96 | |||
97 | struct { | ||
98 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
99 | __type(key, __u32); | ||
100 | __type(value, struct globals); | ||
101 | __uint(max_entries, 32); | ||
102 | } percpu_map SEC(".maps"); | ||
103 | |||
104 | /* use poor man's per_cpu until native support is ready */ | ||
105 | static struct globals *this_cpu_globals(void) | ||
106 | { | ||
107 | u32 key = bpf_get_smp_processor_id(); | ||
108 | |||
109 | return bpf_map_lookup_elem(&percpu_map, &key); | ||
110 | } | ||
111 | |||
112 | /* some simple stats for user space consumption */ | ||
113 | struct pair { | ||
114 | __u64 packets; | ||
115 | __u64 bytes; | ||
116 | }; | ||
117 | |||
118 | struct { | ||
119 | __uint(type, BPF_MAP_TYPE_HASH); | ||
120 | __type(key, struct flow_key_record); | ||
121 | __type(value, struct pair); | ||
122 | __uint(max_entries, 1024); | ||
123 | } hash_map SEC(".maps"); | ||
124 | |||
125 | static void update_stats(struct __sk_buff *skb, struct globals *g) | ||
126 | { | ||
127 | struct flow_key_record key = g->flow; | ||
128 | struct pair *value; | ||
129 | |||
130 | value = bpf_map_lookup_elem(&hash_map, &key); | ||
131 | if (value) { | ||
132 | __sync_fetch_and_add(&value->packets, 1); | ||
133 | __sync_fetch_and_add(&value->bytes, skb->len); | ||
134 | } else { | ||
135 | struct pair val = {1, skb->len}; | ||
136 | |||
137 | bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY); | ||
138 | } | ||
139 | } | ||
140 | |||
141 | static __always_inline void parse_ip_proto(struct __sk_buff *skb, | ||
142 | struct globals *g, __u32 ip_proto) | ||
143 | { | ||
144 | __u32 nhoff = skb->cb[0]; | ||
145 | int poff; | ||
146 | |||
147 | switch (ip_proto) { | ||
148 | case IPPROTO_GRE: { | ||
149 | struct gre_hdr { | ||
150 | __be16 flags; | ||
151 | __be16 proto; | ||
152 | }; | ||
153 | |||
154 | __u32 gre_flags = load_half(skb, | ||
155 | nhoff + offsetof(struct gre_hdr, flags)); | ||
156 | __u32 gre_proto = load_half(skb, | ||
157 | nhoff + offsetof(struct gre_hdr, proto)); | ||
158 | |||
159 | if (gre_flags & (GRE_VERSION|GRE_ROUTING)) | ||
160 | break; | ||
161 | |||
162 | nhoff += 4; | ||
163 | if (gre_flags & GRE_CSUM) | ||
164 | nhoff += 4; | ||
165 | if (gre_flags & GRE_KEY) | ||
166 | nhoff += 4; | ||
167 | if (gre_flags & GRE_SEQ) | ||
168 | nhoff += 4; | ||
169 | |||
170 | skb->cb[0] = nhoff; | ||
171 | parse_eth_proto(skb, gre_proto); | ||
172 | break; | ||
173 | } | ||
174 | case IPPROTO_IPIP: | ||
175 | parse_eth_proto(skb, ETH_P_IP); | ||
176 | break; | ||
177 | case IPPROTO_IPV6: | ||
178 | parse_eth_proto(skb, ETH_P_IPV6); | ||
179 | break; | ||
180 | case IPPROTO_TCP: | ||
181 | case IPPROTO_UDP: | ||
182 | g->flow.ports = load_word(skb, nhoff); | ||
183 | case IPPROTO_ICMP: | ||
184 | g->flow.ip_proto = ip_proto; | ||
185 | update_stats(skb, g); | ||
186 | break; | ||
187 | default: | ||
188 | break; | ||
189 | } | ||
190 | } | ||
191 | |||
192 | PROG(PARSE_IP)(struct __sk_buff *skb) | ||
193 | { | ||
194 | struct globals *g = this_cpu_globals(); | ||
195 | __u32 nhoff, verlen, ip_proto; | ||
196 | |||
197 | if (!g) | ||
198 | return 0; | ||
199 | |||
200 | nhoff = skb->cb[0]; | ||
201 | |||
202 | if (unlikely(ip_is_fragment(skb, nhoff))) | ||
203 | return 0; | ||
204 | |||
205 | ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol)); | ||
206 | |||
207 | if (ip_proto != IPPROTO_GRE) { | ||
208 | g->flow.src = load_word(skb, nhoff + offsetof(struct iphdr, saddr)); | ||
209 | g->flow.dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr)); | ||
210 | } | ||
211 | |||
212 | verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/); | ||
213 | nhoff += (verlen & 0xF) << 2; | ||
214 | |||
215 | skb->cb[0] = nhoff; | ||
216 | parse_ip_proto(skb, g, ip_proto); | ||
217 | return 0; | ||
218 | } | ||
219 | |||
220 | PROG(PARSE_IPV6)(struct __sk_buff *skb) | ||
221 | { | ||
222 | struct globals *g = this_cpu_globals(); | ||
223 | __u32 nhoff, ip_proto; | ||
224 | |||
225 | if (!g) | ||
226 | return 0; | ||
227 | |||
228 | nhoff = skb->cb[0]; | ||
229 | |||
230 | ip_proto = load_byte(skb, | ||
231 | nhoff + offsetof(struct ipv6hdr, nexthdr)); | ||
232 | g->flow.src = ipv6_addr_hash(skb, | ||
233 | nhoff + offsetof(struct ipv6hdr, saddr)); | ||
234 | g->flow.dst = ipv6_addr_hash(skb, | ||
235 | nhoff + offsetof(struct ipv6hdr, daddr)); | ||
236 | nhoff += sizeof(struct ipv6hdr); | ||
237 | |||
238 | skb->cb[0] = nhoff; | ||
239 | parse_ip_proto(skb, g, ip_proto); | ||
240 | return 0; | ||
241 | } | ||
242 | |||
243 | PROG(PARSE_VLAN)(struct __sk_buff *skb) | ||
244 | { | ||
245 | __u32 nhoff, proto; | ||
246 | |||
247 | nhoff = skb->cb[0]; | ||
248 | |||
249 | proto = load_half(skb, nhoff + offsetof(struct vlan_hdr, | ||
250 | h_vlan_encapsulated_proto)); | ||
251 | nhoff += sizeof(struct vlan_hdr); | ||
252 | skb->cb[0] = nhoff; | ||
253 | |||
254 | parse_eth_proto(skb, proto); | ||
255 | |||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | PROG(PARSE_MPLS)(struct __sk_buff *skb) | ||
260 | { | ||
261 | __u32 nhoff, label; | ||
262 | |||
263 | nhoff = skb->cb[0]; | ||
264 | |||
265 | label = load_word(skb, nhoff); | ||
266 | nhoff += sizeof(struct mpls_label); | ||
267 | skb->cb[0] = nhoff; | ||
268 | |||
269 | if (label & MPLS_LS_S_MASK) { | ||
270 | __u8 verlen = load_byte(skb, nhoff); | ||
271 | if ((verlen & 0xF0) == 4) | ||
272 | parse_eth_proto(skb, ETH_P_IP); | ||
273 | else | ||
274 | parse_eth_proto(skb, ETH_P_IPV6); | ||
275 | } else { | ||
276 | parse_eth_proto(skb, ETH_P_MPLS_UC); | ||
277 | } | ||
278 | |||
279 | return 0; | ||
280 | } | ||
281 | |||
282 | SEC("socket/0") | ||
283 | int main_prog(struct __sk_buff *skb) | ||
284 | { | ||
285 | __u32 nhoff = ETH_HLEN; | ||
286 | __u32 proto = load_half(skb, 12); | ||
287 | |||
288 | skb->cb[0] = nhoff; | ||
289 | parse_eth_proto(skb, proto); | ||
290 | return 0; | ||
291 | } | ||
292 | |||
293 | char _license[] SEC("license") = "GPL"; | ||
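sockex3_kern.c splits the parser across tail calls: main_prog stores the current parse offset in skb->cb[0], parse_eth_proto() maps the ethertype onto one of the small PARSE_* indices and tail-calls through jmp_table, and each parser updates cb[0] before dispatching again, so nested VLAN/MPLS/GRE/IPIP stacks are handled without loops (each layer costs one tail call). The jmp_table entries are installed from user space; sockex3_user.c (next) derives the index from the "socket/%d" section name, and a hedged minimal variant that fills the table from already-obtained program fds, with indices matching the PARSE_* defines above, is sketched here:

    #include <linux/bpf.h>
    #include <bpf/bpf.h>

    static int fill_jmp_table(int jmp_table_fd, int fd_vlan, int fd_mpls,
                              int fd_ip, int fd_ipv6)
    {
        struct { int key; int fd; } slots[] = {
            { 1 /* PARSE_VLAN */, fd_vlan },
            { 2 /* PARSE_MPLS */, fd_mpls },
            { 3 /* PARSE_IP   */, fd_ip   },
            { 4 /* PARSE_IPV6 */, fd_ipv6 },
        };
        int i;

        for (i = 0; i < 4; i++)
            if (bpf_map_update_elem(jmp_table_fd, &slots[i].key,
                                    &slots[i].fd, BPF_ANY))
                return -1;
        return 0;
    }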
diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c new file mode 100644 index 000000000..7793f6a6a --- /dev/null +++ b/samples/bpf/sockex3_user.c | |||
@@ -0,0 +1,106 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <stdio.h> | ||
3 | #include <assert.h> | ||
4 | #include <bpf/bpf.h> | ||
5 | #include <bpf/libbpf.h> | ||
6 | #include "sock_example.h" | ||
7 | #include <unistd.h> | ||
8 | #include <arpa/inet.h> | ||
9 | #include <sys/resource.h> | ||
10 | |||
11 | struct flow_key_record { | ||
12 | __be32 src; | ||
13 | __be32 dst; | ||
14 | union { | ||
15 | __be32 ports; | ||
16 | __be16 port16[2]; | ||
17 | }; | ||
18 | __u32 ip_proto; | ||
19 | }; | ||
20 | |||
21 | struct pair { | ||
22 | __u64 packets; | ||
23 | __u64 bytes; | ||
24 | }; | ||
25 | |||
26 | int main(int argc, char **argv) | ||
27 | { | ||
28 | int i, sock, key, fd, main_prog_fd, jmp_table_fd, hash_map_fd; | ||
29 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
30 | struct bpf_program *prog; | ||
31 | struct bpf_object *obj; | ||
32 | const char *section; | ||
33 | char filename[256]; | ||
34 | FILE *f; | ||
35 | |||
36 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
37 | setrlimit(RLIMIT_MEMLOCK, &r); | ||
38 | |||
39 | obj = bpf_object__open_file(filename, NULL); | ||
40 | if (libbpf_get_error(obj)) { | ||
41 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
42 | return 0; | ||
43 | } | ||
44 | |||
45 | /* load BPF program */ | ||
46 | if (bpf_object__load(obj)) { | ||
47 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
48 | goto cleanup; | ||
49 | } | ||
50 | |||
51 | jmp_table_fd = bpf_object__find_map_fd_by_name(obj, "jmp_table"); | ||
52 | hash_map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map"); | ||
53 | if (jmp_table_fd < 0 || hash_map_fd < 0) { | ||
54 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
55 | goto cleanup; | ||
56 | } | ||
57 | |||
58 | bpf_object__for_each_program(prog, obj) { | ||
59 | fd = bpf_program__fd(prog); | ||
60 | |||
61 | section = bpf_program__section_name(prog); | ||
62 | if (sscanf(section, "socket/%d", &key) != 1) { | ||
63 | fprintf(stderr, "ERROR: finding prog failed\n"); | ||
64 | goto cleanup; | ||
65 | } | ||
66 | |||
67 | if (key == 0) | ||
68 | main_prog_fd = fd; | ||
69 | else | ||
70 | bpf_map_update_elem(jmp_table_fd, &key, &fd, BPF_ANY); | ||
71 | } | ||
72 | |||
73 | sock = open_raw_sock("lo"); | ||
74 | |||
75 | /* attach BPF program to socket */ | ||
76 | assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &main_prog_fd, | ||
77 | sizeof(__u32)) == 0); | ||
78 | |||
79 | if (argc > 1) | ||
80 | f = popen("ping -4 -c5 localhost", "r"); | ||
81 | else | ||
82 | f = popen("netperf -l 4 localhost", "r"); | ||
83 | (void) f; | ||
84 | |||
85 | for (i = 0; i < 5; i++) { | ||
86 | struct flow_key_record key = {}, next_key; | ||
87 | struct pair value; | ||
88 | |||
89 | sleep(1); | ||
90 | printf("IP src.port -> dst.port bytes packets\n"); | ||
91 | while (bpf_map_get_next_key(hash_map_fd, &key, &next_key) == 0) { | ||
92 | bpf_map_lookup_elem(hash_map_fd, &next_key, &value); | ||
93 | printf("%s.%05d -> %s.%05d %12lld %12lld\n", | ||
94 | inet_ntoa((struct in_addr){htonl(next_key.src)}), | ||
95 | next_key.port16[0], | ||
96 | inet_ntoa((struct in_addr){htonl(next_key.dst)}), | ||
97 | next_key.port16[1], | ||
98 | value.bytes, value.packets); | ||
99 | key = next_key; | ||
100 | } | ||
101 | } | ||
102 | |||
103 | cleanup: | ||
104 | bpf_object__close(obj); | ||
105 | return 0; | ||
106 | } | ||
diff --git a/samples/bpf/spintest_kern.c b/samples/bpf/spintest_kern.c new file mode 100644 index 000000000..455da7731 --- /dev/null +++ b/samples/bpf/spintest_kern.c | |||
@@ -0,0 +1,69 @@ | |||
1 | /* Copyright (c) 2016, Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <linux/skbuff.h> | ||
8 | #include <linux/netdevice.h> | ||
9 | #include <linux/version.h> | ||
10 | #include <uapi/linux/bpf.h> | ||
11 | #include <uapi/linux/perf_event.h> | ||
12 | #include <bpf/bpf_helpers.h> | ||
13 | #include <bpf/bpf_tracing.h> | ||
14 | |||
15 | struct { | ||
16 | __uint(type, BPF_MAP_TYPE_HASH); | ||
17 | __type(key, long); | ||
18 | __type(value, long); | ||
19 | __uint(max_entries, 1024); | ||
20 | } my_map SEC(".maps"); | ||
21 | struct { | ||
22 | __uint(type, BPF_MAP_TYPE_PERCPU_HASH); | ||
23 | __uint(key_size, sizeof(long)); | ||
24 | __uint(value_size, sizeof(long)); | ||
25 | __uint(max_entries, 1024); | ||
26 | } my_map2 SEC(".maps"); | ||
27 | |||
28 | struct { | ||
29 | __uint(type, BPF_MAP_TYPE_STACK_TRACE); | ||
30 | __uint(key_size, sizeof(u32)); | ||
31 | __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); | ||
32 | __uint(max_entries, 10000); | ||
33 | } stackmap SEC(".maps"); | ||
34 | |||
35 | #define PROG(foo) \ | ||
36 | int foo(struct pt_regs *ctx) \ | ||
37 | { \ | ||
38 | long v = PT_REGS_IP(ctx), *val; \ | ||
39 | \ | ||
40 | val = bpf_map_lookup_elem(&my_map, &v); \ | ||
41 | bpf_map_update_elem(&my_map, &v, &v, BPF_ANY); \ | ||
42 | bpf_map_update_elem(&my_map2, &v, &v, BPF_ANY); \ | ||
43 | bpf_map_delete_elem(&my_map2, &v); \ | ||
44 | bpf_get_stackid(ctx, &stackmap, BPF_F_REUSE_STACKID); \ | ||
45 | return 0; \ | ||
46 | } | ||
47 | |||
48 | /* add kprobes to all possible *spin* functions */ | ||
49 | SEC("kprobe/spin_unlock")PROG(p1) | ||
50 | SEC("kprobe/spin_lock")PROG(p2) | ||
51 | SEC("kprobe/mutex_spin_on_owner")PROG(p3) | ||
52 | SEC("kprobe/rwsem_spin_on_owner")PROG(p4) | ||
53 | SEC("kprobe/spin_unlock_irqrestore")PROG(p5) | ||
54 | SEC("kprobe/_raw_spin_unlock_irqrestore")PROG(p6) | ||
55 | SEC("kprobe/_raw_spin_unlock_bh")PROG(p7) | ||
56 | SEC("kprobe/_raw_spin_unlock")PROG(p8) | ||
57 | SEC("kprobe/_raw_spin_lock_irqsave")PROG(p9) | ||
58 | SEC("kprobe/_raw_spin_trylock_bh")PROG(p10) | ||
59 | SEC("kprobe/_raw_spin_lock_irq")PROG(p11) | ||
60 | SEC("kprobe/_raw_spin_trylock")PROG(p12) | ||
61 | SEC("kprobe/_raw_spin_lock")PROG(p13) | ||
62 | SEC("kprobe/_raw_spin_lock_bh")PROG(p14) | ||
63 | /* and to inner bpf helpers */ | ||
64 | SEC("kprobe/htab_map_update_elem")PROG(p15) | ||
65 | SEC("kprobe/__htab_percpu_map_update_elem")PROG(p16) | ||
66 | SEC("kprobe/htab_map_alloc")PROG(p17) | ||
67 | |||
68 | char _license[] SEC("license") = "GPL"; | ||
69 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
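For reference, each SEC(...)PROG(pN) line above expands into one small kprobe handler; expanding PROG(p13) by hand gives roughly the following (a sketch of the preprocessor output, not a separate file in the tree):

SEC("kprobe/_raw_spin_lock")
int p13(struct pt_regs *ctx)
{
	long v = PT_REGS_IP(ctx), *val;

	/* record the probed instruction pointer and exercise the map
	 * helpers (which are themselves probed by p15-p17)
	 */
	val = bpf_map_lookup_elem(&my_map, &v);
	bpf_map_update_elem(&my_map, &v, &v, BPF_ANY);
	bpf_map_update_elem(&my_map2, &v, &v, BPF_ANY);
	bpf_map_delete_elem(&my_map2, &v);
	bpf_get_stackid(ctx, &stackmap, BPF_F_REUSE_STACKID);
	return 0;
}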
diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c new file mode 100644 index 000000000..f090d0dc6 --- /dev/null +++ b/samples/bpf/spintest_user.c | |||
@@ -0,0 +1,99 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <stdio.h> | ||
3 | #include <unistd.h> | ||
4 | #include <string.h> | ||
5 | #include <assert.h> | ||
6 | #include <sys/resource.h> | ||
7 | #include <bpf/libbpf.h> | ||
8 | #include <bpf/bpf.h> | ||
9 | #include "trace_helpers.h" | ||
10 | |||
11 | int main(int ac, char **argv) | ||
12 | { | ||
13 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
14 | char filename[256], symbol[256]; | ||
15 | struct bpf_object *obj = NULL; | ||
16 | struct bpf_link *links[20]; | ||
17 | long key, next_key, value; | ||
18 | struct bpf_program *prog; | ||
19 | int map_fd, i, j = 0; | ||
20 | const char *section; | ||
21 | struct ksym *sym; | ||
22 | |||
23 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
24 | perror("setrlimit(RLIMIT_MEMLOCK)"); | ||
25 | return 1; | ||
26 | } | ||
27 | |||
28 | if (load_kallsyms()) { | ||
29 | printf("failed to process /proc/kallsyms\n"); | ||
30 | return 2; | ||
31 | } | ||
32 | |||
33 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
34 | obj = bpf_object__open_file(filename, NULL); | ||
35 | if (libbpf_get_error(obj)) { | ||
36 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
37 | obj = NULL; | ||
38 | goto cleanup; | ||
39 | } | ||
40 | |||
41 | /* load BPF program */ | ||
42 | if (bpf_object__load(obj)) { | ||
43 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
44 | goto cleanup; | ||
45 | } | ||
46 | |||
47 | map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); | ||
48 | if (map_fd < 0) { | ||
49 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
50 | goto cleanup; | ||
51 | } | ||
52 | |||
53 | bpf_object__for_each_program(prog, obj) { | ||
54 | section = bpf_program__section_name(prog); | ||
55 | if (sscanf(section, "kprobe/%s", symbol) != 1) | ||
56 | continue; | ||
57 | |||
58 | /* Attach prog only when symbol exists */ | ||
59 | if (ksym_get_addr(symbol)) { | ||
60 | links[j] = bpf_program__attach(prog); | ||
61 | if (libbpf_get_error(links[j])) { | ||
62 | fprintf(stderr, "bpf_program__attach failed\n"); | ||
63 | links[j] = NULL; | ||
64 | goto cleanup; | ||
65 | } | ||
66 | j++; | ||
67 | } | ||
68 | } | ||
69 | |||
70 | for (i = 0; i < 5; i++) { | ||
71 | key = 0; | ||
72 | printf("kprobing funcs:"); | ||
73 | while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) { | ||
74 | bpf_map_lookup_elem(map_fd, &next_key, &value); | ||
75 | assert(next_key == value); | ||
76 | sym = ksym_search(value); | ||
77 | key = next_key; | ||
78 | if (!sym) { | ||
79 | printf("ksym not found. Is kallsyms loaded?\n"); | ||
80 | continue; | ||
81 | } | ||
82 | |||
83 | printf(" %s", sym->name); | ||
84 | } | ||
85 | if (key) | ||
86 | printf("\n"); | ||
87 | key = 0; | ||
88 | while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) | ||
89 | bpf_map_delete_elem(map_fd, &next_key); | ||
90 | sleep(1); | ||
91 | } | ||
92 | |||
93 | cleanup: | ||
94 | for (j--; j >= 0; j--) | ||
95 | bpf_link__destroy(links[j]); | ||
96 | |||
97 | bpf_object__close(obj); | ||
98 | return 0; | ||
99 | } | ||
diff --git a/samples/bpf/syscall_nrs.c b/samples/bpf/syscall_nrs.c new file mode 100644 index 000000000..88f940052 --- /dev/null +++ b/samples/bpf/syscall_nrs.c | |||
@@ -0,0 +1,19 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <uapi/linux/unistd.h> | ||
3 | #include <linux/kbuild.h> | ||
4 | |||
5 | #define SYSNR(_NR) DEFINE(SYS ## _NR, _NR) | ||
6 | |||
7 | void syscall_defines(void) | ||
8 | { | ||
9 | COMMENT("Linux system call numbers."); | ||
10 | SYSNR(__NR_write); | ||
11 | SYSNR(__NR_read); | ||
12 | #ifdef __NR_mmap2 | ||
13 | SYSNR(__NR_mmap2); | ||
14 | #endif | ||
15 | #ifdef __NR_mmap | ||
16 | SYSNR(__NR_mmap); | ||
17 | #endif | ||
18 | |||
19 | } | ||
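This file is not linked into a sample binary; it is built in the asm-offsets style, where the kbuild DEFINE()/COMMENT() macros end up as #defines in a generated header (syscall_nrs.h) that other samples in this directory include. On x86_64 the generated header would look roughly like this sketch (values are that architecture's syscall numbers; other architectures differ):

/* Sketch of the generated syscall_nrs.h on x86_64 (illustrative values,
 * taken from the architecture's unistd.h).
 */
#define SYS__NR_read 0	/* __NR_read */
#define SYS__NR_write 1	/* __NR_write */
#define SYS__NR_mmap 9	/* __NR_mmap */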
diff --git a/samples/bpf/syscall_tp_kern.c b/samples/bpf/syscall_tp_kern.c new file mode 100644 index 000000000..50231c2ef --- /dev/null +++ b/samples/bpf/syscall_tp_kern.c | |||
@@ -0,0 +1,73 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2017 Facebook | ||
3 | */ | ||
4 | #include <uapi/linux/bpf.h> | ||
5 | #include <bpf/bpf_helpers.h> | ||
6 | |||
7 | struct syscalls_enter_open_args { | ||
8 | unsigned long long unused; | ||
9 | long syscall_nr; | ||
10 | long filename_ptr; | ||
11 | long flags; | ||
12 | long mode; | ||
13 | }; | ||
14 | |||
15 | struct syscalls_exit_open_args { | ||
16 | unsigned long long unused; | ||
17 | long syscall_nr; | ||
18 | long ret; | ||
19 | }; | ||
20 | |||
21 | struct { | ||
22 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
23 | __type(key, u32); | ||
24 | __type(value, u32); | ||
25 | __uint(max_entries, 1); | ||
26 | } enter_open_map SEC(".maps"); | ||
27 | |||
28 | struct { | ||
29 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
30 | __type(key, u32); | ||
31 | __type(value, u32); | ||
32 | __uint(max_entries, 1); | ||
33 | } exit_open_map SEC(".maps"); | ||
34 | |||
35 | static __always_inline void count(void *map) | ||
36 | { | ||
37 | u32 key = 0; | ||
38 | u32 *value, init_val = 1; | ||
39 | |||
40 | value = bpf_map_lookup_elem(map, &key); | ||
41 | if (value) | ||
42 | *value += 1; | ||
43 | else | ||
44 | bpf_map_update_elem(map, &key, &init_val, BPF_NOEXIST); | ||
45 | } | ||
46 | |||
47 | SEC("tracepoint/syscalls/sys_enter_open") | ||
48 | int trace_enter_open(struct syscalls_enter_open_args *ctx) | ||
49 | { | ||
50 | count(&enter_open_map); | ||
51 | return 0; | ||
52 | } | ||
53 | |||
54 | SEC("tracepoint/syscalls/sys_enter_openat") | ||
55 | int trace_enter_open_at(struct syscalls_enter_open_args *ctx) | ||
56 | { | ||
57 | count(&enter_open_map); | ||
58 | return 0; | ||
59 | } | ||
60 | |||
61 | SEC("tracepoint/syscalls/sys_exit_open") | ||
62 | int trace_enter_exit(struct syscalls_exit_open_args *ctx) | ||
63 | { | ||
64 | count(&exit_open_map); | ||
65 | return 0; | ||
66 | } | ||
67 | |||
68 | SEC("tracepoint/syscalls/sys_exit_openat") | ||
69 | int trace_enter_exit_at(struct syscalls_exit_open_args *ctx) | ||
70 | { | ||
71 | count(&exit_open_map); | ||
72 | return 0; | ||
73 | } | ||
diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c new file mode 100644 index 000000000..76a1d0012 --- /dev/null +++ b/samples/bpf/syscall_tp_user.c | |||
@@ -0,0 +1,138 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2017 Facebook | ||
3 | */ | ||
4 | #include <stdio.h> | ||
5 | #include <unistd.h> | ||
6 | #include <fcntl.h> | ||
7 | #include <stdlib.h> | ||
8 | #include <string.h> | ||
9 | #include <linux/perf_event.h> | ||
10 | #include <errno.h> | ||
11 | #include <sys/resource.h> | ||
12 | #include <bpf/libbpf.h> | ||
13 | #include <bpf/bpf.h> | ||
14 | |||
15 | /* This program verifies bpf attachment to tracepoint sys_enter_* and sys_exit_*. | ||
16 | * This requires kernel CONFIG_FTRACE_SYSCALLS to be set. | ||
17 | */ | ||
18 | |||
19 | static void usage(const char *cmd) | ||
20 | { | ||
21 | printf("USAGE: %s [-i num_progs] [-h]\n", cmd); | ||
22 | printf(" -i num_progs # number of progs of the test\n"); | ||
23 | printf(" -h # help\n"); | ||
24 | } | ||
25 | |||
26 | static void verify_map(int map_id) | ||
27 | { | ||
28 | __u32 key = 0; | ||
29 | __u32 val; | ||
30 | |||
31 | if (bpf_map_lookup_elem(map_id, &key, &val) != 0) { | ||
32 | fprintf(stderr, "map_lookup failed: %s\n", strerror(errno)); | ||
33 | return; | ||
34 | } | ||
35 | if (val == 0) { | ||
36 | fprintf(stderr, "failed: map #%d returns value 0\n", map_id); | ||
37 | return; | ||
38 | } | ||
39 | val = 0; | ||
40 | if (bpf_map_update_elem(map_id, &key, &val, BPF_ANY) != 0) { | ||
41 | fprintf(stderr, "map_update failed: %s\n", strerror(errno)); | ||
42 | return; | ||
43 | } | ||
44 | } | ||
45 | |||
46 | static int test(char *filename, int num_progs) | ||
47 | { | ||
48 | int map0_fds[num_progs], map1_fds[num_progs], fd, i, j = 0; | ||
49 | struct bpf_link *links[num_progs * 4]; | ||
50 | struct bpf_object *objs[num_progs]; | ||
51 | struct bpf_program *prog; | ||
52 | |||
53 | for (i = 0; i < num_progs; i++) { | ||
54 | objs[i] = bpf_object__open_file(filename, NULL); | ||
55 | if (libbpf_get_error(objs[i])) { | ||
56 | fprintf(stderr, "opening BPF object file failed\n"); | ||
57 | objs[i] = NULL; | ||
58 | goto cleanup; | ||
59 | } | ||
60 | |||
61 | /* load BPF program */ | ||
62 | if (bpf_object__load(objs[i])) { | ||
63 | fprintf(stderr, "loading BPF object file failed\n"); | ||
64 | goto cleanup; | ||
65 | } | ||
66 | |||
67 | map0_fds[i] = bpf_object__find_map_fd_by_name(objs[i], | ||
68 | "enter_open_map"); | ||
69 | map1_fds[i] = bpf_object__find_map_fd_by_name(objs[i], | ||
70 | "exit_open_map"); | ||
71 | if (map0_fds[i] < 0 || map1_fds[i] < 0) { | ||
72 | fprintf(stderr, "finding a map in obj file failed\n"); | ||
73 | goto cleanup; | ||
74 | } | ||
75 | |||
76 | bpf_object__for_each_program(prog, objs[i]) { | ||
77 | links[j] = bpf_program__attach(prog); | ||
78 | if (libbpf_get_error(links[j])) { | ||
79 | fprintf(stderr, "bpf_program__attach failed\n"); | ||
80 | links[j] = NULL; | ||
81 | goto cleanup; | ||
82 | } | ||
83 | j++; | ||
84 | } | ||
85 | printf("prog #%d: map ids %d %d\n", i, map0_fds[i], map1_fds[i]); | ||
86 | } | ||
87 | |||
88 | /* libbpf's tracepoint attach uses perf_event_open with default | ||
89 | * pid = -1 and cpu = 0, which permits attached bpf execution on | ||
90 | * all cpus for all pids. bpf program execution ignores | ||
91 | * cpu affinity. | ||
92 | */ | ||
93 | /* trigger some "open" operations */ | ||
94 | fd = open(filename, O_RDONLY); | ||
95 | if (fd < 0) { | ||
96 | fprintf(stderr, "open failed: %s\n", strerror(errno)); | ||
97 | return 1; | ||
98 | } | ||
99 | close(fd); | ||
100 | |||
101 | /* verify the map */ | ||
102 | for (i = 0; i < num_progs; i++) { | ||
103 | verify_map(map0_fds[i]); | ||
104 | verify_map(map1_fds[i]); | ||
105 | } | ||
106 | |||
107 | cleanup: | ||
108 | for (j--; j >= 0; j--) | ||
109 | bpf_link__destroy(links[j]); | ||
110 | |||
111 | for (i--; i >= 0; i--) | ||
112 | bpf_object__close(objs[i]); | ||
113 | return 0; | ||
114 | } | ||
115 | |||
116 | int main(int argc, char **argv) | ||
117 | { | ||
118 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
119 | int opt, num_progs = 1; | ||
120 | char filename[256]; | ||
121 | |||
122 | while ((opt = getopt(argc, argv, "i:h")) != -1) { | ||
123 | switch (opt) { | ||
124 | case 'i': | ||
125 | num_progs = atoi(optarg); | ||
126 | break; | ||
127 | case 'h': | ||
128 | default: | ||
129 | usage(argv[0]); | ||
130 | return 0; | ||
131 | } | ||
132 | } | ||
133 | |||
134 | setrlimit(RLIMIT_MEMLOCK, &r); | ||
135 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
136 | |||
137 | return test(filename, num_progs); | ||
138 | } | ||
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c new file mode 100644 index 000000000..c821294e1 --- /dev/null +++ b/samples/bpf/task_fd_query_kern.c | |||
@@ -0,0 +1,19 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <linux/version.h> | ||
3 | #include <linux/ptrace.h> | ||
4 | #include <uapi/linux/bpf.h> | ||
5 | #include <bpf/bpf_helpers.h> | ||
6 | |||
7 | SEC("kprobe/blk_mq_start_request") | ||
8 | int bpf_prog1(struct pt_regs *ctx) | ||
9 | { | ||
10 | return 0; | ||
11 | } | ||
12 | |||
13 | SEC("kretprobe/blk_account_io_done") | ||
14 | int bpf_prog2(struct pt_regs *ctx) | ||
15 | { | ||
16 | return 0; | ||
17 | } | ||
18 | char _license[] SEC("license") = "GPL"; | ||
19 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c new file mode 100644 index 000000000..b68bd2f8f --- /dev/null +++ b/samples/bpf/task_fd_query_user.c | |||
@@ -0,0 +1,383 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | #include <stdio.h> | ||
4 | #include <stdlib.h> | ||
5 | #include <signal.h> | ||
6 | #include <unistd.h> | ||
7 | #include <stdbool.h> | ||
8 | #include <string.h> | ||
9 | #include <stdint.h> | ||
10 | #include <fcntl.h> | ||
11 | #include <linux/bpf.h> | ||
12 | #include <sys/ioctl.h> | ||
13 | #include <sys/resource.h> | ||
14 | #include <sys/types.h> | ||
15 | #include <sys/stat.h> | ||
16 | #include <linux/perf_event.h> | ||
17 | |||
18 | #include <bpf/libbpf.h> | ||
19 | #include "bpf_load.h" | ||
20 | #include "bpf_util.h" | ||
21 | #include "perf-sys.h" | ||
22 | #include "trace_helpers.h" | ||
23 | |||
24 | #define CHECK_PERROR_RET(condition) ({ \ | ||
25 | int __ret = !!(condition); \ | ||
26 | if (__ret) { \ | ||
27 | printf("FAIL: %s:\n", __func__); \ | ||
28 | perror(" "); \ | ||
29 | return -1; \ | ||
30 | } \ | ||
31 | }) | ||
32 | |||
33 | #define CHECK_AND_RET(condition) ({ \ | ||
34 | int __ret = !!(condition); \ | ||
35 | if (__ret) \ | ||
36 | return -1; \ | ||
37 | }) | ||
38 | |||
39 | static __u64 ptr_to_u64(void *ptr) | ||
40 | { | ||
41 | return (__u64) (unsigned long) ptr; | ||
42 | } | ||
43 | |||
44 | #define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type" | ||
45 | static int bpf_find_probe_type(const char *event_type) | ||
46 | { | ||
47 | char buf[256]; | ||
48 | int fd, ret; | ||
49 | |||
50 | ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type); | ||
51 | CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); | ||
52 | |||
53 | fd = open(buf, O_RDONLY); | ||
54 | CHECK_PERROR_RET(fd < 0); | ||
55 | |||
56 | ret = read(fd, buf, sizeof(buf)); | ||
57 | close(fd); | ||
58 | CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); | ||
59 | |||
60 | errno = 0; | ||
61 | ret = (int)strtol(buf, NULL, 10); | ||
62 | CHECK_PERROR_RET(errno); | ||
63 | return ret; | ||
64 | } | ||
65 | |||
66 | #define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe" | ||
67 | static int bpf_get_retprobe_bit(const char *event_type) | ||
68 | { | ||
69 | char buf[256]; | ||
70 | int fd, ret; | ||
71 | |||
72 | ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type); | ||
73 | CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); | ||
74 | |||
75 | fd = open(buf, O_RDONLY); | ||
76 | CHECK_PERROR_RET(fd < 0); | ||
77 | |||
78 | ret = read(fd, buf, sizeof(buf)); | ||
79 | close(fd); | ||
80 | CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf)); | ||
81 | CHECK_PERROR_RET(strlen(buf) < strlen("config:")); | ||
82 | |||
83 | errno = 0; | ||
84 | ret = (int)strtol(buf + strlen("config:"), NULL, 10); | ||
85 | CHECK_PERROR_RET(errno); | ||
86 | return ret; | ||
87 | } | ||
88 | |||
89 | static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name, | ||
90 | __u32 expected_fd_type) | ||
91 | { | ||
92 | __u64 probe_offset, probe_addr; | ||
93 | __u32 len, prog_id, fd_type; | ||
94 | char buf[256]; | ||
95 | int err; | ||
96 | |||
97 | len = sizeof(buf); | ||
98 | err = bpf_task_fd_query(getpid(), event_fd[prog_fd_idx], 0, buf, &len, | ||
99 | &prog_id, &fd_type, &probe_offset, | ||
100 | &probe_addr); | ||
101 | if (err < 0) { | ||
102 | printf("FAIL: %s, for event_fd idx %d, fn_name %s\n", | ||
103 | __func__, prog_fd_idx, fn_name); | ||
104 | perror(" :"); | ||
105 | return -1; | ||
106 | } | ||
107 | if (strcmp(buf, fn_name) != 0 || | ||
108 | fd_type != expected_fd_type || | ||
109 | probe_offset != 0x0 || probe_addr != 0x0) { | ||
110 | printf("FAIL: bpf_trace_event_query(event_fd[%d]):\n", | ||
111 | prog_fd_idx); | ||
112 | printf("buf: %s, fd_type: %u, probe_offset: 0x%llx," | ||
113 | " probe_addr: 0x%llx\n", | ||
114 | buf, fd_type, probe_offset, probe_addr); | ||
115 | return -1; | ||
116 | } | ||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | static int test_nondebug_fs_kuprobe_common(const char *event_type, | ||
121 | const char *name, __u64 offset, __u64 addr, bool is_return, | ||
122 | char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, | ||
123 | __u64 *probe_offset, __u64 *probe_addr) | ||
124 | { | ||
125 | int is_return_bit = bpf_get_retprobe_bit(event_type); | ||
126 | int type = bpf_find_probe_type(event_type); | ||
127 | struct perf_event_attr attr = {}; | ||
128 | int fd; | ||
129 | |||
130 | if (type < 0 || is_return_bit < 0) { | ||
131 | printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n", | ||
132 | __func__, type, is_return_bit); | ||
133 | return -1; | ||
134 | } | ||
135 | |||
136 | attr.sample_period = 1; | ||
137 | attr.wakeup_events = 1; | ||
138 | if (is_return) | ||
139 | attr.config |= 1 << is_return_bit; | ||
140 | |||
141 | if (name) { | ||
142 | attr.config1 = ptr_to_u64((void *)name); | ||
143 | attr.config2 = offset; | ||
144 | } else { | ||
145 | attr.config1 = 0; | ||
146 | attr.config2 = addr; | ||
147 | } | ||
148 | attr.size = sizeof(attr); | ||
149 | attr.type = type; | ||
150 | |||
151 | fd = sys_perf_event_open(&attr, -1, 0, -1, 0); | ||
152 | CHECK_PERROR_RET(fd < 0); | ||
153 | |||
154 | CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0); | ||
155 | CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0); | ||
156 | CHECK_PERROR_RET(bpf_task_fd_query(getpid(), fd, 0, buf, buf_len, | ||
157 | prog_id, fd_type, probe_offset, probe_addr) < 0); | ||
158 | |||
159 | return 0; | ||
160 | } | ||
161 | |||
162 | static int test_nondebug_fs_probe(const char *event_type, const char *name, | ||
163 | __u64 offset, __u64 addr, bool is_return, | ||
164 | __u32 expected_fd_type, | ||
165 | __u32 expected_ret_fd_type, | ||
166 | char *buf, __u32 buf_len) | ||
167 | { | ||
168 | __u64 probe_offset, probe_addr; | ||
169 | __u32 prog_id, fd_type; | ||
170 | int err; | ||
171 | |||
172 | err = test_nondebug_fs_kuprobe_common(event_type, name, | ||
173 | offset, addr, is_return, | ||
174 | buf, &buf_len, &prog_id, | ||
175 | &fd_type, &probe_offset, | ||
176 | &probe_addr); | ||
177 | if (err < 0) { | ||
178 | printf("FAIL: %s, " | ||
179 | "for name %s, offset 0x%llx, addr 0x%llx, is_return %d\n", | ||
180 | __func__, name ? name : "", offset, addr, is_return); | ||
181 | perror(" :"); | ||
182 | return -1; | ||
183 | } | ||
184 | if ((is_return && fd_type != expected_ret_fd_type) || | ||
185 | (!is_return && fd_type != expected_fd_type)) { | ||
186 | printf("FAIL: %s, incorrect fd_type %u\n", | ||
187 | __func__, fd_type); | ||
188 | return -1; | ||
189 | } | ||
190 | if (name) { | ||
191 | if (strcmp(name, buf) != 0) { | ||
192 | printf("FAIL: %s, incorrect buf %s\n", __func__, buf); | ||
193 | return -1; | ||
194 | } | ||
195 | if (probe_offset != offset) { | ||
196 | printf("FAIL: %s, incorrect probe_offset 0x%llx\n", | ||
197 | __func__, probe_offset); | ||
198 | return -1; | ||
199 | } | ||
200 | } else { | ||
201 | if (buf_len != 0) { | ||
202 | printf("FAIL: %s, incorrect buf %p\n", | ||
203 | __func__, buf); | ||
204 | return -1; | ||
205 | } | ||
206 | |||
207 | if (probe_addr != addr) { | ||
208 | printf("FAIL: %s, incorrect probe_addr 0x%llx\n", | ||
209 | __func__, probe_addr); | ||
210 | return -1; | ||
211 | } | ||
212 | } | ||
213 | return 0; | ||
214 | } | ||
215 | |||
216 | static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return) | ||
217 | { | ||
218 | const char *event_type = "uprobe"; | ||
219 | struct perf_event_attr attr = {}; | ||
220 | char buf[256], event_alias[sizeof("test_1234567890")]; | ||
221 | __u64 probe_offset, probe_addr; | ||
222 | __u32 len, prog_id, fd_type; | ||
223 | int err, res, kfd, efd; | ||
224 | ssize_t bytes; | ||
225 | |||
226 | snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", | ||
227 | event_type); | ||
228 | kfd = open(buf, O_WRONLY | O_APPEND, 0); | ||
229 | CHECK_PERROR_RET(kfd < 0); | ||
230 | |||
231 | res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid()); | ||
232 | CHECK_PERROR_RET(res < 0 || res >= sizeof(event_alias)); | ||
233 | |||
234 | res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx", | ||
235 | is_return ? 'r' : 'p', event_type, event_alias, | ||
236 | binary_path, offset); | ||
237 | CHECK_PERROR_RET(res < 0 || res >= sizeof(buf)); | ||
238 | CHECK_PERROR_RET(write(kfd, buf, strlen(buf)) < 0); | ||
239 | |||
240 | close(kfd); | ||
241 | kfd = -1; | ||
242 | |||
243 | snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s/id", | ||
244 | event_type, event_alias); | ||
245 | efd = open(buf, O_RDONLY, 0); | ||
246 | CHECK_PERROR_RET(efd < 0); | ||
247 | |||
248 | bytes = read(efd, buf, sizeof(buf)); | ||
249 | CHECK_PERROR_RET(bytes <= 0 || bytes >= sizeof(buf)); | ||
250 | close(efd); | ||
251 | buf[bytes] = '\0'; | ||
252 | |||
253 | attr.config = strtol(buf, NULL, 0); | ||
254 | attr.type = PERF_TYPE_TRACEPOINT; | ||
255 | attr.sample_period = 1; | ||
256 | attr.wakeup_events = 1; | ||
257 | kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); | ||
258 | CHECK_PERROR_RET(kfd < 0); | ||
259 | CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0); | ||
260 | CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_ENABLE, 0) < 0); | ||
261 | |||
262 | len = sizeof(buf); | ||
263 | err = bpf_task_fd_query(getpid(), kfd, 0, buf, &len, | ||
264 | &prog_id, &fd_type, &probe_offset, | ||
265 | &probe_addr); | ||
266 | if (err < 0) { | ||
267 | printf("FAIL: %s, binary_path %s\n", __func__, binary_path); | ||
268 | perror(" :"); | ||
269 | return -1; | ||
270 | } | ||
271 | if ((is_return && fd_type != BPF_FD_TYPE_URETPROBE) || | ||
272 | (!is_return && fd_type != BPF_FD_TYPE_UPROBE)) { | ||
273 | printf("FAIL: %s, incorrect fd_type %u\n", __func__, | ||
274 | fd_type); | ||
275 | return -1; | ||
276 | } | ||
277 | if (strcmp(binary_path, buf) != 0) { | ||
278 | printf("FAIL: %s, incorrect buf %s\n", __func__, buf); | ||
279 | return -1; | ||
280 | } | ||
281 | if (probe_offset != offset) { | ||
282 | printf("FAIL: %s, incorrect probe_offset 0x%llx\n", __func__, | ||
283 | probe_offset); | ||
284 | return -1; | ||
285 | } | ||
286 | |||
287 | close(kfd); | ||
288 | return 0; | ||
289 | } | ||
290 | |||
291 | int main(int argc, char **argv) | ||
292 | { | ||
293 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
294 | extern char __executable_start; | ||
295 | char filename[256], buf[256]; | ||
296 | __u64 uprobe_file_offset; | ||
297 | |||
298 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
299 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
300 | perror("setrlimit(RLIMIT_MEMLOCK)"); | ||
301 | return 1; | ||
302 | } | ||
303 | |||
304 | if (load_kallsyms()) { | ||
305 | printf("failed to process /proc/kallsyms\n"); | ||
306 | return 1; | ||
307 | } | ||
308 | |||
309 | if (load_bpf_file(filename)) { | ||
310 | printf("%s", bpf_log_buf); | ||
311 | return 1; | ||
312 | } | ||
313 | |||
314 | /* test two functions in the corresponding *_kern.c file */ | ||
315 | CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_mq_start_request", | ||
316 | BPF_FD_TYPE_KPROBE)); | ||
317 | CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_done", | ||
318 | BPF_FD_TYPE_KRETPROBE)); | ||
319 | |||
320 | /* test nondebug fs kprobe */ | ||
321 | CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0, | ||
322 | false, BPF_FD_TYPE_KPROBE, | ||
323 | BPF_FD_TYPE_KRETPROBE, | ||
324 | buf, sizeof(buf))); | ||
325 | #ifdef __x86_64__ | ||
326 | /* set a kprobe on "bpf_check + 0x5", which is x64 specific */ | ||
327 | CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x5, 0x0, | ||
328 | false, BPF_FD_TYPE_KPROBE, | ||
329 | BPF_FD_TYPE_KRETPROBE, | ||
330 | buf, sizeof(buf))); | ||
331 | #endif | ||
332 | CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0, | ||
333 | true, BPF_FD_TYPE_KPROBE, | ||
334 | BPF_FD_TYPE_KRETPROBE, | ||
335 | buf, sizeof(buf))); | ||
336 | CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, | ||
337 | ksym_get_addr("bpf_check"), false, | ||
338 | BPF_FD_TYPE_KPROBE, | ||
339 | BPF_FD_TYPE_KRETPROBE, | ||
340 | buf, sizeof(buf))); | ||
341 | CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, | ||
342 | ksym_get_addr("bpf_check"), false, | ||
343 | BPF_FD_TYPE_KPROBE, | ||
344 | BPF_FD_TYPE_KRETPROBE, | ||
345 | NULL, 0)); | ||
346 | CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, | ||
347 | ksym_get_addr("bpf_check"), true, | ||
348 | BPF_FD_TYPE_KPROBE, | ||
349 | BPF_FD_TYPE_KRETPROBE, | ||
350 | buf, sizeof(buf))); | ||
351 | CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0, | ||
352 | ksym_get_addr("bpf_check"), true, | ||
353 | BPF_FD_TYPE_KPROBE, | ||
354 | BPF_FD_TYPE_KRETPROBE, | ||
355 | 0, 0)); | ||
356 | |||
357 | /* test nondebug fs uprobe */ | ||
358 | /* the calculation of uprobe file offset is based on gcc 7.3.1 on x64 | ||
359 | * and the default linker script, which defines __executable_start as | ||
360 | * the start of the .text section. The calculation could be different | ||
361 | * on different systems with different compilers. The right way is | ||
362 | * to parse the ELF file. We took a shortcut here. | ||
363 | */ | ||
364 | uprobe_file_offset = (__u64)main - (__u64)&__executable_start; | ||
365 | CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0], | ||
366 | uprobe_file_offset, 0x0, false, | ||
367 | BPF_FD_TYPE_UPROBE, | ||
368 | BPF_FD_TYPE_URETPROBE, | ||
369 | buf, sizeof(buf))); | ||
370 | CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0], | ||
371 | uprobe_file_offset, 0x0, true, | ||
372 | BPF_FD_TYPE_UPROBE, | ||
373 | BPF_FD_TYPE_URETPROBE, | ||
374 | buf, sizeof(buf))); | ||
375 | |||
376 | /* test debug fs uprobe */ | ||
377 | CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset, | ||
378 | false)); | ||
379 | CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset, | ||
380 | true)); | ||
381 | |||
382 | return 0; | ||
383 | } | ||
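The comment before uprobe_file_offset above notes that the robust way to obtain a uprobe file offset is to parse the ELF file rather than rely on __executable_start. A minimal sketch of that translation, assuming a 64-bit ELF and a link-time virtual address as input (e.g. a symbol's st_value), could look like this:

/* Minimal sketch: map a link-time virtual address to the file offset
 * expected by uprobe_events / perf_event_attr.config2, by walking the
 * ELF program headers. Assumes a 64-bit ELF.
 */
#include <elf.h>
#include <stdio.h>
#include <stdint.h>

static long vaddr_to_file_offset(const char *path, uint64_t vaddr)
{
	Elf64_Ehdr ehdr;
	Elf64_Phdr phdr;
	FILE *f = fopen(path, "rb");
	int i;

	if (!f)
		return -1;
	if (fread(&ehdr, sizeof(ehdr), 1, f) != 1)
		goto out_err;
	for (i = 0; i < ehdr.e_phnum; i++) {
		if (fseek(f, ehdr.e_phoff + i * ehdr.e_phentsize, SEEK_SET) ||
		    fread(&phdr, sizeof(phdr), 1, f) != 1)
			goto out_err;
		if (phdr.p_type == PT_LOAD && vaddr >= phdr.p_vaddr &&
		    vaddr < phdr.p_vaddr + phdr.p_filesz) {
			fclose(f);
			return (long)(vaddr - phdr.p_vaddr + phdr.p_offset);
		}
	}
out_err:
	fclose(f);
	return -1;
}

For a position-independent executable the runtime address of a function differs from its link-time st_value by the load bias; the main - __executable_start shortcut used above cancels that bias (and assumes the first loadable segment starts at file offset 0), which is exactly the simplification the comment warns about.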
diff --git a/samples/bpf/tc_l2_redirect.sh b/samples/bpf/tc_l2_redirect.sh new file mode 100755 index 000000000..37d95ef3c --- /dev/null +++ b/samples/bpf/tc_l2_redirect.sh | |||
@@ -0,0 +1,174 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | |||
4 | [[ -z $TC ]] && TC='tc' | ||
5 | [[ -z $IP ]] && IP='ip' | ||
6 | |||
7 | REDIRECT_USER='./tc_l2_redirect' | ||
8 | REDIRECT_BPF='./tc_l2_redirect_kern.o' | ||
9 | |||
10 | RP_FILTER=$(< /proc/sys/net/ipv4/conf/all/rp_filter) | ||
11 | IPV6_FORWARDING=$(< /proc/sys/net/ipv6/conf/all/forwarding) | ||
12 | |||
13 | function config_common { | ||
14 | local tun_type=$1 | ||
15 | |||
16 | $IP netns add ns1 | ||
17 | $IP netns add ns2 | ||
18 | $IP link add ve1 type veth peer name vens1 | ||
19 | $IP link add ve2 type veth peer name vens2 | ||
20 | $IP link set dev ve1 up | ||
21 | $IP link set dev ve2 up | ||
22 | $IP link set dev ve1 mtu 1500 | ||
23 | $IP link set dev ve2 mtu 1500 | ||
24 | $IP link set dev vens1 netns ns1 | ||
25 | $IP link set dev vens2 netns ns2 | ||
26 | |||
27 | $IP -n ns1 link set dev lo up | ||
28 | $IP -n ns1 link set dev vens1 up | ||
29 | $IP -n ns1 addr add 10.1.1.101/24 dev vens1 | ||
30 | $IP -n ns1 addr add 2401:db01::65/64 dev vens1 nodad | ||
31 | $IP -n ns1 route add default via 10.1.1.1 dev vens1 | ||
32 | $IP -n ns1 route add default via 2401:db01::1 dev vens1 | ||
33 | |||
34 | $IP -n ns2 link set dev lo up | ||
35 | $IP -n ns2 link set dev vens2 up | ||
36 | $IP -n ns2 addr add 10.2.1.102/24 dev vens2 | ||
37 | $IP -n ns2 addr add 2401:db02::66/64 dev vens2 nodad | ||
38 | $IP -n ns2 addr add 10.10.1.102 dev lo | ||
39 | $IP -n ns2 addr add 2401:face::66/64 dev lo nodad | ||
40 | $IP -n ns2 link add ipt2 type ipip local 10.2.1.102 remote 10.2.1.1 | ||
41 | $IP -n ns2 link add ip6t2 type ip6tnl mode any local 2401:db02::66 remote 2401:db02::1 | ||
42 | $IP -n ns2 link set dev ipt2 up | ||
43 | $IP -n ns2 link set dev ip6t2 up | ||
44 | $IP netns exec ns2 $TC qdisc add dev vens2 clsact | ||
45 | $IP netns exec ns2 $TC filter add dev vens2 ingress bpf da obj $REDIRECT_BPF sec drop_non_tun_vip | ||
46 | if [[ $tun_type == "ipip" ]]; then | ||
47 | $IP -n ns2 route add 10.1.1.0/24 dev ipt2 | ||
48 | $IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0 | ||
49 | $IP netns exec ns2 sysctl -q -w net.ipv4.conf.ipt2.rp_filter=0 | ||
50 | else | ||
51 | $IP -n ns2 route add 10.1.1.0/24 dev ip6t2 | ||
52 | $IP -n ns2 route add 2401:db01::/64 dev ip6t2 | ||
53 | $IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0 | ||
54 | $IP netns exec ns2 sysctl -q -w net.ipv4.conf.ip6t2.rp_filter=0 | ||
55 | fi | ||
56 | |||
57 | $IP addr add 10.1.1.1/24 dev ve1 | ||
58 | $IP addr add 2401:db01::1/64 dev ve1 nodad | ||
59 | $IP addr add 10.2.1.1/24 dev ve2 | ||
60 | $IP addr add 2401:db02::1/64 dev ve2 nodad | ||
61 | |||
62 | $TC qdisc add dev ve2 clsact | ||
63 | $TC filter add dev ve2 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_forward | ||
64 | |||
65 | sysctl -q -w net.ipv4.conf.all.rp_filter=0 | ||
66 | sysctl -q -w net.ipv6.conf.all.forwarding=1 | ||
67 | } | ||
68 | |||
69 | function cleanup { | ||
70 | set +e | ||
71 | [[ -z $DEBUG ]] || set +x | ||
72 | $IP netns delete ns1 >& /dev/null | ||
73 | $IP netns delete ns2 >& /dev/null | ||
74 | $IP link del ve1 >& /dev/null | ||
75 | $IP link del ve2 >& /dev/null | ||
76 | $IP link del ipt >& /dev/null | ||
77 | $IP link del ip6t >& /dev/null | ||
78 | sysctl -q -w net.ipv4.conf.all.rp_filter=$RP_FILTER | ||
79 | sysctl -q -w net.ipv6.conf.all.forwarding=$IPV6_FORWARDING | ||
80 | rm -f /sys/fs/bpf/tc/globals/tun_iface | ||
81 | [[ -z $DEBUG ]] || set -x | ||
82 | set -e | ||
83 | } | ||
84 | |||
85 | function l2_to_ipip { | ||
86 | echo -n "l2_to_ipip $1: " | ||
87 | |||
88 | local dir=$1 | ||
89 | |||
90 | config_common ipip | ||
91 | |||
92 | $IP link add ipt type ipip external | ||
93 | $IP link set dev ipt up | ||
94 | sysctl -q -w net.ipv4.conf.ipt.rp_filter=0 | ||
95 | sysctl -q -w net.ipv4.conf.ipt.forwarding=1 | ||
96 | |||
97 | if [[ $dir == "egress" ]]; then | ||
98 | $IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2 | ||
99 | $TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect | ||
100 | sysctl -q -w net.ipv4.conf.ve1.forwarding=1 | ||
101 | else | ||
102 | $TC qdisc add dev ve1 clsact | ||
103 | $TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect | ||
104 | fi | ||
105 | |||
106 | $REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ipt/ifindex) | ||
107 | |||
108 | $IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null | ||
109 | |||
110 | if [[ $dir == "egress" ]]; then | ||
111 | # test direct egress to ve2 (i.e. not forwarding from | ||
112 | # ve1 to ve2). | ||
113 | ping -c1 10.10.1.102 >& /dev/null | ||
114 | fi | ||
115 | |||
116 | cleanup | ||
117 | |||
118 | echo "OK" | ||
119 | } | ||
120 | |||
121 | function l2_to_ip6tnl { | ||
122 | echo -n "l2_to_ip6tnl $1: " | ||
123 | |||
124 | local dir=$1 | ||
125 | |||
126 | config_common ip6tnl | ||
127 | |||
128 | $IP link add ip6t type ip6tnl mode any external | ||
129 | $IP link set dev ip6t up | ||
130 | sysctl -q -w net.ipv4.conf.ip6t.rp_filter=0 | ||
131 | sysctl -q -w net.ipv4.conf.ip6t.forwarding=1 | ||
132 | |||
133 | if [[ $dir == "egress" ]]; then | ||
134 | $IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2 | ||
135 | $IP route add 2401:face::/64 via 2401:db02::66 dev ve2 | ||
136 | $TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect | ||
137 | sysctl -q -w net.ipv4.conf.ve1.forwarding=1 | ||
138 | else | ||
139 | $TC qdisc add dev ve1 clsact | ||
140 | $TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect | ||
141 | fi | ||
142 | |||
143 | $REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ip6t/ifindex) | ||
144 | |||
145 | $IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null | ||
146 | $IP netns exec ns1 ping -6 -c1 2401:face::66 >& /dev/null | ||
147 | |||
148 | if [[ $dir == "egress" ]]; then | ||
149 | # test direct egress to ve2 (i.e. not forwarding from | ||
150 | # ve1 to ve2). | ||
151 | ping -c1 10.10.1.102 >& /dev/null | ||
152 | ping -6 -c1 2401:face::66 >& /dev/null | ||
153 | fi | ||
154 | |||
155 | cleanup | ||
156 | |||
157 | echo "OK" | ||
158 | } | ||
159 | |||
160 | cleanup | ||
161 | test_names="l2_to_ipip l2_to_ip6tnl" | ||
162 | test_dirs="ingress egress" | ||
163 | if [[ $# -ge 2 ]]; then | ||
164 | test_names=$1 | ||
165 | test_dirs=$2 | ||
166 | elif [[ $# -ge 1 ]]; then | ||
167 | test_names=$1 | ||
168 | fi | ||
169 | |||
170 | for t in $test_names; do | ||
171 | for d in $test_dirs; do | ||
172 | $t $d | ||
173 | done | ||
174 | done | ||
diff --git a/samples/bpf/tc_l2_redirect_kern.c b/samples/bpf/tc_l2_redirect_kern.c new file mode 100644 index 000000000..fd2fa0004 --- /dev/null +++ b/samples/bpf/tc_l2_redirect_kern.c | |||
@@ -0,0 +1,237 @@ | |||
1 | /* Copyright (c) 2016 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #define KBUILD_MODNAME "foo" | ||
8 | #include <uapi/linux/bpf.h> | ||
9 | #include <uapi/linux/if_ether.h> | ||
10 | #include <uapi/linux/if_packet.h> | ||
11 | #include <uapi/linux/ip.h> | ||
12 | #include <uapi/linux/ipv6.h> | ||
13 | #include <uapi/linux/in.h> | ||
14 | #include <uapi/linux/tcp.h> | ||
15 | #include <uapi/linux/filter.h> | ||
16 | #include <uapi/linux/pkt_cls.h> | ||
17 | #include <net/ipv6.h> | ||
18 | #include <bpf/bpf_helpers.h> | ||
19 | |||
20 | #define _htonl __builtin_bswap32 | ||
21 | |||
22 | #define PIN_GLOBAL_NS 2 | ||
23 | struct bpf_elf_map { | ||
24 | __u32 type; | ||
25 | __u32 size_key; | ||
26 | __u32 size_value; | ||
27 | __u32 max_elem; | ||
28 | __u32 flags; | ||
29 | __u32 id; | ||
30 | __u32 pinning; | ||
31 | }; | ||
32 | |||
33 | /* copy of 'struct ethhdr' without __packed */ | ||
34 | struct eth_hdr { | ||
35 | unsigned char h_dest[ETH_ALEN]; | ||
36 | unsigned char h_source[ETH_ALEN]; | ||
37 | unsigned short h_proto; | ||
38 | }; | ||
39 | |||
40 | struct bpf_elf_map SEC("maps") tun_iface = { | ||
41 | .type = BPF_MAP_TYPE_ARRAY, | ||
42 | .size_key = sizeof(int), | ||
43 | .size_value = sizeof(int), | ||
44 | .pinning = PIN_GLOBAL_NS, | ||
45 | .max_elem = 1, | ||
46 | }; | ||
47 | |||
48 | static __always_inline bool is_vip_addr(__be16 eth_proto, __be32 daddr) | ||
49 | { | ||
50 | if (eth_proto == htons(ETH_P_IP)) | ||
51 | return (_htonl(0xffffff00) & daddr) == _htonl(0x0a0a0100); | ||
52 | else if (eth_proto == htons(ETH_P_IPV6)) | ||
53 | return (daddr == _htonl(0x2401face)); | ||
54 | |||
55 | return false; | ||
56 | } | ||
57 | |||
58 | SEC("l2_to_iptun_ingress_forward") | ||
59 | int _l2_to_iptun_ingress_forward(struct __sk_buff *skb) | ||
60 | { | ||
61 | struct bpf_tunnel_key tkey = {}; | ||
62 | void *data = (void *)(long)skb->data; | ||
63 | struct eth_hdr *eth = data; | ||
64 | void *data_end = (void *)(long)skb->data_end; | ||
65 | int key = 0, *ifindex; | ||
66 | |||
67 | int ret; | ||
68 | |||
69 | if (data + sizeof(*eth) > data_end) | ||
70 | return TC_ACT_OK; | ||
71 | |||
72 | ifindex = bpf_map_lookup_elem(&tun_iface, &key); | ||
73 | if (!ifindex) | ||
74 | return TC_ACT_OK; | ||
75 | |||
76 | if (eth->h_proto == htons(ETH_P_IP)) { | ||
77 | char fmt4[] = "ingress forward to ifindex:%d daddr4:%x\n"; | ||
78 | struct iphdr *iph = data + sizeof(*eth); | ||
79 | |||
80 | if (data + sizeof(*eth) + sizeof(*iph) > data_end) | ||
81 | return TC_ACT_OK; | ||
82 | |||
83 | if (iph->protocol != IPPROTO_IPIP) | ||
84 | return TC_ACT_OK; | ||
85 | |||
86 | bpf_trace_printk(fmt4, sizeof(fmt4), *ifindex, | ||
87 | _htonl(iph->daddr)); | ||
88 | return bpf_redirect(*ifindex, BPF_F_INGRESS); | ||
89 | } else if (eth->h_proto == htons(ETH_P_IPV6)) { | ||
90 | char fmt6[] = "ingress forward to ifindex:%d daddr6:%x::%x\n"; | ||
91 | struct ipv6hdr *ip6h = data + sizeof(*eth); | ||
92 | |||
93 | if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) | ||
94 | return TC_ACT_OK; | ||
95 | |||
96 | if (ip6h->nexthdr != IPPROTO_IPIP && | ||
97 | ip6h->nexthdr != IPPROTO_IPV6) | ||
98 | return TC_ACT_OK; | ||
99 | |||
100 | bpf_trace_printk(fmt6, sizeof(fmt6), *ifindex, | ||
101 | _htonl(ip6h->daddr.s6_addr32[0]), | ||
102 | _htonl(ip6h->daddr.s6_addr32[3])); | ||
103 | return bpf_redirect(*ifindex, BPF_F_INGRESS); | ||
104 | } | ||
105 | |||
106 | return TC_ACT_OK; | ||
107 | } | ||
108 | |||
109 | SEC("l2_to_iptun_ingress_redirect") | ||
110 | int _l2_to_iptun_ingress_redirect(struct __sk_buff *skb) | ||
111 | { | ||
112 | struct bpf_tunnel_key tkey = {}; | ||
113 | void *data = (void *)(long)skb->data; | ||
114 | struct eth_hdr *eth = data; | ||
115 | void *data_end = (void *)(long)skb->data_end; | ||
116 | int key = 0, *ifindex; | ||
117 | |||
118 | int ret; | ||
119 | |||
120 | if (data + sizeof(*eth) > data_end) | ||
121 | return TC_ACT_OK; | ||
122 | |||
123 | ifindex = bpf_map_lookup_elem(&tun_iface, &key); | ||
124 | if (!ifindex) | ||
125 | return TC_ACT_OK; | ||
126 | |||
127 | if (eth->h_proto == htons(ETH_P_IP)) { | ||
128 | char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n"; | ||
129 | struct iphdr *iph = data + sizeof(*eth); | ||
130 | __be32 daddr = iph->daddr; | ||
131 | |||
132 | if (data + sizeof(*eth) + sizeof(*iph) > data_end) | ||
133 | return TC_ACT_OK; | ||
134 | |||
135 | if (!is_vip_addr(eth->h_proto, daddr)) | ||
136 | return TC_ACT_OK; | ||
137 | |||
138 | bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(daddr), *ifindex); | ||
139 | } else { | ||
140 | return TC_ACT_OK; | ||
141 | } | ||
142 | |||
143 | tkey.tunnel_id = 10000; | ||
144 | tkey.tunnel_ttl = 64; | ||
145 | tkey.remote_ipv4 = 0x0a020166; /* 10.2.1.102 */ | ||
146 | bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), 0); | ||
147 | return bpf_redirect(*ifindex, 0); | ||
148 | } | ||
149 | |||
150 | SEC("l2_to_ip6tun_ingress_redirect") | ||
151 | int _l2_to_ip6tun_ingress_redirect(struct __sk_buff *skb) | ||
152 | { | ||
153 | struct bpf_tunnel_key tkey = {}; | ||
154 | void *data = (void *)(long)skb->data; | ||
155 | struct eth_hdr *eth = data; | ||
156 | void *data_end = (void *)(long)skb->data_end; | ||
157 | int key = 0, *ifindex; | ||
158 | |||
159 | if (data + sizeof(*eth) > data_end) | ||
160 | return TC_ACT_OK; | ||
161 | |||
162 | ifindex = bpf_map_lookup_elem(&tun_iface, &key); | ||
163 | if (!ifindex) | ||
164 | return TC_ACT_OK; | ||
165 | |||
166 | if (eth->h_proto == htons(ETH_P_IP)) { | ||
167 | char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n"; | ||
168 | struct iphdr *iph = data + sizeof(*eth); | ||
169 | |||
170 | if (data + sizeof(*eth) + sizeof(*iph) > data_end) | ||
171 | return TC_ACT_OK; | ||
172 | |||
173 | if (!is_vip_addr(eth->h_proto, iph->daddr)) | ||
174 | return TC_ACT_OK; | ||
175 | |||
176 | bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(iph->daddr), | ||
177 | *ifindex); | ||
178 | } else if (eth->h_proto == htons(ETH_P_IPV6)) { | ||
179 | char fmt6[] = "e/ingress redirect daddr6:%x to ifindex:%d\n"; | ||
180 | struct ipv6hdr *ip6h = data + sizeof(*eth); | ||
181 | |||
182 | if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) | ||
183 | return TC_ACT_OK; | ||
184 | |||
185 | if (!is_vip_addr(eth->h_proto, ip6h->daddr.s6_addr32[0])) | ||
186 | return TC_ACT_OK; | ||
187 | |||
188 | bpf_trace_printk(fmt6, sizeof(fmt6), | ||
189 | _htonl(ip6h->daddr.s6_addr32[0]), *ifindex); | ||
190 | } else { | ||
191 | return TC_ACT_OK; | ||
192 | } | ||
193 | |||
194 | tkey.tunnel_id = 10000; | ||
195 | tkey.tunnel_ttl = 64; | ||
196 | /* 2401:db02:0:0:0:0:0:66 */ | ||
197 | tkey.remote_ipv6[0] = _htonl(0x2401db02); | ||
198 | tkey.remote_ipv6[1] = 0; | ||
199 | tkey.remote_ipv6[2] = 0; | ||
200 | tkey.remote_ipv6[3] = _htonl(0x00000066); | ||
201 | bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), BPF_F_TUNINFO_IPV6); | ||
202 | return bpf_redirect(*ifindex, 0); | ||
203 | } | ||
204 | |||
205 | SEC("drop_non_tun_vip") | ||
206 | int _drop_non_tun_vip(struct __sk_buff *skb) | ||
207 | { | ||
208 | struct bpf_tunnel_key tkey = {}; | ||
209 | void *data = (void *)(long)skb->data; | ||
210 | struct eth_hdr *eth = data; | ||
211 | void *data_end = (void *)(long)skb->data_end; | ||
212 | |||
213 | if (data + sizeof(*eth) > data_end) | ||
214 | return TC_ACT_OK; | ||
215 | |||
216 | if (eth->h_proto == htons(ETH_P_IP)) { | ||
217 | struct iphdr *iph = data + sizeof(*eth); | ||
218 | |||
219 | if (data + sizeof(*eth) + sizeof(*iph) > data_end) | ||
220 | return TC_ACT_OK; | ||
221 | |||
222 | if (is_vip_addr(eth->h_proto, iph->daddr)) | ||
223 | return TC_ACT_SHOT; | ||
224 | } else if (eth->h_proto == htons(ETH_P_IPV6)) { | ||
225 | struct ipv6hdr *ip6h = data + sizeof(*eth); | ||
226 | |||
227 | if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) | ||
228 | return TC_ACT_OK; | ||
229 | |||
230 | if (is_vip_addr(eth->h_proto, ip6h->daddr.s6_addr32[0])) | ||
231 | return TC_ACT_SHOT; | ||
232 | } | ||
233 | |||
234 | return TC_ACT_OK; | ||
235 | } | ||
236 | |||
237 | char _license[] SEC("license") = "GPL"; | ||
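A note on is_vip_addr() above: daddr is in network byte order, so masking with _htonl(0xffffff00) keeps the first three address bytes; the IPv4 branch therefore matches 10.10.1.0/24 (the VIP that tc_l2_redirect.sh puts on ns2's loopback), and the IPv6 branch matches any destination whose first 32 bits are 2401:face. A user-space equivalent of the IPv4 test, for illustration only:

/* Illustration only: user-space equivalent of the IPv4 branch of
 * is_vip_addr().
 */
#include <stdbool.h>
#include <stdint.h>
#include <arpa/inet.h>

static bool is_v4_vip(uint32_t daddr)	/* daddr in network byte order */
{
	/* keep the top 24 bits and compare against 10.10.1.0 */
	return (daddr & htonl(0xffffff00)) == htonl(0x0a0a0100);
}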
diff --git a/samples/bpf/tc_l2_redirect_user.c b/samples/bpf/tc_l2_redirect_user.c new file mode 100644 index 000000000..d11a6e1e9 --- /dev/null +++ b/samples/bpf/tc_l2_redirect_user.c | |||
@@ -0,0 +1,70 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2016 Facebook | ||
3 | */ | ||
4 | #include <linux/unistd.h> | ||
5 | #include <linux/bpf.h> | ||
6 | |||
7 | #include <stdlib.h> | ||
8 | #include <stdio.h> | ||
9 | #include <unistd.h> | ||
10 | #include <string.h> | ||
11 | #include <errno.h> | ||
12 | |||
13 | #include <bpf/bpf.h> | ||
14 | |||
15 | static void usage(void) | ||
16 | { | ||
17 | printf("Usage: tc_l2_ipip_redirect [...]\n"); | ||
18 | printf(" -U <file> Update an already pinned BPF array\n"); | ||
19 | printf(" -i <ifindex> Interface index\n"); | ||
20 | printf(" -h Display this help\n"); | ||
21 | } | ||
22 | |||
23 | int main(int argc, char **argv) | ||
24 | { | ||
25 | const char *pinned_file = NULL; | ||
26 | int ifindex = -1; | ||
27 | int array_key = 0; | ||
28 | int array_fd = -1; | ||
29 | int ret = -1; | ||
30 | int opt; | ||
31 | |||
32 | while ((opt = getopt(argc, argv, "F:U:i:")) != -1) { | ||
33 | switch (opt) { | ||
34 | /* General args */ | ||
35 | case 'U': | ||
36 | pinned_file = optarg; | ||
37 | break; | ||
38 | case 'i': | ||
39 | ifindex = atoi(optarg); | ||
40 | break; | ||
41 | default: | ||
42 | usage(); | ||
43 | goto out; | ||
44 | } | ||
45 | } | ||
46 | |||
47 | if (ifindex < 0 || !pinned_file) { | ||
48 | usage(); | ||
49 | goto out; | ||
50 | } | ||
51 | |||
52 | array_fd = bpf_obj_get(pinned_file); | ||
53 | if (array_fd < 0) { | ||
54 | fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n", | ||
55 | pinned_file, strerror(errno), errno); | ||
56 | goto out; | ||
57 | } | ||
58 | |||
59 | /* bpf_tunnel_key.remote_ipv4 expects host byte order */ | ||
60 | ret = bpf_map_update_elem(array_fd, &array_key, &ifindex, 0); | ||
61 | if (ret) { | ||
62 | perror("bpf_map_update_elem"); | ||
63 | goto out; | ||
64 | } | ||
65 | |||
66 | out: | ||
67 | if (array_fd != -1) | ||
68 | close(array_fd); | ||
69 | return ret; | ||
70 | } | ||
diff --git a/samples/bpf/tcbpf1_kern.c b/samples/bpf/tcbpf1_kern.c new file mode 100644 index 000000000..e9356130f --- /dev/null +++ b/samples/bpf/tcbpf1_kern.c | |||
@@ -0,0 +1,91 @@ | |||
1 | #define KBUILD_MODNAME "foo" | ||
2 | #include <uapi/linux/bpf.h> | ||
3 | #include <uapi/linux/if_ether.h> | ||
4 | #include <uapi/linux/if_packet.h> | ||
5 | #include <uapi/linux/ip.h> | ||
6 | #include <uapi/linux/in.h> | ||
7 | #include <uapi/linux/tcp.h> | ||
8 | #include <uapi/linux/filter.h> | ||
9 | #include <uapi/linux/pkt_cls.h> | ||
10 | #include <bpf/bpf_helpers.h> | ||
11 | #include "bpf_legacy.h" | ||
12 | |||
13 | /* compiler workaround */ | ||
14 | #define _htonl __builtin_bswap32 | ||
15 | |||
16 | static inline void set_dst_mac(struct __sk_buff *skb, char *mac) | ||
17 | { | ||
18 | bpf_skb_store_bytes(skb, 0, mac, ETH_ALEN, 1); | ||
19 | } | ||
20 | |||
21 | #define IP_CSUM_OFF (ETH_HLEN + offsetof(struct iphdr, check)) | ||
22 | #define TOS_OFF (ETH_HLEN + offsetof(struct iphdr, tos)) | ||
23 | |||
24 | static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) | ||
25 | { | ||
26 | __u8 old_tos = load_byte(skb, TOS_OFF); | ||
27 | |||
28 | bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2); | ||
29 | bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0); | ||
30 | } | ||
31 | |||
32 | #define TCP_CSUM_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, check)) | ||
33 | #define IP_SRC_OFF (ETH_HLEN + offsetof(struct iphdr, saddr)) | ||
34 | |||
35 | #define IS_PSEUDO 0x10 | ||
36 | |||
37 | static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) | ||
38 | { | ||
39 | __u32 old_ip = _htonl(load_word(skb, IP_SRC_OFF)); | ||
40 | |||
41 | bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, IS_PSEUDO | sizeof(new_ip)); | ||
42 | bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip)); | ||
43 | bpf_skb_store_bytes(skb, IP_SRC_OFF, &new_ip, sizeof(new_ip), 0); | ||
44 | } | ||
45 | |||
46 | #define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest)) | ||
47 | static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) | ||
48 | { | ||
49 | __u16 old_port = htons(load_half(skb, TCP_DPORT_OFF)); | ||
50 | |||
51 | bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port)); | ||
52 | bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0); | ||
53 | } | ||
54 | |||
55 | SEC("classifier") | ||
56 | int bpf_prog1(struct __sk_buff *skb) | ||
57 | { | ||
58 | __u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); | ||
59 | long *value; | ||
60 | |||
61 | if (proto == IPPROTO_TCP) { | ||
62 | set_ip_tos(skb, 8); | ||
63 | set_tcp_ip_src(skb, 0xA010101); | ||
64 | set_tcp_dest_port(skb, 5001); | ||
65 | } | ||
66 | |||
67 | return 0; | ||
68 | } | ||
69 | SEC("redirect_xmit") | ||
70 | int _redirect_xmit(struct __sk_buff *skb) | ||
71 | { | ||
72 | return bpf_redirect(skb->ifindex + 1, 0); | ||
73 | } | ||
74 | SEC("redirect_recv") | ||
75 | int _redirect_recv(struct __sk_buff *skb) | ||
76 | { | ||
77 | return bpf_redirect(skb->ifindex + 1, 1); | ||
78 | } | ||
79 | SEC("clone_redirect_xmit") | ||
80 | int _clone_redirect_xmit(struct __sk_buff *skb) | ||
81 | { | ||
82 | bpf_clone_redirect(skb, skb->ifindex + 1, 0); | ||
83 | return TC_ACT_SHOT; | ||
84 | } | ||
85 | SEC("clone_redirect_recv") | ||
86 | int _clone_redirect_recv(struct __sk_buff *skb) | ||
87 | { | ||
88 | bpf_clone_redirect(skb, skb->ifindex + 1, 1); | ||
89 | return TC_ACT_SHOT; | ||
90 | } | ||
91 | char _license[] SEC("license") = "GPL"; | ||
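The helpers above rely on bpf_l3_csum_replace()/bpf_l4_csum_replace() to patch checksums incrementally; IS_PSEUDO is passed when rewriting the source address because that field is also part of TCP's pseudo-header, so the TCP checksum has to be fixed along with the IP header checksum. The incremental update itself is the standard RFC 1624 form, sketched here for a single 16-bit field:

/* Sketch of the RFC 1624 incremental update that the csum_replace
 * helpers perform: replacing field 'old' with 'new_val' in data covered
 * by checksum 'check' yields ~(~check + ~old + new_val), with
 * end-around carry folding.
 */
#include <stdint.h>

static uint16_t csum_replace16(uint16_t check, uint16_t old, uint16_t new_val)
{
	uint32_t sum = (uint16_t)~check + (uint16_t)~old + new_val;

	sum = (sum & 0xffff) + (sum >> 16);	/* fold carries back in */
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}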
diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c new file mode 100644 index 000000000..8dfe09a92 --- /dev/null +++ b/samples/bpf/tcp_basertt_kern.c | |||
@@ -0,0 +1,71 @@ | |||
1 | /* Copyright (c) 2017 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | * | ||
7 | * BPF program to set base_rtt to 80us when host is running TCP-NV and | ||
8 | * both hosts are in the same datacenter (as determined by IPv6 prefix). | ||
9 | * | ||
10 | * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. | ||
11 | */ | ||
12 | |||
13 | #include <uapi/linux/bpf.h> | ||
14 | #include <uapi/linux/tcp.h> | ||
15 | #include <uapi/linux/if_ether.h> | ||
16 | #include <uapi/linux/if_packet.h> | ||
17 | #include <uapi/linux/ip.h> | ||
18 | #include <linux/socket.h> | ||
19 | #include <bpf/bpf_helpers.h> | ||
20 | #include <bpf/bpf_endian.h> | ||
21 | |||
22 | #define DEBUG 1 | ||
23 | |||
24 | SEC("sockops") | ||
25 | int bpf_basertt(struct bpf_sock_ops *skops) | ||
26 | { | ||
27 | char cong[20]; | ||
28 | char nv[] = "nv"; | ||
29 | int rv = 0, n; | ||
30 | int op; | ||
31 | |||
32 | op = (int) skops->op; | ||
33 | |||
34 | #ifdef DEBUG | ||
35 | bpf_printk("BPF command: %d\n", op); | ||
36 | #endif | ||
37 | |||
38 | /* Check if both hosts are in the same datacenter. For this | ||
39 | * example they are if the first 5.5 bytes of their IPv6 addresses | ||
40 | * are the same. | ||
41 | */ | ||
42 | if (skops->family == AF_INET6 && | ||
43 | skops->local_ip6[0] == skops->remote_ip6[0] && | ||
44 | (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == | ||
45 | (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) { | ||
46 | switch (op) { | ||
47 | case BPF_SOCK_OPS_BASE_RTT: | ||
48 | n = bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION, | ||
49 | cong, sizeof(cong)); | ||
50 | if (!n && !__builtin_memcmp(cong, nv, sizeof(nv)+1)) { | ||
51 | /* Set base_rtt to 80us */ | ||
52 | rv = 80; | ||
53 | } else if (n) { | ||
54 | rv = n; | ||
55 | } else { | ||
56 | rv = -1; | ||
57 | } | ||
58 | break; | ||
59 | default: | ||
60 | rv = -1; | ||
61 | } | ||
62 | } else { | ||
63 | rv = -1; | ||
64 | } | ||
65 | #ifdef DEBUG | ||
66 | bpf_printk("Returning %d\n", rv); | ||
67 | #endif | ||
68 | skops->reply = rv; | ||
69 | return 1; | ||
70 | } | ||
71 | char _license[] SEC("license") = "GPL"; | ||
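The "same datacenter" test in bpf_basertt() compares the first 5.5 bytes (44 bits) of the two IPv6 addresses: the whole first 32-bit word plus the top 12 bits of the second word, which is what the 0xfff00000 mask after bpf_ntohl() selects. A user-space sketch of the same predicate:

/* Sketch of the 44-bit prefix comparison used by bpf_basertt(); each
 * argument points at the four __be32 words of an IPv6 address.
 */
#include <stdbool.h>
#include <stdint.h>
#include <arpa/inet.h>

static bool same_datacenter(const uint32_t *a, const uint32_t *b)
{
	return a[0] == b[0] &&
	       (ntohl(a[1]) & 0xfff00000) == (ntohl(b[1]) & 0xfff00000);
}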
diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme new file mode 100644 index 000000000..78e247f62 --- /dev/null +++ b/samples/bpf/tcp_bpf.readme | |||
@@ -0,0 +1,28 @@ | |||
1 | This file describes how to run the tcp_*_kern.o sock_ops BPF | ||
2 | programs. These programs attach to a cgroupv2. The following commands create | ||
3 | a cgroupv2 and attach a bash shell to the group. | ||
4 | |||
5 | mkdir -p /tmp/cgroupv2 | ||
6 | mount -t cgroup2 none /tmp/cgroupv2 | ||
7 | mkdir -p /tmp/cgroupv2/foo | ||
8 | bash | ||
9 | echo $$ >> /tmp/cgroupv2/foo/cgroup.procs | ||
10 | |||
11 | Anything that runs under this shell belongs to the foo cgroupv2. To load | ||
12 | (attach) one of the tcp_*_kern.o programs: | ||
13 | |||
14 | bpftool prog load tcp_basertt_kern.o /sys/fs/bpf/tcp_prog | ||
15 | bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog | ||
16 | bpftool prog tracelog | ||
17 | |||
18 | "bpftool prog tracelog" will continue to run printing the BPF log buffer. | ||
19 | The tcp_*_kern.o programs use bpf_printk() to emit their logging | ||
20 | information (if enabled by the DEBUG ifdef). | ||
21 | |||
22 | If using netperf/netserver to create traffic, you need to run them under the | ||
23 | cgroupv2 to which the BPF programs are attached (i.e. under bash shell | ||
24 | attached to the cgroupv2). | ||
25 | |||
26 | To remove (detach) a sock_ops BPF program from a cgroupv2: | ||
27 | |||
28 | bpftool cgroup detach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog | ||
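The same load/attach step can also be done programmatically. Below is a minimal libbpf sketch (object name and cgroup path are the ones used in this readme; error handling is trimmed) that loads one of the tcp_*_kern.o files and attaches its sock_ops program to the cgroup:

/* Minimal sketch: programmatic equivalent of the bpftool load/attach
 * commands above. Assumes the cgroup from this readme is mounted at
 * /tmp/cgroupv2/foo.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_program *prog = NULL;
	struct bpf_object *obj;
	int cg_fd, err;

	obj = bpf_object__open_file("tcp_basertt_kern.o", NULL);
	if (libbpf_get_error(obj) || bpf_object__load(obj)) {
		fprintf(stderr, "failed to open/load BPF object\n");
		return 1;
	}

	/* take the first (and only) program, i.e. the SEC("sockops") one */
	bpf_object__for_each_program(prog, obj)
		break;

	cg_fd = open("/tmp/cgroupv2/foo", O_DIRECTORY | O_RDONLY);
	if (!prog || cg_fd < 0)
		return 1;

	err = bpf_prog_attach(bpf_program__fd(prog), cg_fd,
			      BPF_CGROUP_SOCK_OPS, 0);
	if (err)
		fprintf(stderr, "attach failed: %d\n", err);

	/* the cgroup attachment holds its own reference, so it persists
	 * after this process exits; detach with bpftool as shown above
	 */
	close(cg_fd);
	return err ? 1 : 0;
}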
diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c new file mode 100644 index 000000000..6a80d0895 --- /dev/null +++ b/samples/bpf/tcp_bufs_kern.c | |||
@@ -0,0 +1,81 @@ | |||
1 | /* Copyright (c) 2017 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | * | ||
7 | * BPF program to set initial receive window to 40 packets and send | ||
8 | * and receive buffers to 1.5MB. This would usually be done after | ||
9 | * doing appropriate checks that indicate the hosts are far enough | ||
10 | * away (i.e. large RTT). | ||
11 | * | ||
12 | * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. | ||
13 | */ | ||
14 | |||
15 | #include <uapi/linux/bpf.h> | ||
16 | #include <uapi/linux/if_ether.h> | ||
17 | #include <uapi/linux/if_packet.h> | ||
18 | #include <uapi/linux/ip.h> | ||
19 | #include <linux/socket.h> | ||
20 | #include <bpf/bpf_helpers.h> | ||
21 | #include <bpf/bpf_endian.h> | ||
22 | |||
23 | #define DEBUG 1 | ||
24 | |||
25 | SEC("sockops") | ||
26 | int bpf_bufs(struct bpf_sock_ops *skops) | ||
27 | { | ||
28 | int bufsize = 1500000; | ||
29 | int rwnd_init = 40; | ||
30 | int rv = 0; | ||
31 | int op; | ||
32 | |||
33 | /* For testing purposes, only execute rest of BPF program | ||
34 | * if neither port number is 55601 | ||
35 | */ | ||
36 | if (bpf_ntohl(skops->remote_port) != 55601 && | ||
37 | skops->local_port != 55601) { | ||
38 | skops->reply = -1; | ||
39 | return 1; | ||
40 | } | ||
41 | |||
42 | op = (int) skops->op; | ||
43 | |||
44 | #ifdef DEBUG | ||
45 | bpf_printk("BPF command: %d\n", op); | ||
46 | #endif | ||
47 | |||
48 | /* Usually there would be a check to ensure the hosts are far | ||
49 | * from each other so it makes sense to increase buffer sizes | ||
50 | */ | ||
51 | switch (op) { | ||
52 | case BPF_SOCK_OPS_RWND_INIT: | ||
53 | rv = rwnd_init; | ||
54 | break; | ||
55 | case BPF_SOCK_OPS_TCP_CONNECT_CB: | ||
56 | /* Set sndbuf and rcvbuf of active connections */ | ||
57 | rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, | ||
58 | sizeof(bufsize)); | ||
59 | rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, | ||
60 | &bufsize, sizeof(bufsize)); | ||
61 | break; | ||
62 | case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: | ||
63 | /* Nothing to do */ | ||
64 | break; | ||
65 | case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: | ||
66 | /* Set sndbuf and rcvbuf of passive connections */ | ||
67 | rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, | ||
68 | sizeof(bufsize)); | ||
69 | rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, | ||
70 | &bufsize, sizeof(bufsize)); | ||
71 | break; | ||
72 | default: | ||
73 | rv = -1; | ||
74 | } | ||
75 | #ifdef DEBUG | ||
76 | bpf_printk("Returning %d\n", rv); | ||
77 | #endif | ||
78 | skops->reply = rv; | ||
79 | return 1; | ||
80 | } | ||
81 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c new file mode 100644 index 000000000..e88bd9ab0 --- /dev/null +++ b/samples/bpf/tcp_clamp_kern.c | |||
@@ -0,0 +1,97 @@ | |||
1 | /* Copyright (c) 2017 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | * | ||
7 | * Sample BPF program to set send and receive buffers to 150KB, sndcwnd clamp | ||
8 | * to 100 packets and SYN and SYN_ACK RTOs to 10ms when both hosts are within | ||
9 | * the same datacenter. For this example, we assume they are within the same | ||
10 | * datacenter when the first 5.5 bytes of their IPv6 addresses are the same. | ||
11 | * | ||
12 | * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. | ||
13 | */ | ||
14 | |||
15 | #include <uapi/linux/bpf.h> | ||
16 | #include <uapi/linux/if_ether.h> | ||
17 | #include <uapi/linux/if_packet.h> | ||
18 | #include <uapi/linux/ip.h> | ||
19 | #include <linux/socket.h> | ||
20 | #include <bpf/bpf_helpers.h> | ||
21 | #include <bpf/bpf_endian.h> | ||
22 | |||
23 | #define DEBUG 1 | ||
24 | |||
25 | SEC("sockops") | ||
26 | int bpf_clamp(struct bpf_sock_ops *skops) | ||
27 | { | ||
28 | int bufsize = 150000; | ||
29 | int to_init = 10; | ||
30 | int clamp = 100; | ||
31 | int rv = 0; | ||
32 | int op; | ||
33 | |||
34 | /* For testing purposes, only execute the rest of the BPF program | ||
35 | * if one of the port numbers is 55601 | ||
36 | */ | ||
37 | if (bpf_ntohl(skops->remote_port) != 55601 && skops->local_port != 55601) { | ||
38 | skops->reply = -1; | ||
39 | return 0; | ||
40 | } | ||
41 | |||
42 | op = (int) skops->op; | ||
43 | |||
44 | #ifdef DEBUG | ||
45 | bpf_printk("BPF command: %d\n", op); | ||
46 | #endif | ||
47 | |||
48 | /* Check that both hosts are within the same datacenter. For this example | ||
49 | * it is the case when the first 5.5 bytes of their IPv6 addresses are | ||
50 | * the same. | ||
51 | */ | ||
52 | if (skops->family == AF_INET6 && | ||
53 | skops->local_ip6[0] == skops->remote_ip6[0] && | ||
54 | (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == | ||
55 | (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) { | ||
56 | switch (op) { | ||
57 | case BPF_SOCK_OPS_TIMEOUT_INIT: | ||
58 | rv = to_init; | ||
59 | break; | ||
60 | case BPF_SOCK_OPS_TCP_CONNECT_CB: | ||
61 | /* Set sndbuf and rcvbuf of active connections */ | ||
62 | rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, | ||
63 | &bufsize, sizeof(bufsize)); | ||
64 | rv += bpf_setsockopt(skops, SOL_SOCKET, | ||
65 | SO_RCVBUF, &bufsize, | ||
66 | sizeof(bufsize)); | ||
67 | break; | ||
68 | case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: | ||
69 | rv = bpf_setsockopt(skops, SOL_TCP, | ||
70 | TCP_BPF_SNDCWND_CLAMP, | ||
71 | &clamp, sizeof(clamp)); | ||
72 | break; | ||
73 | case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: | ||
74 | /* Set sndbuf and rcvbuf of passive connections */ | ||
75 | rv = bpf_setsockopt(skops, SOL_TCP, | ||
76 | TCP_BPF_SNDCWND_CLAMP, | ||
77 | &clamp, sizeof(clamp)); | ||
78 | rv += bpf_setsockopt(skops, SOL_SOCKET, | ||
79 | SO_SNDBUF, &bufsize, | ||
80 | sizeof(bufsize)); | ||
81 | rv += bpf_setsockopt(skops, SOL_SOCKET, | ||
82 | SO_RCVBUF, &bufsize, | ||
83 | sizeof(bufsize)); | ||
84 | break; | ||
85 | default: | ||
86 | rv = -1; | ||
87 | } | ||
88 | } else { | ||
89 | rv = -1; | ||
90 | } | ||
91 | #ifdef DEBUG | ||
92 | bpf_printk("Returning %d\n", rv); | ||
93 | #endif | ||
94 | skops->reply = rv; | ||
95 | return 1; | ||
96 | } | ||
97 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c new file mode 100644 index 000000000..2311fc9dd --- /dev/null +++ b/samples/bpf/tcp_cong_kern.c | |||
@@ -0,0 +1,78 @@ | |||
1 | /* Copyright (c) 2017 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | * | ||
7 | * BPF program to set congestion control to dctcp when both hosts are | ||
8 | * in the same datacenter (as determined by IPv6 prefix). | ||
9 | * | ||
10 | * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. | ||
11 | */ | ||
12 | |||
13 | #include <uapi/linux/bpf.h> | ||
14 | #include <uapi/linux/tcp.h> | ||
15 | #include <uapi/linux/if_ether.h> | ||
16 | #include <uapi/linux/if_packet.h> | ||
17 | #include <uapi/linux/ip.h> | ||
18 | #include <linux/socket.h> | ||
19 | #include <bpf/bpf_helpers.h> | ||
20 | #include <bpf/bpf_endian.h> | ||
21 | |||
22 | #define DEBUG 1 | ||
23 | |||
24 | SEC("sockops") | ||
25 | int bpf_cong(struct bpf_sock_ops *skops) | ||
26 | { | ||
27 | char cong[] = "dctcp"; | ||
28 | int rv = 0; | ||
29 | int op; | ||
30 | |||
31 | /* For testing purposes, only execute the rest of the BPF program | ||
32 | * if one of the port numbers is 55601 | ||
33 | */ | ||
34 | if (bpf_ntohl(skops->remote_port) != 55601 && | ||
35 | skops->local_port != 55601) { | ||
36 | skops->reply = -1; | ||
37 | return 1; | ||
38 | } | ||
39 | |||
40 | op = (int) skops->op; | ||
41 | |||
42 | #ifdef DEBUG | ||
43 | bpf_printk("BPF command: %d\n", op); | ||
44 | #endif | ||
45 | |||
46 | /* Check if both hosts are in the same datacenter. For this | ||
47 | * example they are if the first 5.5 bytes of their IPv6 addresses | ||
48 | * are the same. | ||
49 | */ | ||
50 | if (skops->family == AF_INET6 && | ||
51 | skops->local_ip6[0] == skops->remote_ip6[0] && | ||
52 | (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == | ||
53 | (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) { | ||
54 | switch (op) { | ||
55 | case BPF_SOCK_OPS_NEEDS_ECN: | ||
56 | rv = 1; | ||
57 | break; | ||
58 | case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: | ||
59 | rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION, | ||
60 | cong, sizeof(cong)); | ||
61 | break; | ||
62 | case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: | ||
63 | rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION, | ||
64 | cong, sizeof(cong)); | ||
65 | break; | ||
66 | default: | ||
67 | rv = -1; | ||
68 | } | ||
69 | } else { | ||
70 | rv = -1; | ||
71 | } | ||
72 | #ifdef DEBUG | ||
73 | bpf_printk("Returning %d\n", rv); | ||
74 | #endif | ||
75 | skops->reply = rv; | ||
76 | return 1; | ||
77 | } | ||
78 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/tcp_dumpstats_kern.c b/samples/bpf/tcp_dumpstats_kern.c new file mode 100644 index 000000000..e80d3afd2 --- /dev/null +++ b/samples/bpf/tcp_dumpstats_kern.c | |||
@@ -0,0 +1,68 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* Refer to samples/bpf/tcp_bpf.readme for the instructions on | ||
3 | * how to run this sample program. | ||
4 | */ | ||
5 | #include <linux/bpf.h> | ||
6 | |||
7 | #include <bpf/bpf_helpers.h> | ||
8 | #include <bpf/bpf_endian.h> | ||
9 | |||
10 | #define INTERVAL 1000000000ULL | ||
11 | |||
12 | int _version SEC("version") = 1; | ||
13 | char _license[] SEC("license") = "GPL"; | ||
14 | |||
15 | struct { | ||
16 | __u32 type; | ||
17 | __u32 map_flags; | ||
18 | int *key; | ||
19 | __u64 *value; | ||
20 | } bpf_next_dump SEC(".maps") = { | ||
21 | .type = BPF_MAP_TYPE_SK_STORAGE, | ||
22 | .map_flags = BPF_F_NO_PREALLOC, | ||
23 | }; | ||
24 | |||
25 | SEC("sockops") | ||
26 | int _sockops(struct bpf_sock_ops *ctx) | ||
27 | { | ||
28 | struct bpf_tcp_sock *tcp_sk; | ||
29 | struct bpf_sock *sk; | ||
30 | __u64 *next_dump; | ||
31 | __u64 now; | ||
32 | |||
33 | switch (ctx->op) { | ||
34 | case BPF_SOCK_OPS_TCP_CONNECT_CB: | ||
35 | bpf_sock_ops_cb_flags_set(ctx, BPF_SOCK_OPS_RTT_CB_FLAG); | ||
36 | return 1; | ||
37 | case BPF_SOCK_OPS_RTT_CB: | ||
38 | break; | ||
39 | default: | ||
40 | return 1; | ||
41 | } | ||
42 | |||
43 | sk = ctx->sk; | ||
44 | if (!sk) | ||
45 | return 1; | ||
46 | |||
47 | next_dump = bpf_sk_storage_get(&bpf_next_dump, sk, 0, | ||
48 | BPF_SK_STORAGE_GET_F_CREATE); | ||
49 | if (!next_dump) | ||
50 | return 1; | ||
51 | |||
52 | now = bpf_ktime_get_ns(); | ||
53 | if (now < *next_dump) | ||
54 | return 1; | ||
55 | |||
56 | tcp_sk = bpf_tcp_sock(sk); | ||
57 | if (!tcp_sk) | ||
58 | return 1; | ||
59 | |||
60 | *next_dump = now + INTERVAL; | ||
61 | |||
62 | bpf_printk("dsack_dups=%u delivered=%u\n", | ||
63 | tcp_sk->dsack_dups, tcp_sk->delivered); | ||
64 | bpf_printk("delivered_ce=%u icsk_retransmits=%u\n", | ||
65 | tcp_sk->delivered_ce, tcp_sk->icsk_retransmits); | ||
66 | |||
67 | return 1; | ||
68 | } | ||
diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c new file mode 100644 index 000000000..d14445573 --- /dev/null +++ b/samples/bpf/tcp_iw_kern.c | |||
@@ -0,0 +1,83 @@ | |||
1 | /* Copyright (c) 2017 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | * | ||
7 | * BPF program to set initial congestion window and initial receive | ||
8 | * window to 40 packets and send and receive buffers to 1.5MB. This | ||
9 | * would usually be done after doing appropriate checks that indicate | ||
10 | * the hosts are far enough away (i.e. large RTT). | ||
11 | * | ||
12 | * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. | ||
13 | */ | ||
14 | |||
15 | #include <uapi/linux/bpf.h> | ||
16 | #include <uapi/linux/if_ether.h> | ||
17 | #include <uapi/linux/if_packet.h> | ||
18 | #include <uapi/linux/ip.h> | ||
19 | #include <linux/socket.h> | ||
20 | #include <bpf/bpf_helpers.h> | ||
21 | #include <bpf/bpf_endian.h> | ||
22 | |||
23 | #define DEBUG 1 | ||
24 | |||
25 | SEC("sockops") | ||
26 | int bpf_iw(struct bpf_sock_ops *skops) | ||
27 | { | ||
28 | int bufsize = 1500000; | ||
29 | int rwnd_init = 40; | ||
30 | int iw = 40; | ||
31 | int rv = 0; | ||
32 | int op; | ||
33 | |||
34 | /* For testing purposes, only execute the rest of the BPF program | ||
35 | * if one of the port numbers is 55601 | ||
36 | */ | ||
37 | if (bpf_ntohl(skops->remote_port) != 55601 && | ||
38 | skops->local_port != 55601) { | ||
39 | skops->reply = -1; | ||
40 | return 1; | ||
41 | } | ||
42 | |||
43 | op = (int) skops->op; | ||
44 | |||
45 | #ifdef DEBUG | ||
46 | bpf_printk("BPF command: %d\n", op); | ||
47 | #endif | ||
48 | |||
49 | /* Usually there would be a check to ensure the hosts are far | ||
50 | * from each other so it makes sense to increase buffer sizes | ||
51 | */ | ||
52 | switch (op) { | ||
53 | case BPF_SOCK_OPS_RWND_INIT: | ||
54 | rv = rwnd_init; | ||
55 | break; | ||
56 | case BPF_SOCK_OPS_TCP_CONNECT_CB: | ||
57 | /* Set sndbuf and rcvbuf of active connections */ | ||
58 | rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, | ||
59 | sizeof(bufsize)); | ||
60 | rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, | ||
61 | &bufsize, sizeof(bufsize)); | ||
62 | break; | ||
63 | case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: | ||
64 | rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW, &iw, | ||
65 | sizeof(iw)); | ||
66 | break; | ||
67 | case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: | ||
68 | /* Set sndbuf and rcvbuf of passive connections */ | ||
69 | rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, | ||
70 | sizeof(bufsize)); | ||
71 | rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, | ||
72 | &bufsize, sizeof(bufsize)); | ||
73 | break; | ||
74 | default: | ||
75 | rv = -1; | ||
76 | } | ||
77 | #ifdef DEBUG | ||
78 | bpf_printk("Returning %d\n", rv); | ||
79 | #endif | ||
80 | skops->reply = rv; | ||
81 | return 1; | ||
82 | } | ||
83 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c new file mode 100644 index 000000000..223d9c23b --- /dev/null +++ b/samples/bpf/tcp_rwnd_kern.c | |||
@@ -0,0 +1,64 @@ | |||
1 | /* Copyright (c) 2017 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | * | ||
7 | * BPF program to set initial receive window to 40 packets when using IPv6 | ||
8 | * and the first 5.5 bytes of the IPv6 addresses are not the same (in this | ||
9 | * example that means both hosts are not in the same datacenter). | ||
10 | * | ||
11 | * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. | ||
12 | */ | ||
13 | |||
14 | #include <uapi/linux/bpf.h> | ||
15 | #include <uapi/linux/if_ether.h> | ||
16 | #include <uapi/linux/if_packet.h> | ||
17 | #include <uapi/linux/ip.h> | ||
18 | #include <linux/socket.h> | ||
19 | #include <bpf/bpf_helpers.h> | ||
20 | #include <bpf/bpf_endian.h> | ||
21 | |||
22 | #define DEBUG 1 | ||
23 | |||
24 | SEC("sockops") | ||
25 | int bpf_rwnd(struct bpf_sock_ops *skops) | ||
26 | { | ||
27 | int rv = -1; | ||
28 | int op; | ||
29 | |||
30 | /* For testing purposes, only execute the rest of the BPF program | ||
31 | * if one of the port numbers is 55601 | ||
32 | */ | ||
33 | if (bpf_ntohl(skops->remote_port) != 55601 && | ||
34 | skops->local_port != 55601) { | ||
35 | skops->reply = -1; | ||
36 | return 1; | ||
37 | } | ||
38 | |||
39 | op = (int) skops->op; | ||
40 | |||
41 | #ifdef DEBUG | ||
42 | bpf_printk("BPF command: %d\n", op); | ||
43 | #endif | ||
44 | |||
45 | /* Check for RWND_INIT operation and IPv6 addresses */ | ||
46 | if (op == BPF_SOCK_OPS_RWND_INIT && | ||
47 | skops->family == AF_INET6) { | ||
48 | |||
49 | /* If the first 5.5 bytes of the IPv6 address are not the same | ||
50 | * then both hosts are not in the same datacenter | ||
52 | * so use a larger initial advertised window (40 packets) | ||
52 | */ | ||
53 | if (skops->local_ip6[0] != skops->remote_ip6[0] || | ||
54 | (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) != | ||
55 | (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) | ||
56 | rv = 40; | ||
57 | } | ||
58 | #ifdef DEBUG | ||
59 | bpf_printk("Returning %d\n", rv); | ||
60 | #endif | ||
61 | skops->reply = rv; | ||
62 | return 1; | ||
63 | } | ||
64 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c new file mode 100644 index 000000000..d58004eef --- /dev/null +++ b/samples/bpf/tcp_synrto_kern.c | |||
@@ -0,0 +1,64 @@ | |||
1 | /* Copyright (c) 2017 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | * | ||
7 | * BPF program to set SYN and SYN-ACK RTOs to 10ms when using IPv6 addresses | ||
8 | * and the first 5.5 bytes of the IPv6 addresses are the same (in this example | ||
9 | * that means both hosts are in the same datacenter). | ||
10 | * | ||
11 | * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. | ||
12 | */ | ||
13 | |||
14 | #include <uapi/linux/bpf.h> | ||
15 | #include <uapi/linux/if_ether.h> | ||
16 | #include <uapi/linux/if_packet.h> | ||
17 | #include <uapi/linux/ip.h> | ||
18 | #include <linux/socket.h> | ||
19 | #include <bpf/bpf_helpers.h> | ||
20 | #include <bpf/bpf_endian.h> | ||
21 | |||
22 | #define DEBUG 1 | ||
23 | |||
24 | SEC("sockops") | ||
25 | int bpf_synrto(struct bpf_sock_ops *skops) | ||
26 | { | ||
27 | int rv = -1; | ||
28 | int op; | ||
29 | |||
30 | /* For testing purposes, only execute the rest of the BPF program | ||
31 | * if one of the port numbers is 55601 | ||
32 | */ | ||
33 | if (bpf_ntohl(skops->remote_port) != 55601 && | ||
34 | skops->local_port != 55601) { | ||
35 | skops->reply = -1; | ||
36 | return 1; | ||
37 | } | ||
38 | |||
39 | op = (int) skops->op; | ||
40 | |||
41 | #ifdef DEBUG | ||
42 | bpf_printk("BPF command: %d\n", op); | ||
43 | #endif | ||
44 | |||
45 | /* Check for TIMEOUT_INIT operation and IPv6 addresses */ | ||
46 | if (op == BPF_SOCK_OPS_TIMEOUT_INIT && | ||
47 | skops->family == AF_INET6) { | ||
48 | |||
49 | /* If the first 5.5 bytes of the IPv6 address are the same | ||
50 | * then both hosts are in the same datacenter | ||
51 | * so use an RTO of 10ms | ||
52 | */ | ||
53 | if (skops->local_ip6[0] == skops->remote_ip6[0] && | ||
54 | (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == | ||
55 | (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) | ||
56 | rv = 10; | ||
57 | } | ||
58 | #ifdef DEBUG | ||
59 | bpf_printk("Returning %d\n", rv); | ||
60 | #endif | ||
61 | skops->reply = rv; | ||
62 | return 1; | ||
63 | } | ||
64 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/tcp_tos_reflect_kern.c b/samples/bpf/tcp_tos_reflect_kern.c new file mode 100644 index 000000000..953fedc79 --- /dev/null +++ b/samples/bpf/tcp_tos_reflect_kern.c | |||
@@ -0,0 +1,80 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Copyright (c) 2018 Facebook | ||
4 | * | ||
5 | * BPF program to automatically reflect the IP TOS / IPv6 traffic class of the received SYN packet | ||
6 | * | ||
7 | * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. | ||
8 | */ | ||
9 | |||
10 | #include <uapi/linux/bpf.h> | ||
11 | #include <uapi/linux/tcp.h> | ||
12 | #include <uapi/linux/if_ether.h> | ||
13 | #include <uapi/linux/if_packet.h> | ||
14 | #include <uapi/linux/ip.h> | ||
15 | #include <uapi/linux/ipv6.h> | ||
16 | #include <uapi/linux/in.h> | ||
17 | #include <linux/socket.h> | ||
18 | #include <bpf/bpf_helpers.h> | ||
19 | #include <bpf/bpf_endian.h> | ||
20 | |||
21 | #define DEBUG 1 | ||
22 | |||
23 | SEC("sockops") | ||
24 | int bpf_basertt(struct bpf_sock_ops *skops) | ||
25 | { | ||
26 | char header[sizeof(struct ipv6hdr)]; | ||
27 | struct ipv6hdr *hdr6; | ||
28 | struct iphdr *hdr; | ||
29 | int hdr_size = 0; | ||
30 | int save_syn = 1; | ||
31 | int tos = 0; | ||
32 | int rv = 0; | ||
33 | int op; | ||
34 | |||
35 | op = (int) skops->op; | ||
36 | |||
37 | #ifdef DEBUG | ||
38 | bpf_printk("BPF command: %d\n", op); | ||
39 | #endif | ||
40 | switch (op) { | ||
41 | case BPF_SOCK_OPS_TCP_LISTEN_CB: | ||
42 | rv = bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN, | ||
43 | &save_syn, sizeof(save_syn)); | ||
44 | break; | ||
45 | case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: | ||
46 | if (skops->family == AF_INET) | ||
47 | hdr_size = sizeof(struct iphdr); | ||
48 | else | ||
49 | hdr_size = sizeof(struct ipv6hdr); | ||
50 | rv = bpf_getsockopt(skops, SOL_TCP, TCP_SAVED_SYN, | ||
51 | header, hdr_size); | ||
52 | if (!rv) { | ||
53 | if (skops->family == AF_INET) { | ||
54 | hdr = (struct iphdr *) header; | ||
55 | tos = hdr->tos; | ||
56 | if (tos != 0) | ||
57 | bpf_setsockopt(skops, SOL_IP, IP_TOS, | ||
58 | &tos, sizeof(tos)); | ||
59 | } else { | ||
60 | hdr6 = (struct ipv6hdr *) header; | ||
61 | tos = ((hdr6->priority) << 4 | | ||
62 | (hdr6->flow_lbl[0]) >> 4); | ||
63 | if (tos) | ||
64 | bpf_setsockopt(skops, SOL_IPV6, | ||
65 | IPV6_TCLASS, | ||
66 | &tos, sizeof(tos)); | ||
67 | } | ||
68 | rv = 0; | ||
69 | } | ||
70 | break; | ||
71 | default: | ||
72 | rv = -1; | ||
73 | } | ||
74 | #ifdef DEBUG | ||
75 | bpf_printk("Returning %d\n", rv); | ||
76 | #endif | ||
77 | skops->reply = rv; | ||
78 | return 1; | ||
79 | } | ||
80 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/test_cgrp2_array_pin.c b/samples/bpf/test_cgrp2_array_pin.c new file mode 100644 index 000000000..6d564aa75 --- /dev/null +++ b/samples/bpf/test_cgrp2_array_pin.c | |||
@@ -0,0 +1,106 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2016 Facebook | ||
3 | */ | ||
4 | #include <linux/unistd.h> | ||
5 | #include <linux/bpf.h> | ||
6 | |||
7 | #include <stdio.h> | ||
8 | #include <stdint.h> | ||
9 | #include <unistd.h> | ||
10 | #include <string.h> | ||
11 | #include <errno.h> | ||
12 | #include <fcntl.h> | ||
13 | |||
14 | #include <bpf/bpf.h> | ||
15 | |||
16 | static void usage(void) | ||
17 | { | ||
18 | printf("Usage: test_cgrp2_array_pin [...]\n"); | ||
19 | printf(" -F <file> File to pin an BPF cgroup array\n"); | ||
20 | printf(" -U <file> Update an already pinned BPF cgroup array\n"); | ||
21 | printf(" -v <value> Full path of the cgroup2\n"); | ||
22 | printf(" -h Display this help\n"); | ||
23 | } | ||
24 | |||
25 | int main(int argc, char **argv) | ||
26 | { | ||
27 | const char *pinned_file = NULL, *cg2 = NULL; | ||
28 | int create_array = 1; | ||
29 | int array_key = 0; | ||
30 | int array_fd = -1; | ||
31 | int cg2_fd = -1; | ||
32 | int ret = -1; | ||
33 | int opt; | ||
34 | |||
35 | while ((opt = getopt(argc, argv, "F:U:v:")) != -1) { | ||
36 | switch (opt) { | ||
37 | /* General args */ | ||
38 | case 'F': | ||
39 | pinned_file = optarg; | ||
40 | break; | ||
41 | case 'U': | ||
42 | pinned_file = optarg; | ||
43 | create_array = 0; | ||
44 | break; | ||
45 | case 'v': | ||
46 | cg2 = optarg; | ||
47 | break; | ||
48 | default: | ||
49 | usage(); | ||
50 | goto out; | ||
51 | } | ||
52 | } | ||
53 | |||
54 | if (!cg2 || !pinned_file) { | ||
55 | usage(); | ||
56 | goto out; | ||
57 | } | ||
58 | |||
59 | cg2_fd = open(cg2, O_RDONLY); | ||
60 | if (cg2_fd < 0) { | ||
61 | fprintf(stderr, "open(%s,...): %s(%d)\n", | ||
62 | cg2, strerror(errno), errno); | ||
63 | goto out; | ||
64 | } | ||
65 | |||
66 | if (create_array) { | ||
67 | array_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY, | ||
68 | sizeof(uint32_t), sizeof(uint32_t), | ||
69 | 1, 0); | ||
70 | if (array_fd < 0) { | ||
71 | fprintf(stderr, | ||
72 | "bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY,...): %s(%d)\n", | ||
73 | strerror(errno), errno); | ||
74 | goto out; | ||
75 | } | ||
76 | } else { | ||
77 | array_fd = bpf_obj_get(pinned_file); | ||
78 | if (array_fd < 0) { | ||
79 | fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n", | ||
80 | pinned_file, strerror(errno), errno); | ||
81 | goto out; | ||
82 | } | ||
83 | } | ||
84 | |||
85 | ret = bpf_map_update_elem(array_fd, &array_key, &cg2_fd, 0); | ||
86 | if (ret) { | ||
87 | perror("bpf_map_update_elem"); | ||
88 | goto out; | ||
89 | } | ||
90 | |||
91 | if (create_array) { | ||
92 | ret = bpf_obj_pin(array_fd, pinned_file); | ||
93 | if (ret) { | ||
94 | fprintf(stderr, "bpf_obj_pin(..., %s): %s(%d)\n", | ||
95 | pinned_file, strerror(errno), errno); | ||
96 | goto out; | ||
97 | } | ||
98 | } | ||
99 | |||
100 | out: | ||
101 | if (array_fd != -1) | ||
102 | close(array_fd); | ||
103 | if (cg2_fd != -1) | ||
104 | close(cg2_fd); | ||
105 | return ret; | ||
106 | } | ||
diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c new file mode 100644 index 000000000..20fbd1241 --- /dev/null +++ b/samples/bpf/test_cgrp2_attach.c | |||
@@ -0,0 +1,172 @@ | |||
1 | /* eBPF example program: | ||
2 | * | ||
3 | * - Creates an array map in the kernel with 4-byte keys and 8-byte values | ||
4 | * | ||
5 | * - Loads eBPF program | ||
6 | * | ||
7 | * The eBPF program accesses the map passed in to store two pieces of | ||
8 | * information. The number of invocations of the program, which maps | ||
9 | * to the number of packets received, is stored to key 0. Key 1 is | ||
10 | * incremented on each iteration by the number of bytes stored in | ||
11 | * the skb. | ||
12 | * | ||
13 | * - Attaches the new program to a cgroup using BPF_PROG_ATTACH | ||
14 | * | ||
15 | * - Every second, reads map[0] and map[1] to see how many bytes and | ||
16 | * packets were seen on any socket of tasks in the given cgroup. | ||
17 | */ | ||
18 | |||
19 | #define _GNU_SOURCE | ||
20 | |||
21 | #include <stdio.h> | ||
22 | #include <stdlib.h> | ||
23 | #include <stddef.h> | ||
24 | #include <string.h> | ||
25 | #include <unistd.h> | ||
26 | #include <assert.h> | ||
27 | #include <errno.h> | ||
28 | #include <fcntl.h> | ||
29 | |||
30 | #include <linux/bpf.h> | ||
31 | #include <bpf/bpf.h> | ||
32 | |||
33 | #include "bpf_insn.h" | ||
34 | |||
35 | enum { | ||
36 | MAP_KEY_PACKETS, | ||
37 | MAP_KEY_BYTES, | ||
38 | }; | ||
39 | |||
40 | char bpf_log_buf[BPF_LOG_BUF_SIZE]; | ||
41 | |||
42 | static int prog_load(int map_fd, int verdict) | ||
43 | { | ||
44 | struct bpf_insn prog[] = { | ||
45 | BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* save ctx (r1) in r6 so it's not clobbered by BPF_CALL */ | ||
46 | |||
47 | /* Count packets */ | ||
48 | BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */ | ||
49 | BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ | ||
50 | BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), | ||
51 | BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ | ||
52 | BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* load map fd to r1 */ | ||
53 | BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), | ||
54 | BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), | ||
55 | BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ | ||
56 | BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ | ||
57 | |||
58 | /* Count bytes */ | ||
59 | BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */ | ||
60 | BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ | ||
61 | BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), | ||
62 | BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ | ||
63 | BPF_LD_MAP_FD(BPF_REG_1, map_fd), | ||
64 | BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), | ||
65 | BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), | ||
66 | BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */ | ||
67 | BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ | ||
68 | |||
69 | BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ | ||
70 | BPF_EXIT_INSN(), | ||
71 | }; | ||
72 | size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); | ||
73 | |||
74 | return bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, | ||
75 | prog, insns_cnt, "GPL", 0, | ||
76 | bpf_log_buf, BPF_LOG_BUF_SIZE); | ||
77 | } | ||
78 | |||
79 | static int usage(const char *argv0) | ||
80 | { | ||
81 | printf("Usage: %s [-d] [-D] <cg-path> <egress|ingress>\n", argv0); | ||
82 | printf(" -d Drop Traffic\n"); | ||
83 | printf(" -D Detach filter, and exit\n"); | ||
84 | return EXIT_FAILURE; | ||
85 | } | ||
86 | |||
87 | static int attach_filter(int cg_fd, int type, int verdict) | ||
88 | { | ||
89 | int prog_fd, map_fd, ret, key; | ||
90 | long long pkt_cnt, byte_cnt; | ||
91 | |||
92 | map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, | ||
93 | sizeof(key), sizeof(byte_cnt), | ||
94 | 256, 0); | ||
95 | if (map_fd < 0) { | ||
96 | printf("Failed to create map: '%s'\n", strerror(errno)); | ||
97 | return EXIT_FAILURE; | ||
98 | } | ||
99 | |||
100 | prog_fd = prog_load(map_fd, verdict); | ||
101 | printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf); | ||
102 | |||
103 | if (prog_fd < 0) { | ||
104 | printf("Failed to load prog: '%s'\n", strerror(errno)); | ||
105 | return EXIT_FAILURE; | ||
106 | } | ||
107 | |||
108 | ret = bpf_prog_attach(prog_fd, cg_fd, type, 0); | ||
109 | if (ret < 0) { | ||
110 | printf("Failed to attach prog to cgroup: '%s'\n", | ||
111 | strerror(errno)); | ||
112 | return EXIT_FAILURE; | ||
113 | } | ||
114 | while (1) { | ||
115 | key = MAP_KEY_PACKETS; | ||
116 | assert(bpf_map_lookup_elem(map_fd, &key, &pkt_cnt) == 0); | ||
117 | |||
118 | key = MAP_KEY_BYTES; | ||
119 | assert(bpf_map_lookup_elem(map_fd, &key, &byte_cnt) == 0); | ||
120 | |||
121 | printf("cgroup received %lld packets, %lld bytes\n", | ||
122 | pkt_cnt, byte_cnt); | ||
123 | sleep(1); | ||
124 | } | ||
125 | |||
126 | return EXIT_SUCCESS; | ||
127 | } | ||
128 | |||
129 | int main(int argc, char **argv) | ||
130 | { | ||
131 | int detach_only = 0, verdict = 1; | ||
132 | enum bpf_attach_type type; | ||
133 | int opt, cg_fd, ret; | ||
134 | |||
135 | while ((opt = getopt(argc, argv, "Dd")) != -1) { | ||
136 | switch (opt) { | ||
137 | case 'd': | ||
138 | verdict = 0; | ||
139 | break; | ||
140 | case 'D': | ||
141 | detach_only = 1; | ||
142 | break; | ||
143 | default: | ||
144 | return usage(argv[0]); | ||
145 | } | ||
146 | } | ||
147 | |||
148 | if (argc - optind < 2) | ||
149 | return usage(argv[0]); | ||
150 | |||
151 | if (strcmp(argv[optind + 1], "ingress") == 0) | ||
152 | type = BPF_CGROUP_INET_INGRESS; | ||
153 | else if (strcmp(argv[optind + 1], "egress") == 0) | ||
154 | type = BPF_CGROUP_INET_EGRESS; | ||
155 | else | ||
156 | return usage(argv[0]); | ||
157 | |||
158 | cg_fd = open(argv[optind], O_DIRECTORY | O_RDONLY); | ||
159 | if (cg_fd < 0) { | ||
160 | printf("Failed to open cgroup path: '%s'\n", strerror(errno)); | ||
161 | return EXIT_FAILURE; | ||
162 | } | ||
163 | |||
164 | if (detach_only) { | ||
165 | ret = bpf_prog_detach(cg_fd, type); | ||
166 | printf("bpf_prog_detach() returned '%s' (%d)\n", | ||
167 | strerror(errno), errno); | ||
168 | } else | ||
169 | ret = attach_filter(cg_fd, type, verdict); | ||
170 | |||
171 | return ret; | ||
172 | } | ||
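A hedged usage sketch for test_cgrp2_attach; the cgroup2 mount point and child
cgroup name below are illustrative assumptions, not part of the sample:

  mount -t cgroup2 none /tmp/cgroupv2               # skip if cgroup2 is already mounted
  mkdir -p /tmp/cgroupv2/foo
  echo $$ >> /tmp/cgroupv2/foo/cgroup.procs         # put this shell (and its children) in the cgroup
  ./test_cgrp2_attach /tmp/cgroupv2/foo egress &    # attach; prints packet/byte counts every second
  ping -c 5 127.0.0.1                               # traffic from tasks in the cgroup shows up in the counts
  kill %1                                           # stop the monitoring loop
  ./test_cgrp2_attach -D /tmp/cgroupv2/foo egress   # detach the filter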
diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c new file mode 100644 index 000000000..b0811da5a --- /dev/null +++ b/samples/bpf/test_cgrp2_sock.c | |||
@@ -0,0 +1,290 @@ | |||
1 | /* eBPF example program: | ||
2 | * | ||
3 | * - Loads eBPF program | ||
4 | * | ||
5 | * The eBPF program sets the sk_bound_dev_if index in new AF_INET{6} | ||
6 | * sockets opened by processes in the cgroup. | ||
7 | * | ||
8 | * - Attaches the new program to a cgroup using BPF_PROG_ATTACH | ||
9 | */ | ||
10 | |||
11 | #define _GNU_SOURCE | ||
12 | |||
13 | #include <stdio.h> | ||
14 | #include <stdlib.h> | ||
15 | #include <stddef.h> | ||
16 | #include <string.h> | ||
17 | #include <unistd.h> | ||
18 | #include <assert.h> | ||
19 | #include <errno.h> | ||
20 | #include <fcntl.h> | ||
21 | #include <net/if.h> | ||
22 | #include <inttypes.h> | ||
23 | #include <linux/bpf.h> | ||
24 | #include <bpf/bpf.h> | ||
25 | |||
26 | #include "bpf_insn.h" | ||
27 | |||
28 | char bpf_log_buf[BPF_LOG_BUF_SIZE]; | ||
29 | |||
30 | static int prog_load(__u32 idx, __u32 mark, __u32 prio) | ||
31 | { | ||
32 | /* save pointer to context */ | ||
33 | struct bpf_insn prog_start[] = { | ||
34 | BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), | ||
35 | }; | ||
36 | struct bpf_insn prog_end[] = { | ||
37 | BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */ | ||
38 | BPF_EXIT_INSN(), | ||
39 | }; | ||
40 | |||
41 | /* set sk_bound_dev_if on socket */ | ||
42 | struct bpf_insn prog_dev[] = { | ||
43 | BPF_MOV64_IMM(BPF_REG_3, idx), | ||
44 | BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)), | ||
45 | BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)), | ||
46 | }; | ||
47 | |||
48 | /* set mark on socket */ | ||
49 | struct bpf_insn prog_mark[] = { | ||
50 | /* get uid of process */ | ||
51 | BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, | ||
52 | BPF_FUNC_get_current_uid_gid), | ||
53 | BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffffffff), | ||
54 | |||
55 | /* if uid is 0, use given mark, else use the uid as the mark */ | ||
56 | BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), | ||
57 | BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), | ||
58 | BPF_MOV64_IMM(BPF_REG_3, mark), | ||
59 | |||
60 | /* set the mark on the new socket */ | ||
61 | BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), | ||
62 | BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, mark)), | ||
63 | BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, mark)), | ||
64 | }; | ||
65 | |||
66 | /* set priority on socket */ | ||
67 | struct bpf_insn prog_prio[] = { | ||
68 | BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), | ||
69 | BPF_MOV64_IMM(BPF_REG_3, prio), | ||
70 | BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, priority)), | ||
71 | BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, priority)), | ||
72 | }; | ||
73 | |||
74 | struct bpf_insn *prog; | ||
75 | size_t insns_cnt; | ||
76 | void *p; | ||
77 | int ret; | ||
78 | |||
79 | insns_cnt = sizeof(prog_start) + sizeof(prog_end); | ||
80 | if (idx) | ||
81 | insns_cnt += sizeof(prog_dev); | ||
82 | |||
83 | if (mark) | ||
84 | insns_cnt += sizeof(prog_mark); | ||
85 | |||
86 | if (prio) | ||
87 | insns_cnt += sizeof(prog_prio); | ||
88 | |||
89 | p = prog = malloc(insns_cnt); | ||
90 | if (!prog) { | ||
91 | fprintf(stderr, "Failed to allocate memory for instructions\n"); | ||
92 | return EXIT_FAILURE; | ||
93 | } | ||
94 | |||
95 | memcpy(p, prog_start, sizeof(prog_start)); | ||
96 | p += sizeof(prog_start); | ||
97 | |||
98 | if (idx) { | ||
99 | memcpy(p, prog_dev, sizeof(prog_dev)); | ||
100 | p += sizeof(prog_dev); | ||
101 | } | ||
102 | |||
103 | if (mark) { | ||
104 | memcpy(p, prog_mark, sizeof(prog_mark)); | ||
105 | p += sizeof(prog_mark); | ||
106 | } | ||
107 | |||
108 | if (prio) { | ||
109 | memcpy(p, prog_prio, sizeof(prog_prio)); | ||
110 | p += sizeof(prog_prio); | ||
111 | } | ||
112 | |||
113 | memcpy(p, prog_end, sizeof(prog_end)); | ||
114 | p += sizeof(prog_end); | ||
115 | |||
116 | insns_cnt /= sizeof(struct bpf_insn); | ||
117 | |||
118 | ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt, | ||
119 | "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE); | ||
120 | |||
121 | free(prog); | ||
122 | |||
123 | return ret; | ||
124 | } | ||
125 | |||
126 | static int get_bind_to_device(int sd, char *name, size_t len) | ||
127 | { | ||
128 | socklen_t optlen = len; | ||
129 | int rc; | ||
130 | |||
131 | name[0] = '\0'; | ||
132 | rc = getsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, name, &optlen); | ||
133 | if (rc < 0) | ||
134 | perror("setsockopt(SO_BINDTODEVICE)"); | ||
135 | |||
136 | return rc; | ||
137 | } | ||
138 | |||
139 | static unsigned int get_somark(int sd) | ||
140 | { | ||
141 | unsigned int mark = 0; | ||
142 | socklen_t optlen = sizeof(mark); | ||
143 | int rc; | ||
144 | |||
145 | rc = getsockopt(sd, SOL_SOCKET, SO_MARK, &mark, &optlen); | ||
146 | if (rc < 0) | ||
147 | perror("getsockopt(SO_MARK)"); | ||
148 | |||
149 | return mark; | ||
150 | } | ||
151 | |||
152 | static unsigned int get_priority(int sd) | ||
153 | { | ||
154 | unsigned int prio = 0; | ||
155 | socklen_t optlen = sizeof(prio); | ||
156 | int rc; | ||
157 | |||
158 | rc = getsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, &optlen); | ||
159 | if (rc < 0) | ||
160 | perror("getsockopt(SO_PRIORITY)"); | ||
161 | |||
162 | return prio; | ||
163 | } | ||
164 | |||
165 | static int show_sockopts(int family) | ||
166 | { | ||
167 | unsigned int mark, prio; | ||
168 | char name[16]; | ||
169 | int sd; | ||
170 | |||
171 | sd = socket(family, SOCK_DGRAM, 17); /* protocol 17 = UDP */ | ||
172 | if (sd < 0) { | ||
173 | perror("socket"); | ||
174 | return 1; | ||
175 | } | ||
176 | |||
177 | if (get_bind_to_device(sd, name, sizeof(name)) < 0) | ||
178 | return 1; | ||
179 | |||
180 | mark = get_somark(sd); | ||
181 | prio = get_priority(sd); | ||
182 | |||
183 | close(sd); | ||
184 | |||
185 | printf("sd %d: dev %s, mark %u, priority %u\n", sd, name, mark, prio); | ||
186 | |||
187 | return 0; | ||
188 | } | ||
189 | |||
190 | static int usage(const char *argv0) | ||
191 | { | ||
192 | printf("Usage:\n"); | ||
193 | printf(" Attach a program\n"); | ||
194 | printf(" %s -b bind-to-dev -m mark -p prio cg-path\n", argv0); | ||
195 | printf("\n"); | ||
196 | printf(" Detach a program\n"); | ||
197 | printf(" %s -d cg-path\n", argv0); | ||
198 | printf("\n"); | ||
199 | printf(" Show inherited socket settings (mark, priority, and device)\n"); | ||
200 | printf(" %s [-6]\n", argv0); | ||
201 | return EXIT_FAILURE; | ||
202 | } | ||
203 | |||
204 | int main(int argc, char **argv) | ||
205 | { | ||
206 | __u32 idx = 0, mark = 0, prio = 0; | ||
207 | const char *cgrp_path = NULL; | ||
208 | int cg_fd, prog_fd, ret; | ||
209 | int family = PF_INET; | ||
210 | int do_attach = 1; | ||
211 | int rc; | ||
212 | |||
213 | while ((rc = getopt(argc, argv, "db:m:p:6")) != -1) { | ||
214 | switch (rc) { | ||
215 | case 'd': | ||
216 | do_attach = 0; | ||
217 | break; | ||
218 | case 'b': | ||
219 | idx = if_nametoindex(optarg); | ||
220 | if (!idx) { | ||
221 | idx = strtoumax(optarg, NULL, 0); | ||
222 | if (!idx) { | ||
223 | printf("Invalid device name\n"); | ||
224 | return EXIT_FAILURE; | ||
225 | } | ||
226 | } | ||
227 | break; | ||
228 | case 'm': | ||
229 | mark = strtoumax(optarg, NULL, 0); | ||
230 | break; | ||
231 | case 'p': | ||
232 | prio = strtoumax(optarg, NULL, 0); | ||
233 | break; | ||
234 | case '6': | ||
235 | family = PF_INET6; | ||
236 | break; | ||
237 | default: | ||
238 | return usage(argv[0]); | ||
239 | } | ||
240 | } | ||
241 | |||
242 | if (optind == argc) | ||
243 | return show_sockopts(family); | ||
244 | |||
245 | cgrp_path = argv[optind]; | ||
246 | if (!cgrp_path) { | ||
247 | fprintf(stderr, "cgroup path not given\n"); | ||
248 | return EXIT_FAILURE; | ||
249 | } | ||
250 | |||
251 | if (do_attach && !idx && !mark && !prio) { | ||
252 | fprintf(stderr, | ||
253 | "One of device, mark or priority must be given\n"); | ||
254 | return EXIT_FAILURE; | ||
255 | } | ||
256 | |||
257 | cg_fd = open(cgrp_path, O_DIRECTORY | O_RDONLY); | ||
258 | if (cg_fd < 0) { | ||
259 | printf("Failed to open cgroup path: '%s'\n", strerror(errno)); | ||
260 | return EXIT_FAILURE; | ||
261 | } | ||
262 | |||
263 | if (do_attach) { | ||
264 | prog_fd = prog_load(idx, mark, prio); | ||
265 | if (prog_fd < 0) { | ||
266 | printf("Failed to load prog: '%s'\n", strerror(errno)); | ||
267 | printf("Output from kernel verifier:\n%s\n-------\n", | ||
268 | bpf_log_buf); | ||
269 | return EXIT_FAILURE; | ||
270 | } | ||
271 | |||
272 | ret = bpf_prog_attach(prog_fd, cg_fd, | ||
273 | BPF_CGROUP_INET_SOCK_CREATE, 0); | ||
274 | if (ret < 0) { | ||
275 | printf("Failed to attach prog to cgroup: '%s'\n", | ||
276 | strerror(errno)); | ||
277 | return EXIT_FAILURE; | ||
278 | } | ||
279 | } else { | ||
280 | ret = bpf_prog_detach(cg_fd, BPF_CGROUP_INET_SOCK_CREATE); | ||
281 | if (ret < 0) { | ||
282 | printf("Failed to detach prog from cgroup: '%s'\n", | ||
283 | strerror(errno)); | ||
284 | return EXIT_FAILURE; | ||
285 | } | ||
286 | } | ||
287 | |||
288 | close(cg_fd); | ||
289 | return EXIT_SUCCESS; | ||
290 | } | ||
diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh new file mode 100755 index 000000000..9f6174236 --- /dev/null +++ b/samples/bpf/test_cgrp2_sock.sh | |||
@@ -0,0 +1,135 @@ | |||
1 | #!/bin/sh | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | |||
4 | # Test various socket options that can be set by attaching programs to cgroups. | ||
5 | |||
6 | CGRP_MNT="/tmp/cgroupv2-test_cgrp2_sock" | ||
7 | |||
8 | ################################################################################ | ||
9 | # | ||
10 | print_result() | ||
11 | { | ||
12 | local rc=$1 | ||
13 | local status=" OK " | ||
14 | |||
15 | [ $rc -ne 0 ] && status="FAIL" | ||
16 | |||
17 | printf "%-50s [%4s]\n" "$2" "$status" | ||
18 | } | ||
19 | |||
20 | check_sock() | ||
21 | { | ||
22 | out=$(test_cgrp2_sock) | ||
23 | echo $out | grep -q "$1" | ||
24 | if [ $? -ne 0 ]; then | ||
25 | print_result 1 "IPv4: $2" | ||
26 | echo " expected: $1" | ||
27 | echo " have: $out" | ||
28 | rc=1 | ||
29 | else | ||
30 | print_result 0 "IPv4: $2" | ||
31 | fi | ||
32 | } | ||
33 | |||
34 | check_sock6() | ||
35 | { | ||
36 | out=$(test_cgrp2_sock -6) | ||
37 | echo $out | grep -q "$1" | ||
38 | if [ $? -ne 0 ]; then | ||
39 | print_result 1 "IPv6: $2" | ||
40 | echo " expected: $1" | ||
41 | echo " have: $out" | ||
42 | rc=1 | ||
43 | else | ||
44 | print_result 0 "IPv6: $2" | ||
45 | fi | ||
46 | } | ||
47 | |||
48 | ################################################################################ | ||
49 | # | ||
50 | |||
51 | cleanup() | ||
52 | { | ||
53 | echo $$ >> ${CGRP_MNT}/cgroup.procs | ||
54 | rmdir ${CGRP_MNT}/sockopts | ||
55 | } | ||
56 | |||
57 | cleanup_and_exit() | ||
58 | { | ||
59 | local rc=$1 | ||
60 | local msg="$2" | ||
61 | |||
62 | [ -n "$msg" ] && echo "ERROR: $msg" | ||
63 | |||
64 | test_cgrp2_sock -d ${CGRP_MNT}/sockopts | ||
65 | ip li del cgrp2_sock | ||
66 | umount ${CGRP_MNT} | ||
67 | |||
68 | exit $rc | ||
69 | } | ||
70 | |||
71 | |||
72 | ################################################################################ | ||
73 | # main | ||
74 | |||
75 | rc=0 | ||
76 | |||
77 | ip li add cgrp2_sock type dummy 2>/dev/null | ||
78 | |||
79 | set -e | ||
80 | mkdir -p ${CGRP_MNT} | ||
81 | mount -t cgroup2 none ${CGRP_MNT} | ||
82 | set +e | ||
83 | |||
84 | |||
85 | # make sure we have a known start point | ||
86 | cleanup 2>/dev/null | ||
87 | |||
88 | mkdir -p ${CGRP_MNT}/sockopts | ||
89 | [ $? -ne 0 ] && cleanup_and_exit 1 "Failed to create cgroup hierarchy" | ||
90 | |||
91 | |||
92 | # set pid into cgroup | ||
93 | echo $$ > ${CGRP_MNT}/sockopts/cgroup.procs | ||
94 | |||
95 | # no bpf program attached, so socket should show no settings | ||
96 | check_sock "dev , mark 0, priority 0" "No programs attached" | ||
97 | check_sock6 "dev , mark 0, priority 0" "No programs attached" | ||
98 | |||
99 | # verify device is set | ||
100 | # | ||
101 | test_cgrp2_sock -b cgrp2_sock ${CGRP_MNT}/sockopts | ||
102 | if [ $? -ne 0 ]; then | ||
103 | cleanup_and_exit 1 "Failed to install program to set device" | ||
104 | fi | ||
105 | check_sock "dev cgrp2_sock, mark 0, priority 0" "Device set" | ||
106 | check_sock6 "dev cgrp2_sock, mark 0, priority 0" "Device set" | ||
107 | |||
108 | # verify mark is set | ||
109 | # | ||
110 | test_cgrp2_sock -m 666 ${CGRP_MNT}/sockopts | ||
111 | if [ $? -ne 0 ]; then | ||
112 | cleanup_and_exit 1 "Failed to install program to set mark" | ||
113 | fi | ||
114 | check_sock "dev , mark 666, priority 0" "Mark set" | ||
115 | check_sock6 "dev , mark 666, priority 0" "Mark set" | ||
116 | |||
117 | # verify priority is set | ||
118 | # | ||
119 | test_cgrp2_sock -p 123 ${CGRP_MNT}/sockopts | ||
120 | if [ $? -ne 0 ]; then | ||
121 | cleanup_and_exit 1 "Failed to install program to set priority" | ||
122 | fi | ||
123 | check_sock "dev , mark 0, priority 123" "Priority set" | ||
124 | check_sock6 "dev , mark 0, priority 123" "Priority set" | ||
125 | |||
126 | # all 3 at once | ||
127 | # | ||
128 | test_cgrp2_sock -b cgrp2_sock -m 666 -p 123 ${CGRP_MNT}/sockopts | ||
129 | if [ $? -ne 0 ]; then | ||
130 | cleanup_and_exit 1 "Failed to install program to set device, mark and priority" | ||
131 | fi | ||
132 | check_sock "dev cgrp2_sock, mark 666, priority 123" "Priority set" | ||
133 | check_sock6 "dev cgrp2_sock, mark 666, priority 123" "Priority set" | ||
134 | |||
135 | cleanup_and_exit $rc | ||
diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c new file mode 100644 index 000000000..a9277b118 --- /dev/null +++ b/samples/bpf/test_cgrp2_sock2.c | |||
@@ -0,0 +1,68 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* eBPF example program: | ||
3 | * | ||
4 | * - Loads eBPF program | ||
5 | * | ||
6 | * Loads a BPF filter from an object file and attaches the selected | ||
7 | * program to a cgroup using BPF_PROG_ATTACH | ||
8 | */ | ||
9 | |||
10 | #define _GNU_SOURCE | ||
11 | |||
12 | #include <stdio.h> | ||
13 | #include <stdlib.h> | ||
14 | #include <stddef.h> | ||
15 | #include <string.h> | ||
16 | #include <unistd.h> | ||
17 | #include <assert.h> | ||
18 | #include <errno.h> | ||
19 | #include <fcntl.h> | ||
20 | #include <net/if.h> | ||
21 | #include <linux/bpf.h> | ||
22 | #include <bpf/bpf.h> | ||
23 | |||
24 | #include "bpf_insn.h" | ||
25 | #include "bpf_load.h" | ||
26 | |||
27 | static int usage(const char *argv0) | ||
28 | { | ||
29 | printf("Usage: %s cg-path filter-path [filter-id]\n", argv0); | ||
30 | return EXIT_FAILURE; | ||
31 | } | ||
32 | |||
33 | int main(int argc, char **argv) | ||
34 | { | ||
35 | int cg_fd, ret, filter_id = 0; | ||
36 | |||
37 | if (argc < 3) | ||
38 | return usage(argv[0]); | ||
39 | |||
40 | cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY); | ||
41 | if (cg_fd < 0) { | ||
42 | printf("Failed to open cgroup path: '%s'\n", strerror(errno)); | ||
43 | return EXIT_FAILURE; | ||
44 | } | ||
45 | |||
46 | if (load_bpf_file(argv[2])) | ||
47 | return EXIT_FAILURE; | ||
48 | |||
49 | printf("Output from kernel verifier:\n%s\n-------\n", bpf_log_buf); | ||
50 | |||
51 | if (argc > 3) | ||
52 | filter_id = atoi(argv[3]); | ||
53 | |||
54 | if (filter_id >= prog_cnt) { | ||
55 | printf("Invalid program id; program not found in file\n"); | ||
56 | return EXIT_FAILURE; | ||
57 | } | ||
58 | |||
59 | ret = bpf_prog_attach(prog_fd[filter_id], cg_fd, | ||
60 | BPF_CGROUP_INET_SOCK_CREATE, 0); | ||
61 | if (ret < 0) { | ||
62 | printf("Failed to attach prog to cgroup: '%s'\n", | ||
63 | strerror(errno)); | ||
64 | return EXIT_FAILURE; | ||
65 | } | ||
66 | |||
67 | return EXIT_SUCCESS; | ||
68 | } | ||
diff --git a/samples/bpf/test_cgrp2_sock2.sh b/samples/bpf/test_cgrp2_sock2.sh new file mode 100755 index 000000000..0f396a86e --- /dev/null +++ b/samples/bpf/test_cgrp2_sock2.sh | |||
@@ -0,0 +1,85 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | |||
4 | function config_device { | ||
5 | ip netns add at_ns0 | ||
6 | ip link add veth0 type veth peer name veth0b | ||
7 | ip link set veth0b up | ||
8 | ip link set veth0 netns at_ns0 | ||
9 | ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 | ||
10 | ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad | ||
11 | ip netns exec at_ns0 ip link set dev veth0 up | ||
12 | ip addr add 172.16.1.101/24 dev veth0b | ||
13 | ip addr add 2401:db00::2/64 dev veth0b nodad | ||
14 | } | ||
15 | |||
16 | function config_cgroup { | ||
17 | rm -rf /tmp/cgroupv2 | ||
18 | mkdir -p /tmp/cgroupv2 | ||
19 | mount -t cgroup2 none /tmp/cgroupv2 | ||
20 | mkdir -p /tmp/cgroupv2/foo | ||
21 | echo $$ >> /tmp/cgroupv2/foo/cgroup.procs | ||
22 | } | ||
23 | |||
24 | |||
25 | function attach_bpf { | ||
26 | test_cgrp2_sock2 /tmp/cgroupv2/foo sock_flags_kern.o $1 | ||
27 | [ $? -ne 0 ] && exit 1 | ||
28 | } | ||
29 | |||
30 | function cleanup { | ||
31 | if [ -d /tmp/cgroupv2/foo ]; then | ||
32 | test_cgrp2_sock -d /tmp/cgroupv2/foo | ||
33 | fi | ||
34 | ip link del veth0b | ||
35 | ip netns delete at_ns0 | ||
36 | umount /tmp/cgroupv2 | ||
37 | rm -rf /tmp/cgroupv2 | ||
38 | } | ||
39 | |||
40 | cleanup 2>/dev/null | ||
41 | |||
42 | set -e | ||
43 | config_device | ||
44 | config_cgroup | ||
45 | set +e | ||
46 | |||
47 | # | ||
48 | # Test 1 - fail ping6 | ||
49 | # | ||
50 | attach_bpf 0 | ||
51 | ping -c1 -w1 172.16.1.100 | ||
52 | if [ $? -ne 0 ]; then | ||
53 | echo "ping failed when it should succeed" | ||
54 | cleanup | ||
55 | exit 1 | ||
56 | fi | ||
57 | |||
58 | ping6 -c1 -w1 2401:db00::1 | ||
59 | if [ $? -eq 0 ]; then | ||
60 | echo "ping6 succeeded when it should not" | ||
61 | cleanup | ||
62 | exit 1 | ||
63 | fi | ||
64 | |||
65 | # | ||
66 | # Test 2 - fail ping | ||
67 | # | ||
68 | attach_bpf 1 | ||
69 | ping6 -c1 -w1 2401:db00::1 | ||
70 | if [ $? -ne 0 ]; then | ||
71 | echo "ping6 failed when it should succeed" | ||
72 | cleanup | ||
73 | exit 1 | ||
74 | fi | ||
75 | |||
76 | ping -c1 -w1 172.16.1.100 | ||
77 | if [ $? -eq 0 ]; then | ||
78 | echo "ping succeeded when it should not" | ||
79 | cleanup | ||
80 | exit 1 | ||
81 | fi | ||
82 | |||
83 | cleanup | ||
84 | echo | ||
85 | echo "*** PASS ***" | ||
diff --git a/samples/bpf/test_cgrp2_tc.sh b/samples/bpf/test_cgrp2_tc.sh new file mode 100755 index 000000000..12faf5847 --- /dev/null +++ b/samples/bpf/test_cgrp2_tc.sh | |||
@@ -0,0 +1,185 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | |||
4 | MY_DIR=$(dirname $0) | ||
5 | # Details on the bpf prog | ||
6 | BPF_CGRP2_ARRAY_NAME='test_cgrp2_array_pin' | ||
7 | BPF_PROG="$MY_DIR/test_cgrp2_tc_kern.o" | ||
8 | BPF_SECTION='filter' | ||
9 | |||
10 | [ -z "$TC" ] && TC='tc' | ||
11 | [ -z "$IP" ] && IP='ip' | ||
12 | |||
13 | # Names of the veth interface, net namespace...etc. | ||
14 | HOST_IFC='ve' | ||
15 | NS_IFC='vens' | ||
16 | NS='ns' | ||
17 | |||
18 | find_mnt() { | ||
19 | cat /proc/mounts | \ | ||
20 | awk '{ if ($3 == "'$1'" && mnt == "") { mnt = $2 }} END { print mnt }' | ||
21 | } | ||
22 | |||
23 | # Init cgroup2 vars | ||
24 | init_cgrp2_vars() { | ||
25 | CGRP2_ROOT=$(find_mnt cgroup2) | ||
26 | if [ -z "$CGRP2_ROOT" ] | ||
27 | then | ||
28 | CGRP2_ROOT='/mnt/cgroup2' | ||
29 | MOUNT_CGRP2="yes" | ||
30 | fi | ||
31 | CGRP2_TC="$CGRP2_ROOT/tc" | ||
32 | CGRP2_TC_LEAF="$CGRP2_TC/leaf" | ||
33 | } | ||
34 | |||
35 | # Init bpf fs vars | ||
36 | init_bpf_fs_vars() { | ||
37 | local bpf_fs_root=$(find_mnt bpf) | ||
38 | [ -n "$bpf_fs_root" ] || return -1 | ||
39 | BPF_FS_TC_SHARE="$bpf_fs_root/tc/globals" | ||
40 | } | ||
41 | |||
42 | setup_cgrp2() { | ||
43 | case $1 in | ||
44 | start) | ||
45 | if [ "$MOUNT_CGRP2" == 'yes' ] | ||
46 | then | ||
47 | [ -d $CGRP2_ROOT ] || mkdir -p $CGRP2_ROOT | ||
48 | mount -t cgroup2 none $CGRP2_ROOT || return $? | ||
49 | fi | ||
50 | mkdir -p $CGRP2_TC_LEAF | ||
51 | ;; | ||
52 | *) | ||
53 | rmdir $CGRP2_TC_LEAF && rmdir $CGRP2_TC | ||
54 | [ "$MOUNT_CGRP2" == 'yes' ] && umount $CGRP2_ROOT | ||
55 | ;; | ||
56 | esac | ||
57 | } | ||
58 | |||
59 | setup_bpf_cgrp2_array() { | ||
60 | local bpf_cgrp2_array="$BPF_FS_TC_SHARE/$BPF_CGRP2_ARRAY_NAME" | ||
61 | case $1 in | ||
62 | start) | ||
63 | $MY_DIR/test_cgrp2_array_pin -U $bpf_cgrp2_array -v $CGRP2_TC | ||
64 | ;; | ||
65 | *) | ||
66 | [ -d "$BPF_FS_TC_SHARE" ] && rm -f $bpf_cgrp2_array | ||
67 | ;; | ||
68 | esac | ||
69 | } | ||
70 | |||
71 | setup_net() { | ||
72 | case $1 in | ||
73 | start) | ||
74 | $IP link add $HOST_IFC type veth peer name $NS_IFC || return $? | ||
75 | $IP link set dev $HOST_IFC up || return $? | ||
76 | sysctl -q net.ipv6.conf.$HOST_IFC.accept_dad=0 | ||
77 | |||
78 | $IP netns add ns || return $? | ||
79 | $IP link set dev $NS_IFC netns ns || return $? | ||
80 | $IP -n $NS link set dev $NS_IFC up || return $? | ||
81 | $IP netns exec $NS sysctl -q net.ipv6.conf.$NS_IFC.accept_dad=0 | ||
82 | $TC qdisc add dev $HOST_IFC clsact || return $? | ||
83 | $TC filter add dev $HOST_IFC egress bpf da obj $BPF_PROG sec $BPF_SECTION || return $? | ||
84 | ;; | ||
85 | *) | ||
86 | $IP netns del $NS | ||
87 | $IP link del $HOST_IFC | ||
88 | ;; | ||
89 | esac | ||
90 | } | ||
91 | |||
92 | run_in_cgrp() { | ||
93 | # Fork another bash and move it under the specified cgroup. | ||
94 | # It makes the cgroup cleanup easier at the end of the test. | ||
95 | cmd='echo $$ > ' | ||
96 | cmd="$cmd $1/cgroup.procs; exec $2" | ||
97 | bash -c "$cmd" | ||
98 | } | ||
99 | |||
100 | do_test() { | ||
101 | run_in_cgrp $CGRP2_TC_LEAF "ping -6 -c3 ff02::1%$HOST_IFC >& /dev/null" | ||
102 | local dropped=$($TC -s qdisc show dev $HOST_IFC | tail -3 | \ | ||
103 | awk '/drop/{print substr($7, 0, index($7, ",")-1)}') | ||
104 | if [[ $dropped -eq 0 ]] | ||
105 | then | ||
106 | echo "FAIL" | ||
107 | return 1 | ||
108 | else | ||
109 | echo "Successfully filtered $dropped packets" | ||
110 | return 0 | ||
111 | fi | ||
112 | } | ||
113 | |||
114 | do_exit() { | ||
115 | if [ "$DEBUG" == "yes" ] && [ "$MODE" != 'cleanuponly' ] | ||
116 | then | ||
117 | echo "------ DEBUG ------" | ||
118 | echo "mount: "; mount | egrep '(cgroup2|bpf)'; echo | ||
119 | echo "$CGRP2_TC_LEAF: "; ls -l $CGRP2_TC_LEAF; echo | ||
120 | if [ -d "$BPF_FS_TC_SHARE" ] | ||
121 | then | ||
122 | echo "$BPF_FS_TC_SHARE: "; ls -l $BPF_FS_TC_SHARE; echo | ||
123 | fi | ||
124 | echo "Host net:" | ||
125 | $IP netns | ||
126 | $IP link show dev $HOST_IFC | ||
127 | $IP -6 a show dev $HOST_IFC | ||
128 | $TC -s qdisc show dev $HOST_IFC | ||
129 | echo | ||
130 | echo "$NS net:" | ||
131 | $IP -n $NS link show dev $NS_IFC | ||
132 | $IP -n $NS -6 link show dev $NS_IFC | ||
133 | echo "------ DEBUG ------" | ||
134 | echo | ||
135 | fi | ||
136 | |||
137 | if [ "$MODE" != 'nocleanup' ] | ||
138 | then | ||
139 | setup_net stop | ||
140 | setup_bpf_cgrp2_array stop | ||
141 | setup_cgrp2 stop | ||
142 | fi | ||
143 | } | ||
144 | |||
145 | init_cgrp2_vars | ||
146 | init_bpf_fs_vars | ||
147 | |||
148 | while [[ $# -ge 1 ]] | ||
149 | do | ||
150 | a="$1" | ||
151 | case $a in | ||
152 | debug) | ||
153 | DEBUG='yes' | ||
154 | shift 1 | ||
155 | ;; | ||
156 | cleanup-only) | ||
157 | MODE='cleanuponly' | ||
158 | shift 1 | ||
159 | ;; | ||
160 | no-cleanup) | ||
161 | MODE='nocleanup' | ||
162 | shift 1 | ||
163 | ;; | ||
164 | *) | ||
165 | echo "test_cgrp2_tc [debug] [cleanup-only | no-cleanup]" | ||
166 | echo " debug: Print cgrp and network setup details at the end of the test" | ||
167 | echo " cleanup-only: Try to cleanup things from last test. No test will be run" | ||
168 | echo " no-cleanup: Run the test but don't do cleanup at the end" | ||
169 | echo "[Note: If no arg is given, it will run the test and do cleanup at the end]" | ||
170 | echo | ||
171 | exit -1 | ||
172 | ;; | ||
173 | esac | ||
174 | done | ||
175 | |||
176 | trap do_exit 0 | ||
177 | |||
178 | [ "$MODE" == 'cleanuponly' ] && exit | ||
179 | |||
180 | setup_cgrp2 start || exit $? | ||
181 | setup_net start || exit $? | ||
182 | init_bpf_fs_vars || exit $? | ||
183 | setup_bpf_cgrp2_array start || exit $? | ||
184 | do_test | ||
185 | echo | ||
diff --git a/samples/bpf/test_cgrp2_tc_kern.c b/samples/bpf/test_cgrp2_tc_kern.c new file mode 100644 index 000000000..4dd532a31 --- /dev/null +++ b/samples/bpf/test_cgrp2_tc_kern.c | |||
@@ -0,0 +1,70 @@ | |||
1 | /* Copyright (c) 2016 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #define KBUILD_MODNAME "foo" | ||
8 | #include <uapi/linux/if_ether.h> | ||
9 | #include <uapi/linux/in6.h> | ||
10 | #include <uapi/linux/ipv6.h> | ||
11 | #include <uapi/linux/pkt_cls.h> | ||
12 | #include <uapi/linux/bpf.h> | ||
13 | #include <bpf/bpf_helpers.h> | ||
14 | |||
15 | /* copy of 'struct ethhdr' without __packed */ | ||
16 | struct eth_hdr { | ||
17 | unsigned char h_dest[ETH_ALEN]; | ||
18 | unsigned char h_source[ETH_ALEN]; | ||
19 | unsigned short h_proto; | ||
20 | }; | ||
21 | |||
22 | #define PIN_GLOBAL_NS 2 | ||
23 | struct bpf_elf_map { | ||
24 | __u32 type; | ||
25 | __u32 size_key; | ||
26 | __u32 size_value; | ||
27 | __u32 max_elem; | ||
28 | __u32 flags; | ||
29 | __u32 id; | ||
30 | __u32 pinning; | ||
31 | }; | ||
32 | |||
33 | struct bpf_elf_map SEC("maps") test_cgrp2_array_pin = { | ||
34 | .type = BPF_MAP_TYPE_CGROUP_ARRAY, | ||
35 | .size_key = sizeof(uint32_t), | ||
36 | .size_value = sizeof(uint32_t), | ||
37 | .pinning = PIN_GLOBAL_NS, | ||
38 | .max_elem = 1, | ||
39 | }; | ||
40 | |||
41 | SEC("filter") | ||
42 | int handle_egress(struct __sk_buff *skb) | ||
43 | { | ||
44 | void *data = (void *)(long)skb->data; | ||
45 | struct eth_hdr *eth = data; | ||
46 | struct ipv6hdr *ip6h = data + sizeof(*eth); | ||
47 | void *data_end = (void *)(long)skb->data_end; | ||
48 | char dont_care_msg[] = "dont care %04x %d\n"; | ||
49 | char pass_msg[] = "pass\n"; | ||
50 | char reject_msg[] = "reject\n"; | ||
51 | |||
52 | /* single length check */ | ||
53 | if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) | ||
54 | return TC_ACT_OK; | ||
55 | |||
56 | if (eth->h_proto != htons(ETH_P_IPV6) || | ||
57 | ip6h->nexthdr != IPPROTO_ICMPV6) { | ||
58 | bpf_trace_printk(dont_care_msg, sizeof(dont_care_msg), | ||
59 | eth->h_proto, ip6h->nexthdr); | ||
60 | return TC_ACT_OK; | ||
61 | } else if (bpf_skb_under_cgroup(skb, &test_cgrp2_array_pin, 0) != 1) { | ||
62 | bpf_trace_printk(pass_msg, sizeof(pass_msg)); | ||
63 | return TC_ACT_OK; | ||
64 | } else { | ||
65 | bpf_trace_printk(reject_msg, sizeof(reject_msg)); | ||
66 | return TC_ACT_SHOT; | ||
67 | } | ||
68 | } | ||
69 | |||
70 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/test_cls_bpf.sh b/samples/bpf/test_cls_bpf.sh new file mode 100755 index 000000000..aaddd67b3 --- /dev/null +++ b/samples/bpf/test_cls_bpf.sh | |||
@@ -0,0 +1,38 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | |||
4 | function pktgen { | ||
5 | ../pktgen/pktgen_bench_xmit_mode_netif_receive.sh -i $IFC -s 64 \ | ||
6 | -m 90:e2:ba:ff:ff:ff -d 192.168.0.1 -t 4 | ||
7 | local dropped=`tc -s qdisc show dev $IFC | tail -3 | awk '/drop/{print $7}'` | ||
8 | if [ "$dropped" == "0," ]; then | ||
9 | echo "FAIL" | ||
10 | else | ||
11 | echo "Successfully filtered " $dropped " packets" | ||
12 | fi | ||
13 | } | ||
14 | |||
15 | function test { | ||
16 | echo -n "Loading bpf program '$2'... " | ||
17 | tc qdisc add dev $IFC clsact | ||
18 | tc filter add dev $IFC ingress bpf da obj $1 sec $2 | ||
19 | local status=$? | ||
20 | if [ $status -ne 0 ]; then | ||
21 | echo "FAIL" | ||
22 | else | ||
23 | echo "ok" | ||
24 | pktgen | ||
25 | fi | ||
26 | tc qdisc del dev $IFC clsact | ||
27 | } | ||
28 | |||
29 | IFC=test_veth | ||
30 | |||
31 | ip link add name $IFC type veth peer name pair_$IFC | ||
32 | ip link set $IFC up | ||
33 | ip link set pair_$IFC up | ||
34 | |||
35 | test ./parse_simple.o simple | ||
36 | test ./parse_varlen.o varlen | ||
37 | test ./parse_ldabs.o ldabs | ||
38 | ip link del dev $IFC | ||
diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c new file mode 100644 index 000000000..fbd43e2bb --- /dev/null +++ b/samples/bpf/test_current_task_under_cgroup_kern.c | |||
@@ -0,0 +1,44 @@ | |||
1 | /* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me> | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | |||
8 | #include <linux/ptrace.h> | ||
9 | #include <uapi/linux/bpf.h> | ||
10 | #include <linux/version.h> | ||
11 | #include <bpf/bpf_helpers.h> | ||
12 | #include <uapi/linux/utsname.h> | ||
13 | #include "trace_common.h" | ||
14 | |||
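| /* cgroup_map is populated from user space with the target cgroup fd at | ||
| * index 0; only tasks inside that cgroup update perf_map below. | ||
| */ | ||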
15 | struct { | ||
16 | __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); | ||
17 | __uint(key_size, sizeof(u32)); | ||
18 | __uint(value_size, sizeof(u32)); | ||
19 | __uint(max_entries, 1); | ||
20 | } cgroup_map SEC(".maps"); | ||
21 | |||
22 | struct { | ||
23 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
24 | __type(key, u32); | ||
25 | __type(value, u64); | ||
26 | __uint(max_entries, 1); | ||
27 | } perf_map SEC(".maps"); | ||
28 | |||
29 | /* Writes the last PID that called sync to a map at index 0 */ | ||
30 | SEC("kprobe/" SYSCALL(sys_sync)) | ||
31 | int bpf_prog1(struct pt_regs *ctx) | ||
32 | { | ||
33 | u64 pid = bpf_get_current_pid_tgid(); | ||
34 | int idx = 0; | ||
35 | |||
36 | if (!bpf_current_task_under_cgroup(&cgroup_map, 0)) | ||
37 | return 0; | ||
38 | |||
39 | bpf_map_update_elem(&perf_map, &idx, &pid, BPF_ANY); | ||
40 | return 0; | ||
41 | } | ||
42 | |||
43 | char _license[] SEC("license") = "GPL"; | ||
44 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c new file mode 100644 index 000000000..ac251a417 --- /dev/null +++ b/samples/bpf/test_current_task_under_cgroup_user.c | |||
@@ -0,0 +1,113 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me> | ||
3 | */ | ||
4 | |||
5 | #define _GNU_SOURCE | ||
6 | #include <stdio.h> | ||
7 | #include <unistd.h> | ||
8 | #include <bpf/bpf.h> | ||
9 | #include <bpf/libbpf.h> | ||
10 | #include "cgroup_helpers.h" | ||
11 | |||
12 | #define CGROUP_PATH "/my-cgroup" | ||
13 | |||
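| /* Loads the companion _kern.o object, attaches its kprobe to sys_sync, and | ||
| * verifies that the PID map is updated only while this process is inside | ||
| * CGROUP_PATH. | ||
| */ | ||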
14 | int main(int argc, char **argv) | ||
15 | { | ||
16 | pid_t remote_pid, local_pid = getpid(); | ||
17 | struct bpf_link *link = NULL; | ||
18 | struct bpf_program *prog; | ||
19 | int cg2, idx = 0, rc = 1; | ||
20 | struct bpf_object *obj; | ||
21 | char filename[256]; | ||
22 | int map_fd[2]; | ||
23 | |||
24 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
25 | obj = bpf_object__open_file(filename, NULL); | ||
26 | if (libbpf_get_error(obj)) { | ||
27 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
28 | return 0; | ||
29 | } | ||
30 | |||
31 | prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); | ||
32 | if (!prog) { | ||
33 | printf("finding a prog in obj file failed\n"); | ||
34 | goto cleanup; | ||
35 | } | ||
36 | |||
37 | /* load BPF program */ | ||
38 | if (bpf_object__load(obj)) { | ||
39 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
40 | goto cleanup; | ||
41 | } | ||
42 | |||
43 | map_fd[0] = bpf_object__find_map_fd_by_name(obj, "cgroup_map"); | ||
44 | map_fd[1] = bpf_object__find_map_fd_by_name(obj, "perf_map"); | ||
45 | if (map_fd[0] < 0 || map_fd[1] < 0) { | ||
46 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
47 | goto cleanup; | ||
48 | } | ||
49 | |||
50 | link = bpf_program__attach(prog); | ||
51 | if (libbpf_get_error(link)) { | ||
52 | fprintf(stderr, "ERROR: bpf_program__attach failed\n"); | ||
53 | link = NULL; | ||
54 | goto cleanup; | ||
55 | } | ||
56 | |||
57 | if (setup_cgroup_environment()) | ||
58 | goto err; | ||
59 | |||
60 | cg2 = create_and_get_cgroup(CGROUP_PATH); | ||
61 | |||
62 | if (cg2 < 0) | ||
63 | goto err; | ||
64 | |||
65 | if (bpf_map_update_elem(map_fd[0], &idx, &cg2, BPF_ANY)) { | ||
66 | log_err("Adding target cgroup to map"); | ||
67 | goto err; | ||
68 | } | ||
69 | |||
70 | if (join_cgroup(CGROUP_PATH)) | ||
71 | goto err; | ||
72 | |||
73 | /* | ||
74 | * The installed helper program catches the sync call, and should | ||
75 | * write the calling PID to the map. | ||
76 | */ | ||
77 | |||
78 | sync(); | ||
79 | bpf_map_lookup_elem(map_fd[1], &idx, &remote_pid); | ||
80 | |||
81 | if (local_pid != remote_pid) { | ||
82 | fprintf(stderr, | ||
83 | "BPF Helper didn't write correct PID to map, but: %d\n", | ||
84 | remote_pid); | ||
85 | goto err; | ||
86 | } | ||
87 | |||
88 | /* Verify the negative scenario; leave the cgroup */ | ||
89 | if (join_cgroup("/")) | ||
90 | goto err; | ||
91 | |||
92 | remote_pid = 0; | ||
93 | bpf_map_update_elem(map_fd[1], &idx, &remote_pid, BPF_ANY); | ||
94 | |||
95 | sync(); | ||
96 | bpf_map_lookup_elem(map_fd[1], &idx, &remote_pid); | ||
97 | |||
98 | if (local_pid == remote_pid) { | ||
99 | fprintf(stderr, "BPF cgroup negative test did not work\n"); | ||
100 | goto err; | ||
101 | } | ||
102 | |||
103 | rc = 0; | ||
104 | |||
105 | err: | ||
106 | close(cg2); | ||
107 | cleanup_cgroup_environment(); | ||
108 | |||
109 | cleanup: | ||
110 | bpf_link__destroy(link); | ||
111 | bpf_object__close(obj); | ||
112 | return rc; | ||
113 | } | ||
diff --git a/samples/bpf/test_ipip.sh b/samples/bpf/test_ipip.sh new file mode 100755 index 000000000..9e507c305 --- /dev/null +++ b/samples/bpf/test_ipip.sh | |||
@@ -0,0 +1,179 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | |||
4 | function config_device { | ||
5 | ip netns add at_ns0 | ||
6 | ip netns add at_ns1 | ||
7 | ip netns add at_ns2 | ||
8 | ip link add veth0 type veth peer name veth0b | ||
9 | ip link add veth1 type veth peer name veth1b | ||
10 | ip link add veth2 type veth peer name veth2b | ||
11 | ip link set veth0b up | ||
12 | ip link set veth1b up | ||
13 | ip link set veth2b up | ||
14 | ip link set dev veth0b mtu 1500 | ||
15 | ip link set dev veth1b mtu 1500 | ||
16 | ip link set dev veth2b mtu 1500 | ||
17 | ip link set veth0 netns at_ns0 | ||
18 | ip link set veth1 netns at_ns1 | ||
19 | ip link set veth2 netns at_ns2 | ||
20 | ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 | ||
21 | ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad | ||
22 | ip netns exec at_ns0 ip link set dev veth0 up | ||
23 | ip netns exec at_ns1 ip addr add 172.16.1.101/24 dev veth1 | ||
24 | ip netns exec at_ns1 ip addr add 2401:db00::2/64 dev veth1 nodad | ||
25 | ip netns exec at_ns1 ip link set dev veth1 up | ||
26 | ip netns exec at_ns2 ip addr add 172.16.1.200/24 dev veth2 | ||
27 | ip netns exec at_ns2 ip addr add 2401:db00::3/64 dev veth2 nodad | ||
28 | ip netns exec at_ns2 ip link set dev veth2 up | ||
29 | ip link add br0 type bridge | ||
30 | ip link set br0 up | ||
31 | ip link set dev br0 mtu 1500 | ||
32 | ip link set veth0b master br0 | ||
33 | ip link set veth1b master br0 | ||
34 | ip link set veth2b master br0 | ||
35 | } | ||
36 | |||
37 | function add_ipip_tunnel { | ||
38 | ip netns exec at_ns0 \ | ||
39 | ip link add dev $DEV_NS type ipip local 172.16.1.100 remote 172.16.1.200 | ||
40 | ip netns exec at_ns0 ip link set dev $DEV_NS up | ||
41 | ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 | ||
42 | ip netns exec at_ns1 \ | ||
43 | ip link add dev $DEV_NS type ipip local 172.16.1.101 remote 172.16.1.200 | ||
44 | ip netns exec at_ns1 ip link set dev $DEV_NS up | ||
45 | # same inner IP address in at_ns0 and at_ns1 | ||
46 | ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24 | ||
47 | |||
48 | ip netns exec at_ns2 ip link add dev $DEV type ipip external | ||
49 | ip netns exec at_ns2 ip link set dev $DEV up | ||
50 | ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24 | ||
51 | } | ||
52 | |||
53 | function add_ipip6_tunnel { | ||
54 | ip netns exec at_ns0 \ | ||
55 | ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::1/64 remote 2401:db00::3/64 | ||
56 | ip netns exec at_ns0 ip link set dev $DEV_NS up | ||
57 | ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 | ||
58 | ip netns exec at_ns1 \ | ||
59 | ip link add dev $DEV_NS type ip6tnl mode ipip6 local 2401:db00::2/64 remote 2401:db00::3/64 | ||
60 | ip netns exec at_ns1 ip link set dev $DEV_NS up | ||
61 | # same inner IP address in at_ns0 and at_ns1 | ||
62 | ip netns exec at_ns1 ip addr add dev $DEV_NS 10.1.1.100/24 | ||
63 | |||
64 | ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ipip6 external | ||
65 | ip netns exec at_ns2 ip link set dev $DEV up | ||
66 | ip netns exec at_ns2 ip addr add dev $DEV 10.1.1.200/24 | ||
67 | } | ||
68 | |||
69 | function add_ip6ip6_tunnel { | ||
70 | ip netns exec at_ns0 \ | ||
71 | ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::1/64 remote 2401:db00::3/64 | ||
72 | ip netns exec at_ns0 ip link set dev $DEV_NS up | ||
73 | ip netns exec at_ns0 ip addr add dev $DEV_NS 2601:646::1/64 | ||
74 | ip netns exec at_ns1 \ | ||
75 | ip link add dev $DEV_NS type ip6tnl mode ip6ip6 local 2401:db00::2/64 remote 2401:db00::3/64 | ||
76 | ip netns exec at_ns1 ip link set dev $DEV_NS up | ||
77 | # same inner IP address in at_ns0 and at_ns1 | ||
78 | ip netns exec at_ns1 ip addr add dev $DEV_NS 2601:646::1/64 | ||
79 | |||
80 | ip netns exec at_ns2 ip link add dev $DEV type ip6tnl mode ip6ip6 external | ||
81 | ip netns exec at_ns2 ip link set dev $DEV up | ||
82 | ip netns exec at_ns2 ip addr add dev $DEV 2601:646::2/64 | ||
83 | } | ||
84 | |||
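| # Attach the $2/$3 set/get tunnel programs from tcbpf2_kern.o to the egress/ingress hooks of $1 inside at_ns2 | ||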
85 | function attach_bpf { | ||
86 | DEV=$1 | ||
87 | SET_TUNNEL=$2 | ||
88 | GET_TUNNEL=$3 | ||
89 | ip netns exec at_ns2 tc qdisc add dev $DEV clsact | ||
90 | ip netns exec at_ns2 tc filter add dev $DEV egress bpf da obj tcbpf2_kern.o sec $SET_TUNNEL | ||
91 | ip netns exec at_ns2 tc filter add dev $DEV ingress bpf da obj tcbpf2_kern.o sec $GET_TUNNEL | ||
92 | } | ||
93 | |||
94 | function test_ipip { | ||
95 | DEV_NS=ipip_std | ||
96 | DEV=ipip_bpf | ||
97 | config_device | ||
98 | # tcpdump -nei br0 & | ||
99 | cat /sys/kernel/debug/tracing/trace_pipe & | ||
100 | |||
101 | add_ipip_tunnel | ||
102 | attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel | ||
103 | |||
104 | ip netns exec at_ns0 ping -c 1 10.1.1.200 | ||
105 | ip netns exec at_ns2 ping -c 1 10.1.1.100 | ||
106 | ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null | ||
107 | ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null | ||
108 | sleep 0.2 | ||
109 | # tcp check _same_ IP over different tunnels | ||
110 | ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200 | ||
111 | ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201 | ||
112 | cleanup | ||
113 | } | ||
114 | |||
115 | # IPv4 over IPv6 tunnel | ||
116 | function test_ipip6 { | ||
117 | DEV_NS=ipip_std | ||
118 | DEV=ipip_bpf | ||
119 | config_device | ||
120 | # tcpdump -nei br0 & | ||
121 | cat /sys/kernel/debug/tracing/trace_pipe & | ||
122 | |||
123 | add_ipip6_tunnel | ||
124 | attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel | ||
125 | |||
126 | ip netns exec at_ns0 ping -c 1 10.1.1.200 | ||
127 | ip netns exec at_ns2 ping -c 1 10.1.1.100 | ||
128 | ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null | ||
129 | ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null | ||
130 | sleep 0.2 | ||
131 | # tcp check _same_ IP over different tunnels | ||
132 | ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200 | ||
133 | ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201 | ||
134 | cleanup | ||
135 | } | ||
136 | |||
137 | # IPv6 over IPv6 tunnel | ||
138 | function test_ip6ip6 { | ||
139 | DEV_NS=ipip_std | ||
140 | DEV=ipip_bpf | ||
141 | config_device | ||
142 | # tcpdump -nei br0 & | ||
143 | cat /sys/kernel/debug/tracing/trace_pipe & | ||
144 | |||
145 | add_ip6ip6_tunnel | ||
146 | attach_bpf $DEV ip6ip6_set_tunnel ip6ip6_get_tunnel | ||
147 | |||
148 | ip netns exec at_ns0 ping -6 -c 1 2601:646::2 | ||
149 | ip netns exec at_ns2 ping -6 -c 1 2601:646::1 | ||
150 | ip netns exec at_ns0 iperf -6sD -p 5200 > /dev/null | ||
151 | ip netns exec at_ns1 iperf -6sD -p 5201 > /dev/null | ||
152 | sleep 0.2 | ||
153 | # tcp check _same_ IP over different tunnels | ||
154 | ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5200 | ||
155 | ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5201 | ||
156 | cleanup | ||
157 | } | ||
158 | |||
159 | function cleanup { | ||
160 | set +ex | ||
161 | pkill iperf | ||
162 | ip netns delete at_ns0 | ||
163 | ip netns delete at_ns1 | ||
164 | ip netns delete at_ns2 | ||
165 | ip link del veth0 | ||
166 | ip link del veth1 | ||
167 | ip link del veth2 | ||
168 | ip link del br0 | ||
169 | pkill tcpdump | ||
170 | pkill cat | ||
171 | set -ex | ||
172 | } | ||
173 | |||
174 | cleanup | ||
175 | echo "Testing IP tunnels..." | ||
176 | test_ipip | ||
177 | test_ipip6 | ||
178 | test_ip6ip6 | ||
179 | echo "*** PASS ***" | ||
diff --git a/samples/bpf/test_lru_dist.c b/samples/bpf/test_lru_dist.c new file mode 100644 index 000000000..b313dba41 --- /dev/null +++ b/samples/bpf/test_lru_dist.c | |||
@@ -0,0 +1,540 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Copyright (c) 2016 Facebook | ||
4 | */ | ||
5 | #define _GNU_SOURCE | ||
6 | #include <linux/types.h> | ||
7 | #include <stdio.h> | ||
8 | #include <unistd.h> | ||
9 | #include <linux/bpf.h> | ||
10 | #include <errno.h> | ||
11 | #include <string.h> | ||
12 | #include <assert.h> | ||
13 | #include <sched.h> | ||
14 | #include <sys/wait.h> | ||
15 | #include <sys/stat.h> | ||
16 | #include <sys/resource.h> | ||
17 | #include <fcntl.h> | ||
18 | #include <stdlib.h> | ||
19 | #include <time.h> | ||
20 | |||
21 | #include <bpf/bpf.h> | ||
22 | #include "bpf_util.h" | ||
23 | |||
24 | #define min(a, b) ((a) < (b) ? (a) : (b)) | ||
25 | #ifndef offsetof | ||
26 | # define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) | ||
27 | #endif | ||
28 | #define container_of(ptr, type, member) ({ \ | ||
29 | const typeof( ((type *)0)->member ) *__mptr = (ptr); \ | ||
30 | (type *)( (char *)__mptr - offsetof(type,member) );}) | ||
31 | |||
32 | static int nr_cpus; | ||
33 | static unsigned long long *dist_keys; | ||
34 | static unsigned int dist_key_counts; | ||
35 | |||
36 | struct list_head { | ||
37 | struct list_head *next, *prev; | ||
38 | }; | ||
39 | |||
40 | static inline void INIT_LIST_HEAD(struct list_head *list) | ||
41 | { | ||
42 | list->next = list; | ||
43 | list->prev = list; | ||
44 | } | ||
45 | |||
46 | static inline int list_empty(const struct list_head *head) | ||
47 | { | ||
48 | return head->next == head; | ||
49 | } | ||
50 | |||
51 | static inline void __list_add(struct list_head *new, | ||
52 | struct list_head *prev, | ||
53 | struct list_head *next) | ||
54 | { | ||
55 | next->prev = new; | ||
56 | new->next = next; | ||
57 | new->prev = prev; | ||
58 | prev->next = new; | ||
59 | } | ||
60 | |||
61 | static inline void list_add(struct list_head *new, struct list_head *head) | ||
62 | { | ||
63 | __list_add(new, head, head->next); | ||
64 | } | ||
65 | |||
66 | static inline void __list_del(struct list_head *prev, struct list_head *next) | ||
67 | { | ||
68 | next->prev = prev; | ||
69 | prev->next = next; | ||
70 | } | ||
71 | |||
72 | static inline void __list_del_entry(struct list_head *entry) | ||
73 | { | ||
74 | __list_del(entry->prev, entry->next); | ||
75 | } | ||
76 | |||
77 | static inline void list_move(struct list_head *list, struct list_head *head) | ||
78 | { | ||
79 | __list_del_entry(list); | ||
80 | list_add(list, head); | ||
81 | } | ||
82 | |||
83 | #define list_entry(ptr, type, member) \ | ||
84 | container_of(ptr, type, member) | ||
85 | |||
86 | #define list_last_entry(ptr, type, member) \ | ||
87 | list_entry((ptr)->prev, type, member) | ||
88 | |||
89 | struct pfect_lru_node { | ||
90 | struct list_head list; | ||
91 | unsigned long long key; | ||
92 | }; | ||
93 | |||
94 | struct pfect_lru { | ||
95 | struct list_head list; | ||
96 | struct pfect_lru_node *free_nodes; | ||
97 | unsigned int cur_size; | ||
98 | unsigned int lru_size; | ||
99 | unsigned int nr_unique; | ||
100 | unsigned int nr_misses; | ||
101 | unsigned int total; | ||
102 | int map_fd; | ||
103 | }; | ||
104 | |||
105 | static void pfect_lru_init(struct pfect_lru *lru, unsigned int lru_size, | ||
106 | unsigned int nr_possible_elems) | ||
107 | { | ||
108 | lru->map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, | ||
109 | sizeof(unsigned long long), | ||
110 | sizeof(struct pfect_lru_node *), | ||
111 | nr_possible_elems, 0); | ||
112 | assert(lru->map_fd != -1); | ||
113 | |||
114 | lru->free_nodes = malloc(lru_size * sizeof(struct pfect_lru_node)); | ||
115 | assert(lru->free_nodes); | ||
116 | |||
117 | INIT_LIST_HEAD(&lru->list); | ||
118 | lru->cur_size = 0; | ||
119 | lru->lru_size = lru_size; | ||
120 | lru->nr_unique = lru->nr_misses = lru->total = 0; | ||
121 | } | ||
122 | |||
123 | static void pfect_lru_destroy(struct pfect_lru *lru) | ||
124 | { | ||
125 | close(lru->map_fd); | ||
126 | free(lru->free_nodes); | ||
127 | } | ||
128 | |||
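| /* Returns non-zero if the key has been seen before. A miss is counted | ||
| * whenever the key is not currently resident in the perfect LRU. | ||
| */ | ||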
129 | static int pfect_lru_lookup_or_insert(struct pfect_lru *lru, | ||
130 | unsigned long long key) | ||
131 | { | ||
132 | struct pfect_lru_node *node = NULL; | ||
133 | int seen = 0; | ||
134 | |||
135 | lru->total++; | ||
136 | if (!bpf_map_lookup_elem(lru->map_fd, &key, &node)) { | ||
137 | if (node) { | ||
138 | list_move(&node->list, &lru->list); | ||
139 | return 1; | ||
140 | } | ||
141 | seen = 1; | ||
142 | } | ||
143 | |||
144 | if (lru->cur_size < lru->lru_size) { | ||
145 | node = &lru->free_nodes[lru->cur_size++]; | ||
146 | INIT_LIST_HEAD(&node->list); | ||
147 | } else { | ||
148 | struct pfect_lru_node *null_node = NULL; | ||
149 | |||
150 | node = list_last_entry(&lru->list, | ||
151 | struct pfect_lru_node, | ||
152 | list); | ||
153 | bpf_map_update_elem(lru->map_fd, &node->key, &null_node, BPF_EXIST); | ||
154 | } | ||
155 | |||
156 | node->key = key; | ||
157 | list_move(&node->list, &lru->list); | ||
158 | |||
159 | lru->nr_misses++; | ||
160 | if (seen) { | ||
161 | assert(!bpf_map_update_elem(lru->map_fd, &key, &node, BPF_EXIST)); | ||
162 | } else { | ||
163 | lru->nr_unique++; | ||
164 | assert(!bpf_map_update_elem(lru->map_fd, &key, &node, BPF_NOEXIST)); | ||
165 | } | ||
166 | |||
167 | return seen; | ||
168 | } | ||
169 | |||
170 | static unsigned int read_keys(const char *dist_file, | ||
171 | unsigned long long **keys) | ||
172 | { | ||
173 | struct stat fst; | ||
174 | unsigned long long *retkeys; | ||
175 | unsigned int counts = 0; | ||
176 | int dist_fd; | ||
177 | char *b, *l; | ||
178 | int i; | ||
179 | |||
180 | dist_fd = open(dist_file, 0); | ||
181 | assert(dist_fd != -1); | ||
182 | |||
183 | assert(fstat(dist_fd, &fst) == 0); | ||
184 | b = malloc(fst.st_size); | ||
185 | assert(b); | ||
186 | |||
187 | assert(read(dist_fd, b, fst.st_size) == fst.st_size); | ||
188 | close(dist_fd); | ||
189 | for (i = 0; i < fst.st_size; i++) { | ||
190 | if (b[i] == '\n') | ||
191 | counts++; | ||
192 | } | ||
193 | counts++; /* in case the last line has no \n */ | ||
194 | |||
195 | retkeys = malloc(counts * sizeof(unsigned long long)); | ||
196 | assert(retkeys); | ||
197 | |||
198 | counts = 0; | ||
199 | for (l = strtok(b, "\n"); l; l = strtok(NULL, "\n")) | ||
200 | retkeys[counts++] = strtoull(l, NULL, 10); | ||
201 | free(b); | ||
202 | |||
203 | *keys = retkeys; | ||
204 | |||
205 | return counts; | ||
206 | } | ||
207 | |||
208 | static int create_map(int map_type, int map_flags, unsigned int size) | ||
209 | { | ||
210 | int map_fd; | ||
211 | |||
212 | map_fd = bpf_create_map(map_type, sizeof(unsigned long long), | ||
213 | sizeof(unsigned long long), size, map_flags); | ||
214 | |||
215 | if (map_fd == -1) | ||
216 | perror("bpf_create_map"); | ||
217 | |||
218 | return map_fd; | ||
219 | } | ||
220 | |||
221 | static int sched_next_online(int pid, int next_to_try) | ||
222 | { | ||
223 | cpu_set_t cpuset; | ||
224 | |||
225 | if (next_to_try == nr_cpus) | ||
226 | return -1; | ||
227 | |||
228 | while (next_to_try < nr_cpus) { | ||
229 | CPU_ZERO(&cpuset); | ||
230 | CPU_SET(next_to_try++, &cpuset); | ||
231 | if (!sched_setaffinity(pid, sizeof(cpuset), &cpuset)) | ||
232 | break; | ||
233 | } | ||
234 | |||
235 | return next_to_try; | ||
236 | } | ||
237 | |||
238 | static void run_parallel(unsigned int tasks, void (*fn)(int i, void *data), | ||
239 | void *data) | ||
240 | { | ||
241 | int next_sched_cpu = 0; | ||
242 | pid_t pid[tasks]; | ||
243 | int i; | ||
244 | |||
245 | for (i = 0; i < tasks; i++) { | ||
246 | pid[i] = fork(); | ||
247 | if (pid[i] == 0) { | ||
248 | next_sched_cpu = sched_next_online(0, next_sched_cpu); | ||
249 | fn(i, data); | ||
250 | exit(0); | ||
251 | } else if (pid[i] == -1) { | ||
252 | printf("couldn't spawn #%d process\n", i); | ||
253 | exit(1); | ||
254 | } | ||
255 | /* It is mostly redundant and just allows the parent | ||
256 | * process to update next_sched_cpu for the next child | ||
257 | * process | ||
258 | */ | ||
259 | next_sched_cpu = sched_next_online(pid[i], next_sched_cpu); | ||
260 | } | ||
261 | for (i = 0; i < tasks; i++) { | ||
262 | int status; | ||
263 | |||
264 | assert(waitpid(pid[i], &status, 0) == pid[i]); | ||
265 | assert(status == 0); | ||
266 | } | ||
267 | } | ||
268 | |||
269 | static void do_test_lru_dist(int task, void *data) | ||
270 | { | ||
271 | unsigned int nr_misses = 0; | ||
272 | struct pfect_lru pfect_lru; | ||
273 | unsigned long long key, value = 1234; | ||
274 | unsigned int i; | ||
275 | |||
276 | unsigned int lru_map_fd = ((unsigned int *)data)[0]; | ||
277 | unsigned int lru_size = ((unsigned int *)data)[1]; | ||
278 | unsigned long long key_offset = task * dist_key_counts; | ||
279 | |||
280 | pfect_lru_init(&pfect_lru, lru_size, dist_key_counts); | ||
281 | |||
282 | for (i = 0; i < dist_key_counts; i++) { | ||
283 | key = dist_keys[i] + key_offset; | ||
284 | |||
285 | pfect_lru_lookup_or_insert(&pfect_lru, key); | ||
286 | |||
287 | if (!bpf_map_lookup_elem(lru_map_fd, &key, &value)) | ||
288 | continue; | ||
289 | |||
290 | if (bpf_map_update_elem(lru_map_fd, &key, &value, BPF_NOEXIST)) { | ||
291 | printf("bpf_map_update_elem(lru_map_fd, %llu): errno:%d\n", | ||
292 | key, errno); | ||
293 | assert(0); | ||
294 | } | ||
295 | |||
296 | nr_misses++; | ||
297 | } | ||
298 | |||
299 | printf(" task:%d BPF LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n", | ||
300 | task, pfect_lru.nr_unique, dist_key_counts, nr_misses, | ||
301 | dist_key_counts); | ||
302 | printf(" task:%d Perfect LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n", | ||
303 | task, pfect_lru.nr_unique, pfect_lru.total, | ||
304 | pfect_lru.nr_misses, pfect_lru.total); | ||
305 | |||
306 | pfect_lru_destroy(&pfect_lru); | ||
307 | close(lru_map_fd); | ||
308 | } | ||
309 | |||
310 | static void test_parallel_lru_dist(int map_type, int map_flags, | ||
311 | int nr_tasks, unsigned int lru_size) | ||
312 | { | ||
313 | int child_data[2]; | ||
314 | int lru_map_fd; | ||
315 | |||
316 | printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type, | ||
317 | map_flags); | ||
318 | |||
319 | if (map_flags & BPF_F_NO_COMMON_LRU) | ||
320 | lru_map_fd = create_map(map_type, map_flags, | ||
321 | nr_cpus * lru_size); | ||
322 | else | ||
323 | lru_map_fd = create_map(map_type, map_flags, | ||
324 | nr_tasks * lru_size); | ||
325 | assert(lru_map_fd != -1); | ||
326 | |||
327 | child_data[0] = lru_map_fd; | ||
328 | child_data[1] = lru_size; | ||
329 | |||
330 | run_parallel(nr_tasks, do_test_lru_dist, child_data); | ||
331 | |||
332 | close(lru_map_fd); | ||
333 | } | ||
334 | |||
335 | static void test_lru_loss0(int map_type, int map_flags) | ||
336 | { | ||
337 | unsigned long long key, value[nr_cpus]; | ||
338 | unsigned int old_unused_losses = 0; | ||
339 | unsigned int new_unused_losses = 0; | ||
340 | unsigned int used_losses = 0; | ||
341 | int map_fd; | ||
342 | |||
343 | printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type, | ||
344 | map_flags); | ||
345 | |||
346 | assert(sched_next_online(0, 0) != -1); | ||
347 | |||
348 | if (map_flags & BPF_F_NO_COMMON_LRU) | ||
349 | map_fd = create_map(map_type, map_flags, 900 * nr_cpus); | ||
350 | else | ||
351 | map_fd = create_map(map_type, map_flags, 900); | ||
352 | |||
353 | assert(map_fd != -1); | ||
354 | |||
355 | value[0] = 1234; | ||
356 | |||
357 | for (key = 1; key <= 1000; key++) { | ||
358 | int start_key, end_key; | ||
359 | |||
360 | assert(bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST) == 0); | ||
361 | |||
362 | start_key = 101; | ||
363 | end_key = min(key, 900); | ||
364 | |||
365 | while (start_key <= end_key) { | ||
366 | bpf_map_lookup_elem(map_fd, &start_key, value); | ||
367 | start_key++; | ||
368 | } | ||
369 | } | ||
370 | |||
371 | for (key = 1; key <= 1000; key++) { | ||
372 | if (bpf_map_lookup_elem(map_fd, &key, value)) { | ||
373 | if (key <= 100) | ||
374 | old_unused_losses++; | ||
375 | else if (key <= 900) | ||
376 | used_losses++; | ||
377 | else | ||
378 | new_unused_losses++; | ||
379 | } | ||
380 | } | ||
381 | |||
382 | close(map_fd); | ||
383 | |||
384 | printf("older-elem-losses:%d(/100) active-elem-losses:%d(/800) " | ||
385 | "newer-elem-losses:%d(/100)\n", | ||
386 | old_unused_losses, used_losses, new_unused_losses); | ||
387 | } | ||
388 | |||
389 | static void test_lru_loss1(int map_type, int map_flags) | ||
390 | { | ||
391 | unsigned long long key, value[nr_cpus]; | ||
392 | int map_fd; | ||
393 | unsigned int nr_losses = 0; | ||
394 | |||
395 | printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type, | ||
396 | map_flags); | ||
397 | |||
398 | assert(sched_next_online(0, 0) != -1); | ||
399 | |||
400 | if (map_flags & BPF_F_NO_COMMON_LRU) | ||
401 | map_fd = create_map(map_type, map_flags, 1000 * nr_cpus); | ||
402 | else | ||
403 | map_fd = create_map(map_type, map_flags, 1000); | ||
404 | |||
405 | assert(map_fd != -1); | ||
406 | |||
407 | value[0] = 1234; | ||
408 | |||
409 | for (key = 1; key <= 1000; key++) | ||
410 | assert(!bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST)); | ||
411 | |||
412 | for (key = 1; key <= 1000; key++) { | ||
413 | if (bpf_map_lookup_elem(map_fd, &key, value)) | ||
414 | nr_losses++; | ||
415 | } | ||
416 | |||
417 | close(map_fd); | ||
418 | |||
419 | printf("nr_losses:%d(/1000)\n", nr_losses); | ||
420 | } | ||
421 | |||
422 | static void do_test_parallel_lru_loss(int task, void *data) | ||
423 | { | ||
424 | const unsigned int nr_stable_elems = 1000; | ||
425 | const unsigned int nr_repeats = 100000; | ||
426 | |||
427 | int map_fd = *(int *)data; | ||
428 | unsigned long long stable_base; | ||
429 | unsigned long long key, value[nr_cpus]; | ||
430 | unsigned long long next_ins_key; | ||
431 | unsigned int nr_losses = 0; | ||
432 | unsigned int i; | ||
433 | |||
434 | stable_base = task * nr_repeats * 2 + 1; | ||
435 | next_ins_key = stable_base; | ||
436 | value[0] = 1234; | ||
437 | for (i = 0; i < nr_stable_elems; i++) { | ||
438 | assert(bpf_map_update_elem(map_fd, &next_ins_key, value, | ||
439 | BPF_NOEXIST) == 0); | ||
440 | next_ins_key++; | ||
441 | } | ||
442 | |||
443 | for (i = 0; i < nr_repeats; i++) { | ||
444 | int rn; | ||
445 | |||
446 | rn = rand(); | ||
447 | |||
448 | if (rn % 10) { | ||
449 | key = rn % nr_stable_elems + stable_base; | ||
450 | bpf_map_lookup_elem(map_fd, &key, value); | ||
451 | } else { | ||
452 | bpf_map_update_elem(map_fd, &next_ins_key, value, | ||
453 | BPF_NOEXIST); | ||
454 | next_ins_key++; | ||
455 | } | ||
456 | } | ||
457 | |||
458 | key = stable_base; | ||
459 | for (i = 0; i < nr_stable_elems; i++) { | ||
460 | if (bpf_map_lookup_elem(map_fd, &key, value)) | ||
461 | nr_losses++; | ||
462 | key++; | ||
463 | } | ||
464 | |||
465 | printf(" task:%d nr_losses:%u\n", task, nr_losses); | ||
466 | } | ||
467 | |||
468 | static void test_parallel_lru_loss(int map_type, int map_flags, int nr_tasks) | ||
469 | { | ||
470 | int map_fd; | ||
471 | |||
472 | printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type, | ||
473 | map_flags); | ||
474 | |||
475 | /* Give 20% more than the active working set */ | ||
476 | if (map_flags & BPF_F_NO_COMMON_LRU) | ||
477 | map_fd = create_map(map_type, map_flags, | ||
478 | nr_cpus * (1000 + 200)); | ||
479 | else | ||
480 | map_fd = create_map(map_type, map_flags, | ||
481 | nr_tasks * (1000 + 200)); | ||
482 | |||
483 | assert(map_fd != -1); | ||
484 | |||
485 | run_parallel(nr_tasks, do_test_parallel_lru_loss, &map_fd); | ||
486 | |||
487 | close(map_fd); | ||
488 | } | ||
489 | |||
490 | int main(int argc, char **argv) | ||
491 | { | ||
492 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
493 | int map_flags[] = {0, BPF_F_NO_COMMON_LRU}; | ||
494 | const char *dist_file; | ||
495 | int nr_tasks = 1; | ||
496 | int lru_size; | ||
497 | int f; | ||
498 | |||
499 | if (argc < 4) { | ||
500 | printf("Usage: %s <dist-file> <lru-size> <nr-tasks>\n", | ||
501 | argv[0]); | ||
502 | return -1; | ||
503 | } | ||
504 | |||
505 | dist_file = argv[1]; | ||
506 | lru_size = atoi(argv[2]); | ||
507 | nr_tasks = atoi(argv[3]); | ||
508 | |||
509 | setbuf(stdout, NULL); | ||
510 | |||
511 | assert(!setrlimit(RLIMIT_MEMLOCK, &r)); | ||
512 | |||
513 | srand(time(NULL)); | ||
514 | |||
515 | nr_cpus = bpf_num_possible_cpus(); | ||
516 | assert(nr_cpus != -1); | ||
517 | printf("nr_cpus:%d\n\n", nr_cpus); | ||
518 | |||
519 | nr_tasks = min(nr_tasks, nr_cpus); | ||
520 | |||
521 | dist_key_counts = read_keys(dist_file, &dist_keys); | ||
522 | if (!dist_key_counts) { | ||
523 | printf("%s has no key\n", dist_file); | ||
524 | return -1; | ||
525 | } | ||
526 | |||
527 | for (f = 0; f < sizeof(map_flags) / sizeof(*map_flags); f++) { | ||
528 | test_lru_loss0(BPF_MAP_TYPE_LRU_HASH, map_flags[f]); | ||
529 | test_lru_loss1(BPF_MAP_TYPE_LRU_HASH, map_flags[f]); | ||
530 | test_parallel_lru_loss(BPF_MAP_TYPE_LRU_HASH, map_flags[f], | ||
531 | nr_tasks); | ||
532 | test_parallel_lru_dist(BPF_MAP_TYPE_LRU_HASH, map_flags[f], | ||
533 | nr_tasks, lru_size); | ||
534 | printf("\n"); | ||
535 | } | ||
536 | |||
537 | free(dist_keys); | ||
538 | |||
539 | return 0; | ||
540 | } | ||
diff --git a/samples/bpf/test_lwt_bpf.c b/samples/bpf/test_lwt_bpf.c new file mode 100644 index 000000000..1b568575a --- /dev/null +++ b/samples/bpf/test_lwt_bpf.c | |||
@@ -0,0 +1,253 @@ | |||
1 | /* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | * | ||
7 | * This program is distributed in the hope that it will be useful, but | ||
8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
10 | * General Public License for more details. | ||
11 | */ | ||
12 | |||
13 | #include <stdint.h> | ||
14 | #include <stddef.h> | ||
15 | #include <linux/bpf.h> | ||
16 | #include <linux/ip.h> | ||
17 | #include <linux/in.h> | ||
18 | #include <linux/in6.h> | ||
19 | #include <linux/tcp.h> | ||
20 | #include <linux/udp.h> | ||
21 | #include <linux/icmpv6.h> | ||
22 | #include <linux/if_ether.h> | ||
23 | #include <bpf/bpf_helpers.h> | ||
24 | #include <string.h> | ||
25 | |||
26 | # define printk(fmt, ...) \ | ||
27 | ({ \ | ||
28 | char ____fmt[] = fmt; \ | ||
29 | bpf_trace_printk(____fmt, sizeof(____fmt), \ | ||
30 | ##__VA_ARGS__); \ | ||
31 | }) | ||
32 | |||
33 | #define CB_MAGIC 1234 | ||
34 | |||
35 | /* Test: Pass all packets through */ | ||
36 | SEC("nop") | ||
37 | int do_nop(struct __sk_buff *skb) | ||
38 | { | ||
39 | return BPF_OK; | ||
40 | } | ||
41 | |||
42 | /* Test: Verify context information can be accessed */ | ||
43 | SEC("test_ctx") | ||
44 | int do_test_ctx(struct __sk_buff *skb) | ||
45 | { | ||
46 | skb->cb[0] = CB_MAGIC; | ||
47 | printk("len %d hash %d protocol %d\n", skb->len, skb->hash, | ||
48 | skb->protocol); | ||
49 | printk("cb %d ingress_ifindex %d ifindex %d\n", skb->cb[0], | ||
50 | skb->ingress_ifindex, skb->ifindex); | ||
51 | |||
52 | return BPF_OK; | ||
53 | } | ||
54 | |||
55 | /* Test: Ensure skb->cb[] buffer is cleared */ | ||
56 | SEC("test_cb") | ||
57 | int do_test_cb(struct __sk_buff *skb) | ||
58 | { | ||
59 | printk("cb0: %x cb1: %x cb2: %x\n", skb->cb[0], skb->cb[1], | ||
60 | skb->cb[2]); | ||
61 | printk("cb3: %x cb4: %x\n", skb->cb[3], skb->cb[4]); | ||
62 | |||
63 | return BPF_OK; | ||
64 | } | ||
65 | |||
66 | /* Test: Verify skb data can be read */ | ||
67 | SEC("test_data") | ||
68 | int do_test_data(struct __sk_buff *skb) | ||
69 | { | ||
70 | void *data = (void *)(long)skb->data; | ||
71 | void *data_end = (void *)(long)skb->data_end; | ||
72 | struct iphdr *iph = data; | ||
73 | |||
74 | if (data + sizeof(*iph) > data_end) { | ||
75 | printk("packet truncated\n"); | ||
76 | return BPF_DROP; | ||
77 | } | ||
78 | |||
79 | printk("src: %x dst: %x\n", iph->saddr, iph->daddr); | ||
80 | |||
81 | return BPF_OK; | ||
82 | } | ||
83 | |||
84 | #define IP_CSUM_OFF offsetof(struct iphdr, check) | ||
85 | #define IP_DST_OFF offsetof(struct iphdr, daddr) | ||
86 | #define IP_SRC_OFF offsetof(struct iphdr, saddr) | ||
87 | #define IP_PROTO_OFF offsetof(struct iphdr, protocol) | ||
88 | #define TCP_CSUM_OFF offsetof(struct tcphdr, check) | ||
89 | #define UDP_CSUM_OFF offsetof(struct udphdr, check) | ||
90 | #define IS_PSEUDO 0x10 | ||
91 | |||
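| /* Rewrite the source or destination IPv4 address (rw_daddr selects which) | ||
| * and fix up the L3 and, where applicable, the L4 checksum. | ||
| */ | ||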
92 | static inline int rewrite(struct __sk_buff *skb, uint32_t old_ip, | ||
93 | uint32_t new_ip, int rw_daddr) | ||
94 | { | ||
95 | int ret, off = 0, flags = IS_PSEUDO; | ||
96 | uint8_t proto; | ||
97 | |||
98 | ret = bpf_skb_load_bytes(skb, IP_PROTO_OFF, &proto, 1); | ||
99 | if (ret < 0) { | ||
100 | printk("bpf_l4_csum_replace failed: %d\n", ret); | ||
101 | return BPF_DROP; | ||
102 | } | ||
103 | |||
104 | switch (proto) { | ||
105 | case IPPROTO_TCP: | ||
106 | off = TCP_CSUM_OFF; | ||
107 | break; | ||
108 | |||
109 | case IPPROTO_UDP: | ||
110 | off = UDP_CSUM_OFF; | ||
111 | flags |= BPF_F_MARK_MANGLED_0; | ||
112 | break; | ||
113 | |||
114 | case IPPROTO_ICMPV6: | ||
115 | off = offsetof(struct icmp6hdr, icmp6_cksum); | ||
116 | break; | ||
117 | } | ||
118 | |||
119 | if (off) { | ||
120 | ret = bpf_l4_csum_replace(skb, off, old_ip, new_ip, | ||
121 | flags | sizeof(new_ip)); | ||
122 | if (ret < 0) { | ||
123 | printk("bpf_l4_csum_replace failed: %d\n"); | ||
124 | return BPF_DROP; | ||
125 | } | ||
126 | } | ||
127 | |||
128 | ret = bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip)); | ||
129 | if (ret < 0) { | ||
130 | printk("bpf_l3_csum_replace failed: %d\n", ret); | ||
131 | return BPF_DROP; | ||
132 | } | ||
133 | |||
134 | if (rw_daddr) | ||
135 | ret = bpf_skb_store_bytes(skb, IP_DST_OFF, &new_ip, sizeof(new_ip), 0); | ||
136 | else | ||
137 | ret = bpf_skb_store_bytes(skb, IP_SRC_OFF, &new_ip, sizeof(new_ip), 0); | ||
138 | |||
139 | if (ret < 0) { | ||
140 | printk("bpf_skb_store_bytes() failed: %d\n", ret); | ||
141 | return BPF_DROP; | ||
142 | } | ||
143 | |||
144 | return BPF_OK; | ||
145 | } | ||
146 | |||
147 | /* Test: Verify skb data can be modified */ | ||
148 | SEC("test_rewrite") | ||
149 | int do_test_rewrite(struct __sk_buff *skb) | ||
150 | { | ||
151 | uint32_t old_ip, new_ip = 0x3fea8c0; | ||
152 | int ret; | ||
153 | |||
154 | ret = bpf_skb_load_bytes(skb, IP_DST_OFF, &old_ip, 4); | ||
155 | if (ret < 0) { | ||
156 | printk("bpf_skb_load_bytes failed: %d\n", ret); | ||
157 | return BPF_DROP; | ||
158 | } | ||
159 | |||
160 | if (old_ip == 0x2fea8c0) { | ||
161 | printk("out: rewriting from %x to %x\n", old_ip, new_ip); | ||
162 | return rewrite(skb, old_ip, new_ip, 1); | ||
163 | } | ||
164 | |||
165 | return BPF_OK; | ||
166 | } | ||
167 | |||
168 | static inline int __do_push_ll_and_redirect(struct __sk_buff *skb) | ||
169 | { | ||
170 | uint64_t smac = SRC_MAC, dmac = DST_MAC; | ||
171 | int ret, ifindex = DST_IFINDEX; | ||
172 | struct ethhdr ehdr; | ||
173 | |||
174 | ret = bpf_skb_change_head(skb, 14, 0); | ||
175 | if (ret < 0) { | ||
176 | printk("skb_change_head() failed: %d\n", ret); | ||
177 | } | ||
178 | |||
179 | ehdr.h_proto = __constant_htons(ETH_P_IP); | ||
180 | memcpy(&ehdr.h_source, &smac, 6); | ||
181 | memcpy(&ehdr.h_dest, &dmac, 6); | ||
182 | |||
183 | ret = bpf_skb_store_bytes(skb, 0, &ehdr, sizeof(ehdr), 0); | ||
184 | if (ret < 0) { | ||
185 | printk("skb_store_bytes() failed: %d\n", ret); | ||
186 | return BPF_DROP; | ||
187 | } | ||
188 | |||
189 | return bpf_redirect(ifindex, 0); | ||
190 | } | ||
191 | |||
192 | SEC("push_ll_and_redirect_silent") | ||
193 | int do_push_ll_and_redirect_silent(struct __sk_buff *skb) | ||
194 | { | ||
195 | return __do_push_ll_and_redirect(skb); | ||
196 | } | ||
197 | |||
198 | SEC("push_ll_and_redirect") | ||
199 | int do_push_ll_and_redirect(struct __sk_buff *skb) | ||
200 | { | ||
201 | int ret, ifindex = DST_IFINDEX; | ||
202 | |||
203 | ret = __do_push_ll_and_redirect(skb); | ||
204 | if (ret >= 0) | ||
205 | printk("redirected to %d\n", ifindex); | ||
206 | |||
207 | return ret; | ||
208 | } | ||
209 | |||
210 | static inline void __fill_garbage(struct __sk_buff *skb) | ||
211 | { | ||
212 | uint64_t f = 0xFFFFFFFFFFFFFFFF; | ||
213 | |||
214 | bpf_skb_store_bytes(skb, 0, &f, sizeof(f), 0); | ||
215 | bpf_skb_store_bytes(skb, 8, &f, sizeof(f), 0); | ||
216 | bpf_skb_store_bytes(skb, 16, &f, sizeof(f), 0); | ||
217 | bpf_skb_store_bytes(skb, 24, &f, sizeof(f), 0); | ||
218 | bpf_skb_store_bytes(skb, 32, &f, sizeof(f), 0); | ||
219 | bpf_skb_store_bytes(skb, 40, &f, sizeof(f), 0); | ||
220 | bpf_skb_store_bytes(skb, 48, &f, sizeof(f), 0); | ||
221 | bpf_skb_store_bytes(skb, 56, &f, sizeof(f), 0); | ||
222 | bpf_skb_store_bytes(skb, 64, &f, sizeof(f), 0); | ||
223 | bpf_skb_store_bytes(skb, 72, &f, sizeof(f), 0); | ||
224 | bpf_skb_store_bytes(skb, 80, &f, sizeof(f), 0); | ||
225 | bpf_skb_store_bytes(skb, 88, &f, sizeof(f), 0); | ||
226 | } | ||
227 | |||
228 | SEC("fill_garbage") | ||
229 | int do_fill_garbage(struct __sk_buff *skb) | ||
230 | { | ||
231 | __fill_garbage(skb); | ||
232 | printk("Set initial 96 bytes of header to FF\n"); | ||
233 | return BPF_OK; | ||
234 | } | ||
235 | |||
236 | SEC("fill_garbage_and_redirect") | ||
237 | int do_fill_garbage_and_redirect(struct __sk_buff *skb) | ||
238 | { | ||
239 | int ifindex = DST_IFINDEX; | ||
240 | __fill_garbage(skb); | ||
241 | printk("redirected to %d\n", ifindex); | ||
242 | return bpf_redirect(ifindex, 0); | ||
243 | } | ||
244 | |||
245 | /* Drop all packets */ | ||
246 | SEC("drop_all") | ||
247 | int do_drop_all(struct __sk_buff *skb) | ||
248 | { | ||
249 | printk("dropping with: %d\n", BPF_DROP); | ||
250 | return BPF_DROP; | ||
251 | } | ||
252 | |||
253 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/test_lwt_bpf.sh b/samples/bpf/test_lwt_bpf.sh new file mode 100755 index 000000000..65a976058 --- /dev/null +++ b/samples/bpf/test_lwt_bpf.sh | |||
@@ -0,0 +1,400 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | |||
4 | # Uncomment to see generated bytecode | ||
5 | #VERBOSE=verbose | ||
6 | |||
7 | NS1=lwt_ns1 | ||
8 | NS2=lwt_ns2 | ||
9 | VETH0=tst_lwt1a | ||
10 | VETH1=tst_lwt1b | ||
11 | VETH2=tst_lwt2a | ||
12 | VETH3=tst_lwt2b | ||
13 | IPVETH0="192.168.254.1" | ||
14 | IPVETH1="192.168.254.2" | ||
15 | IPVETH1b="192.168.254.3" | ||
16 | |||
17 | IPVETH2="192.168.111.1" | ||
18 | IPVETH3="192.168.111.2" | ||
19 | |||
20 | IP_LOCAL="192.168.99.1" | ||
21 | |||
22 | TRACE_ROOT=/sys/kernel/debug/tracing | ||
23 | |||
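| # Return the MAC address of interface $1 (optionally looked up inside netns $2) as a little-endian hex literal, used for the -DSRC_MAC/-DDST_MAC build defines | ||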
24 | function lookup_mac() | ||
25 | { | ||
26 | set +x | ||
27 | if [ ! -z "$2" ]; then | ||
28 | MAC=$(ip netns exec $2 ip link show $1 | grep ether | awk '{print $2}') | ||
29 | else | ||
30 | MAC=$(ip link show $1 | grep ether | awk '{print $2}') | ||
31 | fi | ||
32 | MAC="${MAC//:/}" | ||
33 | echo "0x${MAC:10:2}${MAC:8:2}${MAC:6:2}${MAC:4:2}${MAC:2:2}${MAC:0:2}" | ||
34 | set -x | ||
35 | } | ||
36 | |||
37 | function cleanup { | ||
38 | set +ex | ||
39 | rm test_lwt_bpf.o 2> /dev/null | ||
40 | ip link del $VETH0 2> /dev/null | ||
41 | ip link del $VETH1 2> /dev/null | ||
42 | ip link del $VETH2 2> /dev/null | ||
43 | ip link del $VETH3 2> /dev/null | ||
44 | ip netns exec $NS1 killall netserver | ||
45 | ip netns delete $NS1 2> /dev/null | ||
46 | ip netns delete $NS2 2> /dev/null | ||
47 | set -ex | ||
48 | } | ||
49 | |||
50 | function setup_one_veth { | ||
51 | ip netns add $1 | ||
52 | ip link add $2 type veth peer name $3 | ||
53 | ip link set dev $2 up | ||
54 | ip addr add $4/24 dev $2 | ||
55 | ip link set $3 netns $1 | ||
56 | ip netns exec $1 ip link set dev $3 up | ||
57 | ip netns exec $1 ip addr add $5/24 dev $3 | ||
58 | |||
59 | if [ "$6" ]; then | ||
60 | ip netns exec $1 ip addr add $6/32 dev $3 | ||
61 | fi | ||
62 | } | ||
63 | |||
64 | function get_trace { | ||
65 | set +x | ||
66 | cat ${TRACE_ROOT}/trace | grep -v '^#' | ||
67 | set -x | ||
68 | } | ||
69 | |||
70 | function cleanup_routes { | ||
71 | ip route del ${IPVETH1}/32 dev $VETH0 2> /dev/null || true | ||
72 | ip route del table local local ${IP_LOCAL}/32 dev lo 2> /dev/null || true | ||
73 | } | ||
74 | |||
75 | function install_test { | ||
76 | cleanup_routes | ||
77 | cp /dev/null ${TRACE_ROOT}/trace | ||
78 | |||
79 | OPTS="encap bpf headroom 14 $1 obj test_lwt_bpf.o section $2 $VERBOSE" | ||
80 | |||
81 | if [ "$1" == "in" ]; then | ||
82 | ip route add table local local ${IP_LOCAL}/32 $OPTS dev lo | ||
83 | else | ||
84 | ip route add ${IPVETH1}/32 $OPTS dev $VETH0 | ||
85 | fi | ||
86 | } | ||
87 | |||
88 | function remove_prog { | ||
89 | if [ "$1" == "in" ]; then | ||
90 | ip route del table local local ${IP_LOCAL}/32 dev lo | ||
91 | else | ||
92 | ip route del ${IPVETH1}/32 dev $VETH0 | ||
93 | fi | ||
94 | } | ||
95 | |||
96 | function filter_trace { | ||
97 | # Add newline to allow starting EXPECT= variables on newline | ||
98 | NL=$'\n' | ||
99 | echo "${NL}$*" | sed -e 's/^.*: : //g' | ||
100 | } | ||
101 | |||
102 | function expect_fail { | ||
103 | set +x | ||
104 | echo "FAIL:" | ||
105 | echo "Expected: $1" | ||
106 | echo "Got: $2" | ||
107 | set -x | ||
108 | exit 1 | ||
109 | } | ||
110 | |||
111 | function match_trace { | ||
112 | set +x | ||
113 | RET=0 | ||
114 | TRACE=$1 | ||
115 | EXPECT=$2 | ||
116 | GOT="$(filter_trace "$TRACE")" | ||
117 | |||
118 | [ "$GOT" != "$EXPECT" ] && { | ||
119 | expect_fail "$EXPECT" "$GOT" | ||
120 | RET=1 | ||
121 | } | ||
122 | set -x | ||
123 | return $RET | ||
124 | } | ||
125 | |||
126 | function test_start { | ||
127 | set +x | ||
128 | echo "----------------------------------------------------------------" | ||
129 | echo "Starting test: $*" | ||
130 | echo "----------------------------------------------------------------" | ||
131 | set -x | ||
132 | } | ||
133 | |||
134 | function failure { | ||
135 | get_trace | ||
136 | echo "FAIL: $*" | ||
137 | exit 1 | ||
138 | } | ||
139 | |||
140 | function test_ctx_xmit { | ||
141 | test_start "test_ctx on lwt xmit" | ||
142 | install_test xmit test_ctx | ||
143 | ping -c 3 $IPVETH1 || { | ||
144 | failure "test_ctx xmit: packets are dropped" | ||
145 | } | ||
146 | match_trace "$(get_trace)" " | ||
147 | len 84 hash 0 protocol 8 | ||
148 | cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX | ||
149 | len 84 hash 0 protocol 8 | ||
150 | cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX | ||
151 | len 84 hash 0 protocol 8 | ||
152 | cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX" || exit 1 | ||
153 | remove_prog xmit | ||
154 | } | ||
155 | |||
156 | function test_ctx_out { | ||
157 | test_start "test_ctx on lwt out" | ||
158 | install_test out test_ctx | ||
159 | ping -c 3 $IPVETH1 || { | ||
160 | failure "test_ctx out: packets are dropped" | ||
161 | } | ||
162 | match_trace "$(get_trace)" " | ||
163 | len 84 hash 0 protocol 0 | ||
164 | cb 1234 ingress_ifindex 0 ifindex 0 | ||
165 | len 84 hash 0 protocol 0 | ||
166 | cb 1234 ingress_ifindex 0 ifindex 0 | ||
167 | len 84 hash 0 protocol 0 | ||
168 | cb 1234 ingress_ifindex 0 ifindex 0" || exit 1 | ||
169 | remove_prog out | ||
170 | } | ||
171 | |||
172 | function test_ctx_in { | ||
173 | test_start "test_ctx on lwt in" | ||
174 | install_test in test_ctx | ||
175 | ping -c 3 $IP_LOCAL || { | ||
176 | failure "test_ctx out: packets are dropped" | ||
177 | } | ||
178 | # We will see both request & reply packets as the packets will | ||
179 | # be from $IP_LOCAL => $IP_LOCAL | ||
180 | match_trace "$(get_trace)" " | ||
181 | len 84 hash 0 protocol 8 | ||
182 | cb 1234 ingress_ifindex 1 ifindex 1 | ||
183 | len 84 hash 0 protocol 8 | ||
184 | cb 1234 ingress_ifindex 1 ifindex 1 | ||
185 | len 84 hash 0 protocol 8 | ||
186 | cb 1234 ingress_ifindex 1 ifindex 1 | ||
187 | len 84 hash 0 protocol 8 | ||
188 | cb 1234 ingress_ifindex 1 ifindex 1 | ||
189 | len 84 hash 0 protocol 8 | ||
190 | cb 1234 ingress_ifindex 1 ifindex 1 | ||
191 | len 84 hash 0 protocol 8 | ||
192 | cb 1234 ingress_ifindex 1 ifindex 1" || exit 1 | ||
193 | remove_prog in | ||
194 | } | ||
195 | |||
196 | function test_data { | ||
197 | test_start "test_data on lwt $1" | ||
198 | install_test $1 test_data | ||
199 | ping -c 3 $IPVETH1 || { | ||
200 | failure "test_data ${1}: packets are dropped" | ||
201 | } | ||
202 | match_trace "$(get_trace)" " | ||
203 | src: 1fea8c0 dst: 2fea8c0 | ||
204 | src: 1fea8c0 dst: 2fea8c0 | ||
205 | src: 1fea8c0 dst: 2fea8c0" || exit 1 | ||
206 | remove_prog $1 | ||
207 | } | ||
208 | |||
209 | function test_data_in { | ||
210 | test_start "test_data on lwt in" | ||
211 | install_test in test_data | ||
212 | ping -c 3 $IP_LOCAL || { | ||
213 | failure "test_data in: packets are dropped" | ||
214 | } | ||
215 | # We will see both request & reply packets as the packets will | ||
216 | # be from $IP_LOCAL => $IP_LOCAL | ||
217 | match_trace "$(get_trace)" " | ||
218 | src: 163a8c0 dst: 163a8c0 | ||
219 | src: 163a8c0 dst: 163a8c0 | ||
220 | src: 163a8c0 dst: 163a8c0 | ||
221 | src: 163a8c0 dst: 163a8c0 | ||
222 | src: 163a8c0 dst: 163a8c0 | ||
223 | src: 163a8c0 dst: 163a8c0" || exit 1 | ||
224 | remove_prog in | ||
225 | } | ||
226 | |||
227 | function test_cb { | ||
228 | test_start "test_cb on lwt $1" | ||
229 | install_test $1 test_cb | ||
230 | ping -c 3 $IPVETH1 || { | ||
231 | failure "test_cb ${1}: packets are dropped" | ||
232 | } | ||
233 | match_trace "$(get_trace)" " | ||
234 | cb0: 0 cb1: 0 cb2: 0 | ||
235 | cb3: 0 cb4: 0 | ||
236 | cb0: 0 cb1: 0 cb2: 0 | ||
237 | cb3: 0 cb4: 0 | ||
238 | cb0: 0 cb1: 0 cb2: 0 | ||
239 | cb3: 0 cb4: 0" || exit 1 | ||
240 | remove_prog $1 | ||
241 | } | ||
242 | |||
243 | function test_cb_in { | ||
244 | test_start "test_cb on lwt in" | ||
245 | install_test in test_cb | ||
246 | ping -c 3 $IP_LOCAL || { | ||
247 | failure "test_cb in: packets are dropped" | ||
248 | } | ||
249 | # We will see both request & reply packets as the packets will | ||
250 | # be from $IP_LOCAL => $IP_LOCAL | ||
251 | match_trace "$(get_trace)" " | ||
252 | cb0: 0 cb1: 0 cb2: 0 | ||
253 | cb3: 0 cb4: 0 | ||
254 | cb0: 0 cb1: 0 cb2: 0 | ||
255 | cb3: 0 cb4: 0 | ||
256 | cb0: 0 cb1: 0 cb2: 0 | ||
257 | cb3: 0 cb4: 0 | ||
258 | cb0: 0 cb1: 0 cb2: 0 | ||
259 | cb3: 0 cb4: 0 | ||
260 | cb0: 0 cb1: 0 cb2: 0 | ||
261 | cb3: 0 cb4: 0 | ||
262 | cb0: 0 cb1: 0 cb2: 0 | ||
263 | cb3: 0 cb4: 0" || exit 1 | ||
264 | remove_prog in | ||
265 | } | ||
266 | |||
267 | function test_drop_all { | ||
268 | test_start "test_drop_all on lwt $1" | ||
269 | install_test $1 drop_all | ||
270 | ping -c 3 $IPVETH1 && { | ||
271 | failure "test_drop_all ${1}: Unexpected success of ping" | ||
272 | } | ||
273 | match_trace "$(get_trace)" " | ||
274 | dropping with: 2 | ||
275 | dropping with: 2 | ||
276 | dropping with: 2" || exit 1 | ||
277 | remove_prog $1 | ||
278 | } | ||
279 | |||
280 | function test_drop_all_in { | ||
281 | test_start "test_drop_all on lwt in" | ||
282 | install_test in drop_all | ||
283 | ping -c 3 $IP_LOCAL && { | ||
284 | failure "test_drop_all in: Unexpected success of ping" | ||
285 | } | ||
286 | match_trace "$(get_trace)" " | ||
287 | dropping with: 2 | ||
288 | dropping with: 2 | ||
289 | dropping with: 2" || exit 1 | ||
290 | remove_prog in | ||
291 | } | ||
292 | |||
293 | function test_push_ll_and_redirect { | ||
294 | test_start "test_push_ll_and_redirect on lwt xmit" | ||
295 | install_test xmit push_ll_and_redirect | ||
296 | ping -c 3 $IPVETH1 || { | ||
297 | failure "Redirected packets appear to be dropped" | ||
298 | } | ||
299 | match_trace "$(get_trace)" " | ||
300 | redirected to $DST_IFINDEX | ||
301 | redirected to $DST_IFINDEX | ||
302 | redirected to $DST_IFINDEX" || exit 1 | ||
303 | remove_prog xmit | ||
304 | } | ||
305 | |||
306 | function test_no_l2_and_redirect { | ||
307 | test_start "test_no_l2_and_redirect on lwt xmit" | ||
308 | install_test xmit fill_garbage_and_redirect | ||
309 | ping -c 3 $IPVETH1 && { | ||
310 | failure "Unexpected success despite lack of L2 header" | ||
311 | } | ||
312 | match_trace "$(get_trace)" " | ||
313 | redirected to $DST_IFINDEX | ||
314 | redirected to $DST_IFINDEX | ||
315 | redirected to $DST_IFINDEX" || exit 1 | ||
316 | remove_prog xmit | ||
317 | } | ||
318 | |||
319 | function test_rewrite { | ||
320 | test_start "test_rewrite on lwt xmit" | ||
321 | install_test xmit test_rewrite | ||
322 | ping -c 3 $IPVETH1 || { | ||
323 | failure "Rewritten packets appear to be dropped" | ||
324 | } | ||
325 | match_trace "$(get_trace)" " | ||
326 | out: rewriting from 2fea8c0 to 3fea8c0 | ||
327 | out: rewriting from 2fea8c0 to 3fea8c0 | ||
328 | out: rewriting from 2fea8c0 to 3fea8c0" || exit 1 | ||
329 | remove_prog xmit | ||
330 | } | ||
331 | |||
332 | function test_fill_garbage { | ||
333 | test_start "test_fill_garbage on lwt xmit" | ||
334 | install_test xmit fill_garbage | ||
335 | ping -c 3 $IPVETH1 && { | ||
336 | failure "test_drop_all ${1}: Unexpected success of ping" | ||
337 | } | ||
338 | match_trace "$(get_trace)" " | ||
339 | Set initial 96 bytes of header to FF | ||
340 | Set initial 96 bytes of header to FF | ||
341 | Set initial 96 bytes of header to FF" || exit 1 | ||
342 | remove_prog xmit | ||
343 | } | ||
344 | |||
345 | function test_netperf_nop { | ||
346 | test_start "test_netperf_nop on lwt xmit" | ||
347 | install_test xmit nop | ||
348 | netperf -H $IPVETH1 -t TCP_STREAM || { | ||
349 | failure "packets appear to be dropped" | ||
350 | } | ||
351 | match_trace "$(get_trace)" ""|| exit 1 | ||
352 | remove_prog xmit | ||
353 | } | ||
354 | |||
355 | function test_netperf_redirect { | ||
356 | test_start "test_netperf_redirect on lwt xmit" | ||
357 | install_test xmit push_ll_and_redirect_silent | ||
358 | netperf -H $IPVETH1 -t TCP_STREAM || { | ||
359 | failure "Rewritten packets appear to be dropped" | ||
360 | } | ||
361 | match_trace "$(get_trace)" ""|| exit 1 | ||
362 | remove_prog xmit | ||
363 | } | ||
364 | |||
365 | cleanup | ||
366 | setup_one_veth $NS1 $VETH0 $VETH1 $IPVETH0 $IPVETH1 $IPVETH1b | ||
367 | setup_one_veth $NS2 $VETH2 $VETH3 $IPVETH2 $IPVETH3 | ||
368 | ip netns exec $NS1 netserver | ||
369 | echo 1 > ${TRACE_ROOT}/tracing_on | ||
370 | |||
371 | DST_MAC=$(lookup_mac $VETH1 $NS1) | ||
372 | SRC_MAC=$(lookup_mac $VETH0) | ||
373 | DST_IFINDEX=$(cat /sys/class/net/$VETH0/ifindex) | ||
374 | |||
375 | CLANG_OPTS="-O2 -target bpf -I ../include/" | ||
376 | CLANG_OPTS+=" -DSRC_MAC=$SRC_MAC -DDST_MAC=$DST_MAC -DDST_IFINDEX=$DST_IFINDEX" | ||
377 | clang $CLANG_OPTS -c test_lwt_bpf.c -o test_lwt_bpf.o | ||
378 | |||
379 | test_ctx_xmit | ||
380 | test_ctx_out | ||
381 | test_ctx_in | ||
382 | test_data "xmit" | ||
383 | test_data "out" | ||
384 | test_data_in | ||
385 | test_cb "xmit" | ||
386 | test_cb "out" | ||
387 | test_cb_in | ||
388 | test_drop_all "xmit" | ||
389 | test_drop_all "out" | ||
390 | test_drop_all_in | ||
391 | test_rewrite | ||
392 | test_push_ll_and_redirect | ||
393 | test_no_l2_and_redirect | ||
394 | test_fill_garbage | ||
395 | test_netperf_nop | ||
396 | test_netperf_redirect | ||
397 | |||
398 | cleanup | ||
399 | echo 0 > ${TRACE_ROOT}/tracing_on | ||
400 | exit 0 | ||
diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map_kern.c new file mode 100644 index 000000000..b0200c8ea --- /dev/null +++ b/samples/bpf/test_map_in_map_kern.c | |||
@@ -0,0 +1,176 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2017 Facebook | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of version 2 of the GNU General Public | ||
6 | * License as published by the Free Software Foundation. | ||
7 | */ | ||
8 | #define KBUILD_MODNAME "foo" | ||
9 | #include <linux/ptrace.h> | ||
10 | #include <linux/version.h> | ||
11 | #include <uapi/linux/bpf.h> | ||
12 | #include <uapi/linux/in6.h> | ||
13 | #include <bpf/bpf_helpers.h> | ||
14 | #include <bpf/bpf_tracing.h> | ||
15 | #include <bpf/bpf_core_read.h> | ||
16 | #include "trace_common.h" | ||
17 | |||
18 | #define MAX_NR_PORTS 65536 | ||
19 | |||
20 | /* map #0 */ | ||
21 | struct inner_a { | ||
22 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
23 | __type(key, u32); | ||
24 | __type(value, int); | ||
25 | __uint(max_entries, MAX_NR_PORTS); | ||
26 | } port_a SEC(".maps"); | ||
27 | |||
28 | /* map #1 */ | ||
29 | struct inner_h { | ||
30 | __uint(type, BPF_MAP_TYPE_HASH); | ||
31 | __type(key, u32); | ||
32 | __type(value, int); | ||
33 | __uint(max_entries, 1); | ||
34 | } port_h SEC(".maps"); | ||
35 | |||
36 | /* map #2 */ | ||
37 | struct { | ||
38 | __uint(type, BPF_MAP_TYPE_HASH); | ||
39 | __type(key, u32); | ||
40 | __type(value, int); | ||
41 | __uint(max_entries, 1); | ||
42 | } reg_result_h SEC(".maps"); | ||
43 | |||
44 | /* map #3 */ | ||
45 | struct { | ||
46 | __uint(type, BPF_MAP_TYPE_HASH); | ||
47 | __type(key, u32); | ||
48 | __type(value, int); | ||
49 | __uint(max_entries, 1); | ||
50 | } inline_result_h SEC(".maps"); | ||
51 | |||
52 | /* map #4 */ /* Test case #0 */ | ||
53 | struct { | ||
54 | __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); | ||
55 | __uint(max_entries, MAX_NR_PORTS); | ||
56 | __uint(key_size, sizeof(u32)); | ||
57 | __array(values, struct inner_a); /* use inner_a as inner map */ | ||
58 | } a_of_port_a SEC(".maps"); | ||
59 | |||
60 | /* map #5 */ /* Test case #1 */ | ||
61 | struct { | ||
62 | __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); | ||
63 | __uint(max_entries, 1); | ||
64 | __uint(key_size, sizeof(u32)); | ||
65 | __array(values, struct inner_a); /* use inner_a as inner map */ | ||
66 | } h_of_port_a SEC(".maps"); | ||
67 | |||
68 | /* map #6 */ /* Test case #2 */ | ||
69 | struct { | ||
70 | __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); | ||
71 | __uint(max_entries, 1); | ||
72 | __uint(key_size, sizeof(u32)); | ||
73 | __array(values, struct inner_h); /* use inner_h as inner map */ | ||
74 | } h_of_port_h SEC(".maps"); | ||
75 | |||
76 | static __always_inline int do_reg_lookup(void *inner_map, u32 port) | ||
77 | { | ||
78 | int *result; | ||
79 | |||
80 | result = bpf_map_lookup_elem(inner_map, &port); | ||
81 | return result ? *result : -ENOENT; | ||
82 | } | ||
83 | |||
84 | static __always_inline int do_inline_array_lookup(void *inner_map, u32 port) | ||
85 | { | ||
86 | int *result; | ||
87 | |||
88 | if (inner_map != &port_a) | ||
89 | return -EINVAL; | ||
90 | |||
91 | result = bpf_map_lookup_elem(&port_a, &port); | ||
92 | return result ? *result : -ENOENT; | ||
93 | } | ||
94 | |||
95 | static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port) | ||
96 | { | ||
97 | int *result; | ||
98 | |||
99 | if (inner_map != &port_h) | ||
100 | return -EINVAL; | ||
101 | |||
102 | result = bpf_map_lookup_elem(&port_h, &port); | ||
103 | return result ? *result : -ENOENT; | ||
104 | } | ||
105 | |||
106 | SEC("kprobe/__sys_connect") | ||
107 | int trace_sys_connect(struct pt_regs *ctx) | ||
108 | { | ||
109 | struct sockaddr_in6 *in6; | ||
110 | u16 test_case, port, dst6[8]; | ||
111 | int addrlen, ret, inline_ret, ret_key = 0; | ||
112 | u32 port_key; | ||
113 | void *outer_map, *inner_map; | ||
114 | bool inline_hash = false; | ||
115 | |||
116 | in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(ctx); | ||
117 | addrlen = (int)PT_REGS_PARM3_CORE(ctx); | ||
118 | |||
119 | if (addrlen != sizeof(*in6)) | ||
120 | return 0; | ||
121 | |||
122 | ret = bpf_probe_read_user(dst6, sizeof(dst6), &in6->sin6_addr); | ||
123 | if (ret) { | ||
124 | inline_ret = ret; | ||
125 | goto done; | ||
126 | } | ||
127 | |||
128 | if (dst6[0] != 0xdead || dst6[1] != 0xbeef) | ||
129 | return 0; | ||
130 | |||
131 | test_case = dst6[7]; | ||
132 | |||
133 | ret = bpf_probe_read_user(&port, sizeof(port), &in6->sin6_port); | ||
134 | if (ret) { | ||
135 | inline_ret = ret; | ||
136 | goto done; | ||
137 | } | ||
138 | |||
139 | port_key = port; | ||
140 | |||
141 | ret = -ENOENT; | ||
142 | if (test_case == 0) { | ||
143 | outer_map = &a_of_port_a; | ||
144 | } else if (test_case == 1) { | ||
145 | outer_map = &h_of_port_a; | ||
146 | } else if (test_case == 2) { | ||
147 | outer_map = &h_of_port_h; | ||
148 | } else { | ||
149 | ret = __LINE__; | ||
150 | inline_ret = ret; | ||
151 | goto done; | ||
152 | } | ||
153 | |||
154 | inner_map = bpf_map_lookup_elem(outer_map, &port_key); | ||
155 | if (!inner_map) { | ||
156 | ret = __LINE__; | ||
157 | inline_ret = ret; | ||
158 | goto done; | ||
159 | } | ||
160 | |||
161 | ret = do_reg_lookup(inner_map, port_key); | ||
162 | |||
163 | if (test_case == 0 || test_case == 1) | ||
164 | inline_ret = do_inline_array_lookup(inner_map, port_key); | ||
165 | else | ||
166 | inline_ret = do_inline_hash_lookup(inner_map, port_key); | ||
167 | |||
168 | done: | ||
169 | bpf_map_update_elem(®_result_h, &ret_key, &ret, BPF_ANY); | ||
170 | bpf_map_update_elem(&inline_result_h, &ret_key, &inline_ret, BPF_ANY); | ||
171 | |||
172 | return 0; | ||
173 | } | ||
174 | |||
175 | char _license[] SEC("license") = "GPL"; | ||
176 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
diff --git a/samples/bpf/test_map_in_map_user.c b/samples/bpf/test_map_in_map_user.c new file mode 100644 index 000000000..98656de56 --- /dev/null +++ b/samples/bpf/test_map_in_map_user.c | |||
@@ -0,0 +1,173 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Copyright (c) 2017 Facebook | ||
4 | */ | ||
5 | #include <sys/resource.h> | ||
6 | #include <sys/socket.h> | ||
7 | #include <arpa/inet.h> | ||
8 | #include <stdint.h> | ||
9 | #include <assert.h> | ||
10 | #include <errno.h> | ||
11 | #include <stdlib.h> | ||
12 | #include <stdio.h> | ||
13 | #include <bpf/bpf.h> | ||
14 | #include <bpf/libbpf.h> | ||
15 | |||
16 | static int map_fd[7]; | ||
17 | |||
18 | #define PORT_A (map_fd[0]) | ||
19 | #define PORT_H (map_fd[1]) | ||
20 | #define REG_RESULT_H (map_fd[2]) | ||
21 | #define INLINE_RESULT_H (map_fd[3]) | ||
22 | #define A_OF_PORT_A (map_fd[4]) /* Test case #0 */ | ||
23 | #define H_OF_PORT_A (map_fd[5]) /* Test case #1 */ | ||
24 | #define H_OF_PORT_H (map_fd[6]) /* Test case #2 */ | ||
25 | |||
26 | static const char * const test_names[] = { | ||
27 | "Array of Array", | ||
28 | "Hash of Array", | ||
29 | "Hash of Hash", | ||
30 | }; | ||
31 | |||
32 | #define NR_TESTS (sizeof(test_names) / sizeof(*test_names)) | ||
33 | |||
34 | static void check_map_id(int inner_map_fd, int map_in_map_fd, uint32_t key) | ||
35 | { | ||
36 | struct bpf_map_info info = {}; | ||
37 | uint32_t info_len = sizeof(info); | ||
38 | int ret, id; | ||
39 | |||
40 | ret = bpf_obj_get_info_by_fd(inner_map_fd, &info, &info_len); | ||
41 | assert(!ret); | ||
42 | |||
43 | ret = bpf_map_lookup_elem(map_in_map_fd, &key, &id); | ||
44 | assert(!ret); | ||
45 | assert(id == info.id); | ||
46 | } | ||
47 | |||
48 | static void populate_map(uint32_t port_key, int magic_result) | ||
49 | { | ||
50 | int ret; | ||
51 | |||
52 | ret = bpf_map_update_elem(PORT_A, &port_key, &magic_result, BPF_ANY); | ||
53 | assert(!ret); | ||
54 | |||
55 | ret = bpf_map_update_elem(PORT_H, &port_key, &magic_result, | ||
56 | BPF_NOEXIST); | ||
57 | assert(!ret); | ||
58 | |||
59 | ret = bpf_map_update_elem(A_OF_PORT_A, &port_key, &PORT_A, BPF_ANY); | ||
60 | assert(!ret); | ||
61 | check_map_id(PORT_A, A_OF_PORT_A, port_key); | ||
62 | |||
63 | ret = bpf_map_update_elem(H_OF_PORT_A, &port_key, &PORT_A, BPF_NOEXIST); | ||
64 | assert(!ret); | ||
65 | check_map_id(PORT_A, H_OF_PORT_A, port_key); | ||
66 | |||
67 | ret = bpf_map_update_elem(H_OF_PORT_H, &port_key, &PORT_H, BPF_NOEXIST); | ||
68 | assert(!ret); | ||
69 | check_map_id(PORT_H, H_OF_PORT_H, port_key); | ||
70 | } | ||
71 | |||
72 | static void test_map_in_map(void) | ||
73 | { | ||
74 | struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 }; | ||
75 | uint32_t result_key = 0, port_key; | ||
76 | int result, inline_result; | ||
77 | int magic_result = 0xfaceb00c; | ||
78 | int ret; | ||
79 | int i; | ||
80 | |||
81 | port_key = rand() & 0x00FF; | ||
82 | populate_map(port_key, magic_result); | ||
83 | |||
84 | in6.sin6_addr.s6_addr16[0] = 0xdead; | ||
85 | in6.sin6_addr.s6_addr16[1] = 0xbeef; | ||
86 | in6.sin6_port = port_key; | ||
87 | |||
88 | for (i = 0; i < NR_TESTS; i++) { | ||
89 | printf("%s: ", test_names[i]); | ||
90 | |||
91 | in6.sin6_addr.s6_addr16[7] = i; | ||
92 | ret = connect(-1, (struct sockaddr *)&in6, sizeof(in6)); | ||
93 | assert(ret == -1 && errno == EBADF); | ||
94 | |||
95 | ret = bpf_map_lookup_elem(REG_RESULT_H, &result_key, &result); | ||
96 | assert(!ret); | ||
97 | |||
98 | ret = bpf_map_lookup_elem(INLINE_RESULT_H, &result_key, | ||
99 | &inline_result); | ||
100 | assert(!ret); | ||
101 | |||
102 | if (result != magic_result || inline_result != magic_result) { | ||
103 | printf("Error. result:%d inline_result:%d\n", | ||
104 | result, inline_result); | ||
105 | exit(1); | ||
106 | } | ||
107 | |||
108 | bpf_map_delete_elem(REG_RESULT_H, &result_key); | ||
109 | bpf_map_delete_elem(INLINE_RESULT_H, &result_key); | ||
110 | |||
111 | printf("Pass\n"); | ||
112 | } | ||
113 | } | ||
114 | |||
115 | int main(int argc, char **argv) | ||
116 | { | ||
117 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
118 | struct bpf_link *link = NULL; | ||
119 | struct bpf_program *prog; | ||
120 | struct bpf_object *obj; | ||
121 | char filename[256]; | ||
122 | |||
123 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
124 | perror("setrlimit(RLIMIT_MEMLOCK)"); | ||
125 | return 1; | ||
126 | } | ||
127 | |||
128 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
129 | obj = bpf_object__open_file(filename, NULL); | ||
130 | if (libbpf_get_error(obj)) { | ||
131 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
132 | return 0; | ||
133 | } | ||
134 | |||
135 | prog = bpf_object__find_program_by_name(obj, "trace_sys_connect"); | ||
136 | if (!prog) { | ||
137 | printf("finding a prog in obj file failed\n"); | ||
138 | goto cleanup; | ||
139 | } | ||
140 | |||
141 | /* load BPF program */ | ||
142 | if (bpf_object__load(obj)) { | ||
143 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
144 | goto cleanup; | ||
145 | } | ||
146 | |||
147 | map_fd[0] = bpf_object__find_map_fd_by_name(obj, "port_a"); | ||
148 | map_fd[1] = bpf_object__find_map_fd_by_name(obj, "port_h"); | ||
149 | map_fd[2] = bpf_object__find_map_fd_by_name(obj, "reg_result_h"); | ||
150 | map_fd[3] = bpf_object__find_map_fd_by_name(obj, "inline_result_h"); | ||
151 | map_fd[4] = bpf_object__find_map_fd_by_name(obj, "a_of_port_a"); | ||
152 | map_fd[5] = bpf_object__find_map_fd_by_name(obj, "h_of_port_a"); | ||
153 | map_fd[6] = bpf_object__find_map_fd_by_name(obj, "h_of_port_h"); | ||
154 | if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0 || | ||
155 | map_fd[3] < 0 || map_fd[4] < 0 || map_fd[5] < 0 || map_fd[6] < 0) { | ||
156 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
157 | goto cleanup; | ||
158 | } | ||
159 | |||
160 | link = bpf_program__attach(prog); | ||
161 | if (libbpf_get_error(link)) { | ||
162 | fprintf(stderr, "ERROR: bpf_program__attach failed\n"); | ||
163 | link = NULL; | ||
164 | goto cleanup; | ||
165 | } | ||
166 | |||
167 | test_map_in_map(); | ||
168 | |||
169 | cleanup: | ||
170 | bpf_link__destroy(link); | ||
171 | bpf_object__close(obj); | ||
172 | return 0; | ||
173 | } | ||
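check_map_id() above relies on the fact that a user-space bpf_map_lookup_elem() on a *_OF_MAPS map returns the inner map's ID rather than a file descriptor. If a tool later needs an FD again, libbpf can convert the ID back; a minimal hedged sketch (get_inner_map_fd() is a made-up helper name, and sufficient privileges for bpf_map_get_fd_by_id() are assumed):

    #include <stdint.h>
    #include <bpf/bpf.h>

    static int get_inner_map_fd(int outer_map_fd, uint32_t key)
    {
            uint32_t inner_id;

            /* lookups on ARRAY_OF_MAPS / HASH_OF_MAPS yield the inner map ID */
            if (bpf_map_lookup_elem(outer_map_fd, &key, &inner_id))
                    return -1;

            /* turn the ID back into a usable file descriptor */
            return bpf_map_get_fd_by_id(inner_id);
    }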
diff --git a/samples/bpf/test_overhead_kprobe_kern.c b/samples/bpf/test_overhead_kprobe_kern.c new file mode 100644 index 000000000..f6d593e47 --- /dev/null +++ b/samples/bpf/test_overhead_kprobe_kern.c | |||
@@ -0,0 +1,48 @@ | |||
1 | /* Copyright (c) 2016 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <linux/version.h> | ||
8 | #include <linux/ptrace.h> | ||
9 | #include <uapi/linux/bpf.h> | ||
10 | #include <bpf/bpf_helpers.h> | ||
11 | #include <bpf/bpf_tracing.h> | ||
12 | |||
13 | #define _(P) \ | ||
14 | ({ \ | ||
15 | typeof(P) val = 0; \ | ||
16 | bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ | ||
17 | val; \ | ||
18 | }) | ||
19 | |||
20 | SEC("kprobe/__set_task_comm") | ||
21 | int prog(struct pt_regs *ctx) | ||
22 | { | ||
23 | struct signal_struct *signal; | ||
24 | struct task_struct *tsk; | ||
25 | char oldcomm[16] = {}; | ||
26 | char newcomm[16] = {}; | ||
27 | u16 oom_score_adj; | ||
28 | u32 pid; | ||
29 | |||
30 | tsk = (void *)PT_REGS_PARM1(ctx); | ||
31 | |||
32 | pid = _(tsk->pid); | ||
33 | bpf_probe_read_kernel(oldcomm, sizeof(oldcomm), &tsk->comm); | ||
34 | bpf_probe_read_kernel(newcomm, sizeof(newcomm), | ||
35 | (void *)PT_REGS_PARM2(ctx)); | ||
36 | signal = _(tsk->signal); | ||
37 | oom_score_adj = _(signal->oom_score_adj); | ||
38 | return 0; | ||
39 | } | ||
40 | |||
41 | SEC("kprobe/urandom_read") | ||
42 | int prog2(struct pt_regs *ctx) | ||
43 | { | ||
44 | return 0; | ||
45 | } | ||
46 | |||
47 | char _license[] SEC("license") = "GPL"; | ||
48 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
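The _(P) macro above wraps a single-field bpf_probe_read_kernel() in a statement expression. As a rough, hand-expanded sketch (not actual preprocessor output), `pid = _(tsk->pid);` behaves like:

    {
            pid_t tmp = 0;                                  /* typeof(tsk->pid) */
            bpf_probe_read_kernel(&tmp, sizeof(tmp), &tsk->pid);
            pid = tmp;                                      /* value of the ({ ... }) block */
    }

tracex1_kern.c further down defines the same helper for its skb->dev and skb->len reads.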
diff --git a/samples/bpf/test_overhead_raw_tp_kern.c b/samples/bpf/test_overhead_raw_tp_kern.c new file mode 100644 index 000000000..8763181a3 --- /dev/null +++ b/samples/bpf/test_overhead_raw_tp_kern.c | |||
@@ -0,0 +1,17 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* Copyright (c) 2018 Facebook */ | ||
3 | #include <uapi/linux/bpf.h> | ||
4 | #include <bpf/bpf_helpers.h> | ||
5 | |||
6 | SEC("raw_tracepoint/task_rename") | ||
7 | int prog(struct bpf_raw_tracepoint_args *ctx) | ||
8 | { | ||
9 | return 0; | ||
10 | } | ||
11 | |||
12 | SEC("raw_tracepoint/urandom_read") | ||
13 | int prog2(struct bpf_raw_tracepoint_args *ctx) | ||
14 | { | ||
15 | return 0; | ||
16 | } | ||
17 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/test_overhead_tp_kern.c b/samples/bpf/test_overhead_tp_kern.c new file mode 100644 index 000000000..eaa32693f --- /dev/null +++ b/samples/bpf/test_overhead_tp_kern.c | |||
@@ -0,0 +1,36 @@ | |||
1 | /* Copyright (c) 2016 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <uapi/linux/bpf.h> | ||
8 | #include <bpf/bpf_helpers.h> | ||
9 | |||
10 | /* from /sys/kernel/debug/tracing/events/task/task_rename/format */ | ||
11 | struct task_rename { | ||
12 | __u64 pad; | ||
13 | __u32 pid; | ||
14 | char oldcomm[16]; | ||
15 | char newcomm[16]; | ||
16 | __u16 oom_score_adj; | ||
17 | }; | ||
18 | SEC("tracepoint/task/task_rename") | ||
19 | int prog(struct task_rename *ctx) | ||
20 | { | ||
21 | return 0; | ||
22 | } | ||
23 | |||
24 | /* from /sys/kernel/debug/tracing/events/random/urandom_read/format */ | ||
25 | struct urandom_read { | ||
26 | __u64 pad; | ||
27 | int got_bits; | ||
28 | int pool_left; | ||
29 | int input_left; | ||
30 | }; | ||
31 | SEC("tracepoint/random/urandom_read") | ||
32 | int prog2(struct urandom_read *ctx) | ||
33 | { | ||
34 | return 0; | ||
35 | } | ||
36 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c new file mode 100644 index 000000000..94f74112a --- /dev/null +++ b/samples/bpf/test_overhead_user.c | |||
@@ -0,0 +1,182 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2016 Facebook | ||
3 | */ | ||
4 | #define _GNU_SOURCE | ||
5 | #include <sched.h> | ||
6 | #include <errno.h> | ||
7 | #include <stdio.h> | ||
8 | #include <sys/types.h> | ||
9 | #include <asm/unistd.h> | ||
10 | #include <fcntl.h> | ||
11 | #include <unistd.h> | ||
12 | #include <assert.h> | ||
13 | #include <sys/wait.h> | ||
14 | #include <stdlib.h> | ||
15 | #include <signal.h> | ||
16 | #include <linux/bpf.h> | ||
17 | #include <string.h> | ||
18 | #include <time.h> | ||
19 | #include <sys/resource.h> | ||
20 | #include <bpf/bpf.h> | ||
21 | #include "bpf_load.h" | ||
22 | |||
23 | #define MAX_CNT 1000000 | ||
24 | |||
25 | static __u64 time_get_ns(void) | ||
26 | { | ||
27 | struct timespec ts; | ||
28 | |||
29 | clock_gettime(CLOCK_MONOTONIC, &ts); | ||
30 | return ts.tv_sec * 1000000000ull + ts.tv_nsec; | ||
31 | } | ||
32 | |||
33 | static void test_task_rename(int cpu) | ||
34 | { | ||
35 | __u64 start_time; | ||
36 | char buf[] = "test\n"; | ||
37 | int i, fd; | ||
38 | |||
39 | fd = open("/proc/self/comm", O_WRONLY|O_TRUNC); | ||
40 | if (fd < 0) { | ||
41 | printf("couldn't open /proc\n"); | ||
42 | exit(1); | ||
43 | } | ||
44 | start_time = time_get_ns(); | ||
45 | for (i = 0; i < MAX_CNT; i++) { | ||
46 | if (write(fd, buf, sizeof(buf)) < 0) { | ||
47 | printf("task rename failed: %s\n", strerror(errno)); | ||
48 | close(fd); | ||
49 | return; | ||
50 | } | ||
51 | } | ||
52 | printf("task_rename:%d: %lld events per sec\n", | ||
53 | cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); | ||
54 | close(fd); | ||
55 | } | ||
56 | |||
57 | static void test_urandom_read(int cpu) | ||
58 | { | ||
59 | __u64 start_time; | ||
60 | char buf[4]; | ||
61 | int i, fd; | ||
62 | |||
63 | fd = open("/dev/urandom", O_RDONLY); | ||
64 | if (fd < 0) { | ||
65 | printf("couldn't open /dev/urandom\n"); | ||
66 | exit(1); | ||
67 | } | ||
68 | start_time = time_get_ns(); | ||
69 | for (i = 0; i < MAX_CNT; i++) { | ||
70 | if (read(fd, buf, sizeof(buf)) < 0) { | ||
71 | printf("failed to read from /dev/urandom: %s\n", strerror(errno)); | ||
72 | close(fd); | ||
73 | return; | ||
74 | } | ||
75 | } | ||
76 | printf("urandom_read:%d: %lld events per sec\n", | ||
77 | cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); | ||
78 | close(fd); | ||
79 | } | ||
80 | |||
81 | static void loop(int cpu, int flags) | ||
82 | { | ||
83 | cpu_set_t cpuset; | ||
84 | |||
85 | CPU_ZERO(&cpuset); | ||
86 | CPU_SET(cpu, &cpuset); | ||
87 | sched_setaffinity(0, sizeof(cpuset), &cpuset); | ||
88 | |||
89 | if (flags & 1) | ||
90 | test_task_rename(cpu); | ||
91 | if (flags & 2) | ||
92 | test_urandom_read(cpu); | ||
93 | } | ||
94 | |||
95 | static void run_perf_test(int tasks, int flags) | ||
96 | { | ||
97 | pid_t pid[tasks]; | ||
98 | int i; | ||
99 | |||
100 | for (i = 0; i < tasks; i++) { | ||
101 | pid[i] = fork(); | ||
102 | if (pid[i] == 0) { | ||
103 | loop(i, flags); | ||
104 | exit(0); | ||
105 | } else if (pid[i] == -1) { | ||
106 | printf("couldn't spawn #%d process\n", i); | ||
107 | exit(1); | ||
108 | } | ||
109 | } | ||
110 | for (i = 0; i < tasks; i++) { | ||
111 | int status; | ||
112 | |||
113 | assert(waitpid(pid[i], &status, 0) == pid[i]); | ||
114 | assert(status == 0); | ||
115 | } | ||
116 | } | ||
117 | |||
118 | static void unload_progs(void) | ||
119 | { | ||
120 | close(prog_fd[0]); | ||
121 | close(prog_fd[1]); | ||
122 | close(event_fd[0]); | ||
123 | close(event_fd[1]); | ||
124 | } | ||
125 | |||
126 | int main(int argc, char **argv) | ||
127 | { | ||
128 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
129 | char filename[256]; | ||
130 | int num_cpu = 8; | ||
131 | int test_flags = ~0; | ||
132 | |||
133 | setrlimit(RLIMIT_MEMLOCK, &r); | ||
134 | |||
135 | if (argc > 1) | ||
136 | test_flags = atoi(argv[1]) ? : test_flags; | ||
137 | if (argc > 2) | ||
138 | num_cpu = atoi(argv[2]) ? : num_cpu; | ||
139 | |||
140 | if (test_flags & 0x3) { | ||
141 | printf("BASE\n"); | ||
142 | run_perf_test(num_cpu, test_flags); | ||
143 | } | ||
144 | |||
145 | if (test_flags & 0xC) { | ||
146 | snprintf(filename, sizeof(filename), | ||
147 | "%s_kprobe_kern.o", argv[0]); | ||
148 | if (load_bpf_file(filename)) { | ||
149 | printf("%s", bpf_log_buf); | ||
150 | return 1; | ||
151 | } | ||
152 | printf("w/KPROBE\n"); | ||
153 | run_perf_test(num_cpu, test_flags >> 2); | ||
154 | unload_progs(); | ||
155 | } | ||
156 | |||
157 | if (test_flags & 0x30) { | ||
158 | snprintf(filename, sizeof(filename), | ||
159 | "%s_tp_kern.o", argv[0]); | ||
160 | if (load_bpf_file(filename)) { | ||
161 | printf("%s", bpf_log_buf); | ||
162 | return 1; | ||
163 | } | ||
164 | printf("w/TRACEPOINT\n"); | ||
165 | run_perf_test(num_cpu, test_flags >> 4); | ||
166 | unload_progs(); | ||
167 | } | ||
168 | |||
169 | if (test_flags & 0xC0) { | ||
170 | snprintf(filename, sizeof(filename), | ||
171 | "%s_raw_tp_kern.o", argv[0]); | ||
172 | if (load_bpf_file(filename)) { | ||
173 | printf("%s", bpf_log_buf); | ||
174 | return 1; | ||
175 | } | ||
176 | printf("w/RAW_TRACEPOINT\n"); | ||
177 | run_perf_test(num_cpu, test_flags >> 6); | ||
178 | unload_progs(); | ||
179 | } | ||
180 | |||
181 | return 0; | ||
182 | } | ||
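main() treats its first argument as a decimal bit mask: bits 0-1 pick the baseline runs, bits 2-3 the kprobe runs, bits 4-5 the tracepoint runs, and bits 6-7 the raw-tracepoint runs; the >> 2 / >> 4 / >> 6 shifts reduce each pair back to the 1 (task_rename) | 2 (urandom_read) encoding that loop() checks. A hedged sketch of that layout (these macro names are invented here and do not exist in the sample):

    #define FLAG_BASE_RENAME      0x01  /* bit 0: task_rename, no BPF attached      */
    #define FLAG_BASE_URANDOM     0x02  /* bit 1: urandom_read, no BPF attached     */
    #define FLAG_KPROBE_RENAME    0x04  /* bit 2: task_rename under kprobe          */
    #define FLAG_KPROBE_URANDOM   0x08  /* bit 3: urandom_read under kprobe         */
    #define FLAG_TP_RENAME        0x10  /* bit 4: task_rename under tracepoint      */
    #define FLAG_TP_URANDOM       0x20  /* bit 5: urandom_read under tracepoint     */
    #define FLAG_RAWTP_RENAME     0x40  /* bit 6: task_rename under raw tracepoint  */
    #define FLAG_RAWTP_URANDOM    0x80  /* bit 7: urandom_read under raw tracepoint */

So, assuming the binary keeps the sample's test_overhead name, "./test_overhead 12 4" would run only the two kprobe workloads on 4 CPUs (note that atoi() does not parse hex, so the mask has to be given in decimal).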
diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh new file mode 100755 index 000000000..35db26f73 --- /dev/null +++ b/samples/bpf/test_override_return.sh | |||
@@ -0,0 +1,16 @@ | |||
1 | #!/bin/bash | ||
2 | |||
3 | rm -r tmpmnt | ||
4 | rm -f testfile.img | ||
5 | dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 | ||
6 | DEVICE=$(losetup --show -f testfile.img) | ||
7 | mkfs.btrfs -f $DEVICE | ||
8 | mkdir tmpmnt | ||
9 | ./tracex7 $DEVICE | ||
10 | if [ $? -eq 0 ] | ||
11 | then | ||
12 | echo "SUCCESS!" | ||
13 | else | ||
14 | echo "FAILED!" | ||
15 | fi | ||
16 | losetup -d $DEVICE | ||
diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c new file mode 100644 index 000000000..220a96438 --- /dev/null +++ b/samples/bpf/test_probe_write_user_kern.c | |||
@@ -0,0 +1,56 @@ | |||
1 | /* Copyright (c) 2016 Sargun Dhillon <sargun@sargun.me> | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <linux/skbuff.h> | ||
8 | #include <linux/netdevice.h> | ||
9 | #include <uapi/linux/bpf.h> | ||
10 | #include <linux/version.h> | ||
11 | #include <bpf/bpf_helpers.h> | ||
12 | #include <bpf/bpf_tracing.h> | ||
13 | #include <bpf/bpf_core_read.h> | ||
14 | #include "trace_common.h" | ||
15 | |||
16 | struct { | ||
17 | __uint(type, BPF_MAP_TYPE_HASH); | ||
18 | __type(key, struct sockaddr_in); | ||
19 | __type(value, struct sockaddr_in); | ||
20 | __uint(max_entries, 256); | ||
21 | } dnat_map SEC(".maps"); | ||
22 | |||
23 | /* kprobe is NOT a stable ABI | ||
24 | * kernel functions can be removed, renamed or completely change semantics. | ||
25 | * Number of arguments and their positions can change, etc. | ||
26 | * In such cases this bpf+kprobe example will no longer be meaningful. | ||
27 | * | ||
28 | * This example sits on a syscall, and the syscall ABI is relatively stable; | ||
29 | * of course, across platforms and over time, the ABI may still change. | ||
30 | */ | ||
31 | SEC("kprobe/" SYSCALL(sys_connect)) | ||
32 | int bpf_prog1(struct pt_regs *ctx) | ||
33 | { | ||
34 | struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx); | ||
35 | void *sockaddr_arg = (void *)PT_REGS_PARM2_CORE(real_regs); | ||
36 | int sockaddr_len = (int)PT_REGS_PARM3_CORE(real_regs); | ||
37 | struct sockaddr_in new_addr, orig_addr = {}; | ||
38 | struct sockaddr_in *mapped_addr; | ||
39 | |||
40 | if (sockaddr_len > sizeof(orig_addr)) | ||
41 | return 0; | ||
42 | |||
43 | if (bpf_probe_read_user(&orig_addr, sizeof(orig_addr), sockaddr_arg) != 0) | ||
44 | return 0; | ||
45 | |||
46 | mapped_addr = bpf_map_lookup_elem(&dnat_map, &orig_addr); | ||
47 | if (mapped_addr != NULL) { | ||
48 | memcpy(&new_addr, mapped_addr, sizeof(new_addr)); | ||
49 | bpf_probe_write_user(sockaddr_arg, &new_addr, | ||
50 | sizeof(new_addr)); | ||
51 | } | ||
52 | return 0; | ||
53 | } | ||
54 | |||
55 | char _license[] SEC("license") = "GPL"; | ||
56 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
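bpf_prog1() dereferences the registers twice because, on architectures with syscall wrappers, the kprobe on __x64_sys_connect receives a single argument that is itself a struct pt_regs * holding the user-visible syscall arguments. A sketch of that plumbing, assuming the x86_64 wrapper convention:

    /*
     *   ctx (struct pt_regs *)  -- registers at __x64_sys_connect() entry
     *     PT_REGS_PARM1_CORE(ctx)  --> real_regs (struct pt_regs * with user args)
     *       PT_REGS_PARM2_CORE(real_regs)  --> userspace struct sockaddr *
     *       PT_REGS_PARM3_CORE(real_regs)  --> addrlen
     *
     * On kernels without syscall wrappers the extra hop would not exist and
     * the sockaddr/addrlen would sit directly in PARM2/PARM3 of ctx.
     */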
diff --git a/samples/bpf/test_probe_write_user_user.c b/samples/bpf/test_probe_write_user_user.c new file mode 100644 index 000000000..00ccfb834 --- /dev/null +++ b/samples/bpf/test_probe_write_user_user.c | |||
@@ -0,0 +1,108 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <stdio.h> | ||
3 | #include <assert.h> | ||
4 | #include <unistd.h> | ||
5 | #include <bpf/bpf.h> | ||
6 | #include <bpf/libbpf.h> | ||
7 | #include <sys/socket.h> | ||
8 | #include <netinet/in.h> | ||
9 | #include <arpa/inet.h> | ||
10 | |||
11 | int main(int ac, char **argv) | ||
12 | { | ||
13 | struct sockaddr_in *serv_addr_in, *mapped_addr_in, *tmp_addr_in; | ||
14 | struct sockaddr serv_addr, mapped_addr, tmp_addr; | ||
15 | int serverfd, serverconnfd, clientfd, map_fd; | ||
16 | struct bpf_link *link = NULL; | ||
17 | struct bpf_program *prog; | ||
18 | struct bpf_object *obj; | ||
19 | socklen_t sockaddr_len; | ||
20 | char filename[256]; | ||
21 | char *ip; | ||
22 | |||
23 | serv_addr_in = (struct sockaddr_in *)&serv_addr; | ||
24 | mapped_addr_in = (struct sockaddr_in *)&mapped_addr; | ||
25 | tmp_addr_in = (struct sockaddr_in *)&tmp_addr; | ||
26 | |||
27 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
28 | obj = bpf_object__open_file(filename, NULL); | ||
29 | if (libbpf_get_error(obj)) { | ||
30 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
31 | return 0; | ||
32 | } | ||
33 | |||
34 | prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); | ||
35 | if (libbpf_get_error(prog)) { | ||
36 | fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); | ||
37 | goto cleanup; | ||
38 | } | ||
39 | |||
40 | /* load BPF program */ | ||
41 | if (bpf_object__load(obj)) { | ||
42 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
43 | goto cleanup; | ||
44 | } | ||
45 | |||
46 | map_fd = bpf_object__find_map_fd_by_name(obj, "dnat_map"); | ||
47 | if (map_fd < 0) { | ||
48 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
49 | goto cleanup; | ||
50 | } | ||
51 | |||
52 | link = bpf_program__attach(prog); | ||
53 | if (libbpf_get_error(link)) { | ||
54 | fprintf(stderr, "ERROR: bpf_program__attach failed\n"); | ||
55 | link = NULL; | ||
56 | goto cleanup; | ||
57 | } | ||
58 | |||
59 | assert((serverfd = socket(AF_INET, SOCK_STREAM, 0)) > 0); | ||
60 | assert((clientfd = socket(AF_INET, SOCK_STREAM, 0)) > 0); | ||
61 | |||
62 | /* Bind server to ephemeral port on lo */ | ||
63 | memset(&serv_addr, 0, sizeof(serv_addr)); | ||
64 | serv_addr_in->sin_family = AF_INET; | ||
65 | serv_addr_in->sin_port = 0; | ||
66 | serv_addr_in->sin_addr.s_addr = htonl(INADDR_LOOPBACK); | ||
67 | |||
68 | assert(bind(serverfd, &serv_addr, sizeof(serv_addr)) == 0); | ||
69 | |||
70 | sockaddr_len = sizeof(serv_addr); | ||
71 | assert(getsockname(serverfd, &serv_addr, &sockaddr_len) == 0); | ||
72 | ip = inet_ntoa(serv_addr_in->sin_addr); | ||
73 | printf("Server bound to: %s:%d\n", ip, ntohs(serv_addr_in->sin_port)); | ||
74 | |||
75 | memset(&mapped_addr, 0, sizeof(mapped_addr)); | ||
76 | mapped_addr_in->sin_family = AF_INET; | ||
77 | mapped_addr_in->sin_port = htons(5555); | ||
78 | mapped_addr_in->sin_addr.s_addr = inet_addr("255.255.255.255"); | ||
79 | |||
80 | assert(!bpf_map_update_elem(map_fd, &mapped_addr, &serv_addr, BPF_ANY)); | ||
81 | |||
82 | assert(listen(serverfd, 5) == 0); | ||
83 | |||
84 | ip = inet_ntoa(mapped_addr_in->sin_addr); | ||
85 | printf("Client connecting to: %s:%d\n", | ||
86 | ip, ntohs(mapped_addr_in->sin_port)); | ||
87 | assert(connect(clientfd, &mapped_addr, sizeof(mapped_addr)) == 0); | ||
88 | |||
89 | sockaddr_len = sizeof(tmp_addr); | ||
90 | assert((serverconnfd = accept(serverfd, &tmp_addr, &sockaddr_len)) > 0); | ||
91 | ip = inet_ntoa(tmp_addr_in->sin_addr); | ||
92 | printf("Server received connection from: %s:%d\n", | ||
93 | ip, ntohs(tmp_addr_in->sin_port)); | ||
94 | |||
95 | sockaddr_len = sizeof(tmp_addr); | ||
96 | assert(getpeername(clientfd, &tmp_addr, &sockaddr_len) == 0); | ||
97 | ip = inet_ntoa(tmp_addr_in->sin_addr); | ||
98 | printf("Client's peer address: %s:%d\n", | ||
99 | ip, ntohs(tmp_addr_in->sin_port)); | ||
100 | |||
101 | /* Check that the server's getsockname() matches the client's getpeername() */ | ||
102 | assert(memcmp(&serv_addr, &tmp_addr, sizeof(struct sockaddr_in)) == 0); | ||
103 | |||
104 | cleanup: | ||
105 | bpf_link__destroy(link); | ||
106 | bpf_object__close(obj); | ||
107 | return 0; | ||
108 | } | ||
diff --git a/samples/bpf/trace_common.h b/samples/bpf/trace_common.h new file mode 100644 index 000000000..8cb5400ae --- /dev/null +++ b/samples/bpf/trace_common.h | |||
@@ -0,0 +1,13 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #ifndef __TRACE_COMMON_H | ||
3 | #define __TRACE_COMMON_H | ||
4 | |||
5 | #ifdef __x86_64__ | ||
6 | #define SYSCALL(SYS) "__x64_" __stringify(SYS) | ||
7 | #elif defined(__s390x__) | ||
8 | #define SYSCALL(SYS) "__s390x_" __stringify(SYS) | ||
9 | #else | ||
10 | #define SYSCALL(SYS) __stringify(SYS) | ||
11 | #endif | ||
12 | |||
13 | #endif | ||
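A concrete illustration of what the SYSCALL() wrapper buys the samples that include trace_common.h (this expansion is hand-derived, assuming __stringify() is available from the kernel headers the samples already pull in):

    /* With SYS = sys_write, SEC("kprobe/" SYSCALL(sys_write)) becomes:      */
    /*   x86_64 : SEC("kprobe/__x64_sys_write")   -- syscall wrapper prefix  */
    /*   s390x  : SEC("kprobe/__s390x_sys_write")                            */
    /*   others : SEC("kprobe/sys_write")                                    */

trace_output_kern.c and tracex2_kern.c below use exactly this pattern to attach to the per-architecture sys_write entry point.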
diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c new file mode 100644 index 000000000..7d3c66fb3 --- /dev/null +++ b/samples/bpf/trace_event_kern.c | |||
@@ -0,0 +1,80 @@ | |||
1 | /* Copyright (c) 2016 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <linux/ptrace.h> | ||
8 | #include <linux/version.h> | ||
9 | #include <uapi/linux/bpf.h> | ||
10 | #include <uapi/linux/bpf_perf_event.h> | ||
11 | #include <uapi/linux/perf_event.h> | ||
12 | #include <bpf/bpf_helpers.h> | ||
13 | #include <bpf/bpf_tracing.h> | ||
14 | |||
15 | struct key_t { | ||
16 | char comm[TASK_COMM_LEN]; | ||
17 | u32 kernstack; | ||
18 | u32 userstack; | ||
19 | }; | ||
20 | |||
21 | struct { | ||
22 | __uint(type, BPF_MAP_TYPE_HASH); | ||
23 | __type(key, struct key_t); | ||
24 | __type(value, u64); | ||
25 | __uint(max_entries, 10000); | ||
26 | } counts SEC(".maps"); | ||
27 | |||
28 | struct { | ||
29 | __uint(type, BPF_MAP_TYPE_STACK_TRACE); | ||
30 | __uint(key_size, sizeof(u32)); | ||
31 | __uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64)); | ||
32 | __uint(max_entries, 10000); | ||
33 | } stackmap SEC(".maps"); | ||
34 | |||
35 | #define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP) | ||
36 | #define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK) | ||
37 | |||
38 | SEC("perf_event") | ||
39 | int bpf_prog1(struct bpf_perf_event_data *ctx) | ||
40 | { | ||
41 | char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu"; | ||
42 | char time_fmt2[] = "Get Time Failed, ErrCode: %d"; | ||
43 | char addr_fmt[] = "Address recorded on event: %llx"; | ||
44 | char fmt[] = "CPU-%d period %lld ip %llx"; | ||
45 | u32 cpu = bpf_get_smp_processor_id(); | ||
46 | struct bpf_perf_event_value value_buf; | ||
47 | struct key_t key; | ||
48 | u64 *val, one = 1; | ||
49 | int ret; | ||
50 | |||
51 | if (ctx->sample_period < 10000) | ||
52 | /* ignore warmup */ | ||
53 | return 0; | ||
54 | bpf_get_current_comm(&key.comm, sizeof(key.comm)); | ||
55 | key.kernstack = bpf_get_stackid(ctx, &stackmap, KERN_STACKID_FLAGS); | ||
56 | key.userstack = bpf_get_stackid(ctx, &stackmap, USER_STACKID_FLAGS); | ||
57 | if ((int)key.kernstack < 0 && (int)key.userstack < 0) { | ||
58 | bpf_trace_printk(fmt, sizeof(fmt), cpu, ctx->sample_period, | ||
59 | PT_REGS_IP(&ctx->regs)); | ||
60 | return 0; | ||
61 | } | ||
62 | |||
63 | ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value)); | ||
64 | if (!ret) | ||
65 | bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, value_buf.running); | ||
66 | else | ||
67 | bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret); | ||
68 | |||
69 | if (ctx->addr != 0) | ||
70 | bpf_trace_printk(addr_fmt, sizeof(addr_fmt), ctx->addr); | ||
71 | |||
72 | val = bpf_map_lookup_elem(&counts, &key); | ||
73 | if (val) | ||
74 | (*val)++; | ||
75 | else | ||
76 | bpf_map_update_elem(&counts, &key, &one, BPF_NOEXIST); | ||
77 | return 0; | ||
78 | } | ||
79 | |||
80 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c new file mode 100644 index 000000000..ac1ba3681 --- /dev/null +++ b/samples/bpf/trace_event_user.c | |||
@@ -0,0 +1,354 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2016 Facebook | ||
3 | */ | ||
4 | #include <stdio.h> | ||
5 | #include <unistd.h> | ||
6 | #include <stdlib.h> | ||
7 | #include <stdbool.h> | ||
8 | #include <string.h> | ||
9 | #include <linux/perf_event.h> | ||
10 | #include <linux/bpf.h> | ||
11 | #include <signal.h> | ||
12 | #include <errno.h> | ||
13 | #include <sys/resource.h> | ||
14 | #include <bpf/bpf.h> | ||
15 | #include <bpf/libbpf.h> | ||
16 | #include "perf-sys.h" | ||
17 | #include "trace_helpers.h" | ||
18 | |||
19 | #define SAMPLE_FREQ 50 | ||
20 | |||
21 | static int pid; | ||
22 | /* counts, stackmap */ | ||
23 | static int map_fd[2]; | ||
24 | struct bpf_program *prog; | ||
25 | static bool sys_read_seen, sys_write_seen; | ||
26 | |||
27 | static void print_ksym(__u64 addr) | ||
28 | { | ||
29 | struct ksym *sym; | ||
30 | |||
31 | if (!addr) | ||
32 | return; | ||
33 | sym = ksym_search(addr); | ||
34 | if (!sym) { | ||
35 | printf("ksym not found. Is kallsyms loaded?\n"); | ||
36 | return; | ||
37 | } | ||
38 | |||
39 | printf("%s;", sym->name); | ||
40 | if (strstr(sym->name, "sys_read")) | ||
41 | sys_read_seen = true; | ||
42 | else if (strstr(sym->name, "sys_write")) | ||
43 | sys_write_seen = true; | ||
44 | } | ||
45 | |||
46 | static void print_addr(__u64 addr) | ||
47 | { | ||
48 | if (!addr) | ||
49 | return; | ||
50 | printf("%llx;", addr); | ||
51 | } | ||
52 | |||
53 | #define TASK_COMM_LEN 16 | ||
54 | |||
55 | struct key_t { | ||
56 | char comm[TASK_COMM_LEN]; | ||
57 | __u32 kernstack; | ||
58 | __u32 userstack; | ||
59 | }; | ||
60 | |||
61 | static void print_stack(struct key_t *key, __u64 count) | ||
62 | { | ||
63 | __u64 ip[PERF_MAX_STACK_DEPTH] = {}; | ||
64 | static bool warned; | ||
65 | int i; | ||
66 | |||
67 | printf("%3lld %s;", count, key->comm); | ||
68 | if (bpf_map_lookup_elem(map_fd[1], &key->kernstack, ip) != 0) { | ||
69 | printf("---;"); | ||
70 | } else { | ||
71 | for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--) | ||
72 | print_ksym(ip[i]); | ||
73 | } | ||
74 | printf("-;"); | ||
75 | if (bpf_map_lookup_elem(map_fd[1], &key->userstack, ip) != 0) { | ||
76 | printf("---;"); | ||
77 | } else { | ||
78 | for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--) | ||
79 | print_addr(ip[i]); | ||
80 | } | ||
81 | if (count < 6) | ||
82 | printf("\r"); | ||
83 | else | ||
84 | printf("\n"); | ||
85 | |||
86 | if (key->kernstack == -EEXIST && !warned) { | ||
87 | printf("stackmap collisions seen. Consider increasing size\n"); | ||
88 | warned = true; | ||
89 | } else if ((int)key->kernstack < 0 && (int)key->userstack < 0) { | ||
90 | printf("err stackid %d %d\n", key->kernstack, key->userstack); | ||
91 | } | ||
92 | } | ||
93 | |||
94 | static void err_exit(int err) | ||
95 | { | ||
96 | kill(pid, SIGKILL); | ||
97 | exit(err); | ||
98 | } | ||
99 | |||
100 | static void print_stacks(void) | ||
101 | { | ||
102 | struct key_t key = {}, next_key; | ||
103 | __u64 value; | ||
104 | __u32 stackid = 0, next_id; | ||
105 | int error = 1, fd = map_fd[0], stack_map = map_fd[1]; | ||
106 | |||
107 | sys_read_seen = sys_write_seen = false; | ||
108 | while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { | ||
109 | bpf_map_lookup_elem(fd, &next_key, &value); | ||
110 | print_stack(&next_key, value); | ||
111 | bpf_map_delete_elem(fd, &next_key); | ||
112 | key = next_key; | ||
113 | } | ||
114 | printf("\n"); | ||
115 | if (!sys_read_seen || !sys_write_seen) { | ||
116 | printf("BUG kernel stack doesn't contain sys_read() and sys_write()\n"); | ||
117 | err_exit(error); | ||
118 | } | ||
119 | |||
120 | /* clear stack map */ | ||
121 | while (bpf_map_get_next_key(stack_map, &stackid, &next_id) == 0) { | ||
122 | bpf_map_delete_elem(stack_map, &next_id); | ||
123 | stackid = next_id; | ||
124 | } | ||
125 | } | ||
126 | |||
127 | static inline int generate_load(void) | ||
128 | { | ||
129 | if (system("dd if=/dev/zero of=/dev/null count=5000k status=none") < 0) { | ||
130 | printf("failed to generate some load with dd: %s\n", strerror(errno)); | ||
131 | return -1; | ||
132 | } | ||
133 | |||
134 | return 0; | ||
135 | } | ||
136 | |||
137 | static void test_perf_event_all_cpu(struct perf_event_attr *attr) | ||
138 | { | ||
139 | int nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); | ||
140 | struct bpf_link **links = calloc(nr_cpus, sizeof(struct bpf_link *)); | ||
141 | int i, pmu_fd, error = 1; | ||
142 | |||
143 | if (!links) { | ||
144 | printf("malloc of links failed\n"); | ||
145 | goto err; | ||
146 | } | ||
147 | |||
148 | /* system wide perf event, no need to inherit */ | ||
149 | attr->inherit = 0; | ||
150 | |||
151 | /* open perf_event on all cpus */ | ||
152 | for (i = 0; i < nr_cpus; i++) { | ||
153 | pmu_fd = sys_perf_event_open(attr, -1, i, -1, 0); | ||
154 | if (pmu_fd < 0) { | ||
155 | printf("sys_perf_event_open failed\n"); | ||
156 | goto all_cpu_err; | ||
157 | } | ||
158 | links[i] = bpf_program__attach_perf_event(prog, pmu_fd); | ||
159 | if (libbpf_get_error(links[i])) { | ||
160 | printf("bpf_program__attach_perf_event failed\n"); | ||
161 | links[i] = NULL; | ||
162 | close(pmu_fd); | ||
163 | goto all_cpu_err; | ||
164 | } | ||
165 | } | ||
166 | |||
167 | if (generate_load() < 0) | ||
168 | goto all_cpu_err; | ||
169 | |||
170 | print_stacks(); | ||
171 | error = 0; | ||
172 | all_cpu_err: | ||
173 | for (i--; i >= 0; i--) | ||
174 | bpf_link__destroy(links[i]); | ||
175 | err: | ||
176 | free(links); | ||
177 | if (error) | ||
178 | err_exit(error); | ||
179 | } | ||
180 | |||
181 | static void test_perf_event_task(struct perf_event_attr *attr) | ||
182 | { | ||
183 | struct bpf_link *link = NULL; | ||
184 | int pmu_fd, error = 1; | ||
185 | |||
186 | /* per task perf event, enable inherit so the "dd ..." command can be traced properly. | ||
187 | * Enabling inherit will cause bpf_perf_prog_read_time helper failure. | ||
188 | */ | ||
189 | attr->inherit = 1; | ||
190 | |||
191 | /* open task bound event */ | ||
192 | pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0); | ||
193 | if (pmu_fd < 0) { | ||
194 | printf("sys_perf_event_open failed\n"); | ||
195 | goto err; | ||
196 | } | ||
197 | link = bpf_program__attach_perf_event(prog, pmu_fd); | ||
198 | if (libbpf_get_error(link)) { | ||
199 | printf("bpf_program__attach_perf_event failed\n"); | ||
200 | link = NULL; | ||
201 | close(pmu_fd); | ||
202 | goto err; | ||
203 | } | ||
204 | |||
205 | if (generate_load() < 0) | ||
206 | goto err; | ||
207 | |||
208 | print_stacks(); | ||
209 | error = 0; | ||
210 | err: | ||
211 | bpf_link__destroy(link); | ||
212 | if (error) | ||
213 | err_exit(error); | ||
214 | } | ||
215 | |||
216 | static void test_bpf_perf_event(void) | ||
217 | { | ||
218 | struct perf_event_attr attr_type_hw = { | ||
219 | .sample_freq = SAMPLE_FREQ, | ||
220 | .freq = 1, | ||
221 | .type = PERF_TYPE_HARDWARE, | ||
222 | .config = PERF_COUNT_HW_CPU_CYCLES, | ||
223 | }; | ||
224 | struct perf_event_attr attr_type_sw = { | ||
225 | .sample_freq = SAMPLE_FREQ, | ||
226 | .freq = 1, | ||
227 | .type = PERF_TYPE_SOFTWARE, | ||
228 | .config = PERF_COUNT_SW_CPU_CLOCK, | ||
229 | }; | ||
230 | struct perf_event_attr attr_hw_cache_l1d = { | ||
231 | .sample_freq = SAMPLE_FREQ, | ||
232 | .freq = 1, | ||
233 | .type = PERF_TYPE_HW_CACHE, | ||
234 | .config = | ||
235 | PERF_COUNT_HW_CACHE_L1D | | ||
236 | (PERF_COUNT_HW_CACHE_OP_READ << 8) | | ||
237 | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), | ||
238 | }; | ||
239 | struct perf_event_attr attr_hw_cache_branch_miss = { | ||
240 | .sample_freq = SAMPLE_FREQ, | ||
241 | .freq = 1, | ||
242 | .type = PERF_TYPE_HW_CACHE, | ||
243 | .config = | ||
244 | PERF_COUNT_HW_CACHE_BPU | | ||
245 | (PERF_COUNT_HW_CACHE_OP_READ << 8) | | ||
246 | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), | ||
247 | }; | ||
248 | struct perf_event_attr attr_type_raw = { | ||
249 | .sample_freq = SAMPLE_FREQ, | ||
250 | .freq = 1, | ||
251 | .type = PERF_TYPE_RAW, | ||
252 | /* Intel Instruction Retired */ | ||
253 | .config = 0xc0, | ||
254 | }; | ||
255 | struct perf_event_attr attr_type_raw_lock_load = { | ||
256 | .sample_freq = SAMPLE_FREQ, | ||
257 | .freq = 1, | ||
258 | .type = PERF_TYPE_RAW, | ||
259 | /* Intel MEM_UOPS_RETIRED.LOCK_LOADS */ | ||
260 | .config = 0x21d0, | ||
261 | /* Request to record lock address from PEBS */ | ||
262 | .sample_type = PERF_SAMPLE_ADDR, | ||
263 | /* Record address value requires precise event */ | ||
264 | .precise_ip = 2, | ||
265 | }; | ||
266 | |||
267 | printf("Test HW_CPU_CYCLES\n"); | ||
268 | test_perf_event_all_cpu(&attr_type_hw); | ||
269 | test_perf_event_task(&attr_type_hw); | ||
270 | |||
271 | printf("Test SW_CPU_CLOCK\n"); | ||
272 | test_perf_event_all_cpu(&attr_type_sw); | ||
273 | test_perf_event_task(&attr_type_sw); | ||
274 | |||
275 | printf("Test HW_CACHE_L1D\n"); | ||
276 | test_perf_event_all_cpu(&attr_hw_cache_l1d); | ||
277 | test_perf_event_task(&attr_hw_cache_l1d); | ||
278 | |||
279 | printf("Test HW_CACHE_BPU\n"); | ||
280 | test_perf_event_all_cpu(&attr_hw_cache_branch_miss); | ||
281 | test_perf_event_task(&attr_hw_cache_branch_miss); | ||
282 | |||
283 | printf("Test Instruction Retired\n"); | ||
284 | test_perf_event_all_cpu(&attr_type_raw); | ||
285 | test_perf_event_task(&attr_type_raw); | ||
286 | |||
287 | printf("Test Lock Load\n"); | ||
288 | test_perf_event_all_cpu(&attr_type_raw_lock_load); | ||
289 | test_perf_event_task(&attr_type_raw_lock_load); | ||
290 | |||
291 | printf("*** PASS ***\n"); | ||
292 | } | ||
293 | |||
294 | |||
295 | int main(int argc, char **argv) | ||
296 | { | ||
297 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
298 | struct bpf_object *obj = NULL; | ||
299 | char filename[256]; | ||
300 | int error = 1; | ||
301 | |||
302 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
303 | setrlimit(RLIMIT_MEMLOCK, &r); | ||
304 | |||
305 | signal(SIGINT, err_exit); | ||
306 | signal(SIGTERM, err_exit); | ||
307 | |||
308 | if (load_kallsyms()) { | ||
309 | printf("failed to process /proc/kallsyms\n"); | ||
310 | goto cleanup; | ||
311 | } | ||
312 | |||
313 | obj = bpf_object__open_file(filename, NULL); | ||
314 | if (libbpf_get_error(obj)) { | ||
315 | printf("opening BPF object file failed\n"); | ||
316 | obj = NULL; | ||
317 | goto cleanup; | ||
318 | } | ||
319 | |||
320 | prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); | ||
321 | if (!prog) { | ||
322 | printf("finding a prog in obj file failed\n"); | ||
323 | goto cleanup; | ||
324 | } | ||
325 | |||
326 | /* load BPF program */ | ||
327 | if (bpf_object__load(obj)) { | ||
328 | printf("loading BPF object file failed\n"); | ||
329 | goto cleanup; | ||
330 | } | ||
331 | |||
332 | map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counts"); | ||
333 | map_fd[1] = bpf_object__find_map_fd_by_name(obj, "stackmap"); | ||
334 | if (map_fd[0] < 0 || map_fd[1] < 0) { | ||
335 | printf("finding a counts/stackmap map in obj file failed\n"); | ||
336 | goto cleanup; | ||
337 | } | ||
338 | |||
339 | pid = fork(); | ||
340 | if (pid == 0) { | ||
341 | read_trace_pipe(); | ||
342 | return 0; | ||
343 | } else if (pid == -1) { | ||
344 | printf("couldn't spawn process\n"); | ||
345 | goto cleanup; | ||
346 | } | ||
347 | |||
348 | test_bpf_perf_event(); | ||
349 | error = 0; | ||
350 | |||
351 | cleanup: | ||
352 | bpf_object__close(obj); | ||
353 | err_exit(error); | ||
354 | } | ||
diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output_kern.c new file mode 100644 index 000000000..b64815af0 --- /dev/null +++ b/samples/bpf/trace_output_kern.c | |||
@@ -0,0 +1,31 @@ | |||
1 | #include <linux/ptrace.h> | ||
2 | #include <linux/version.h> | ||
3 | #include <uapi/linux/bpf.h> | ||
4 | #include <bpf/bpf_helpers.h> | ||
5 | #include "trace_common.h" | ||
6 | |||
7 | struct { | ||
8 | __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); | ||
9 | __uint(key_size, sizeof(int)); | ||
10 | __uint(value_size, sizeof(u32)); | ||
11 | __uint(max_entries, 2); | ||
12 | } my_map SEC(".maps"); | ||
13 | |||
14 | SEC("kprobe/" SYSCALL(sys_write)) | ||
15 | int bpf_prog1(struct pt_regs *ctx) | ||
16 | { | ||
17 | struct S { | ||
18 | u64 pid; | ||
19 | u64 cookie; | ||
20 | } data; | ||
21 | |||
22 | data.pid = bpf_get_current_pid_tgid(); | ||
23 | data.cookie = 0x12345678; | ||
24 | |||
25 | bpf_perf_event_output(ctx, &my_map, 0, &data, sizeof(data)); | ||
26 | |||
27 | return 0; | ||
28 | } | ||
29 | |||
30 | char _license[] SEC("license") = "GPL"; | ||
31 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
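The third argument of bpf_perf_event_output() picks the PERF_EVENT_ARRAY slot that receives the record; the sample above always submits to index 0. A common variant, shown only as a hedged sketch of the alternative, is BPF_F_CURRENT_CPU, which routes each record to the ring of whichever CPU the program runs on:

    /* variant, not what the sample does: per-CPU ring selection */
    bpf_perf_event_output(ctx, &my_map, BPF_F_CURRENT_CPU, &data, sizeof(data));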
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c new file mode 100644 index 000000000..364b98764 --- /dev/null +++ b/samples/bpf/trace_output_user.c | |||
@@ -0,0 +1,107 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | #include <stdio.h> | ||
3 | #include <fcntl.h> | ||
4 | #include <poll.h> | ||
5 | #include <time.h> | ||
6 | #include <signal.h> | ||
7 | #include <bpf/libbpf.h> | ||
8 | |||
9 | static __u64 time_get_ns(void) | ||
10 | { | ||
11 | struct timespec ts; | ||
12 | |||
13 | clock_gettime(CLOCK_MONOTONIC, &ts); | ||
14 | return ts.tv_sec * 1000000000ull + ts.tv_nsec; | ||
15 | } | ||
16 | |||
17 | static __u64 start_time; | ||
18 | static __u64 cnt; | ||
19 | |||
20 | #define MAX_CNT 100000ll | ||
21 | |||
22 | static void print_bpf_output(void *ctx, int cpu, void *data, __u32 size) | ||
23 | { | ||
24 | struct { | ||
25 | __u64 pid; | ||
26 | __u64 cookie; | ||
27 | } *e = data; | ||
28 | |||
29 | if (e->cookie != 0x12345678) { | ||
30 | printf("BUG pid %llx cookie %llx sized %d\n", | ||
31 | e->pid, e->cookie, size); | ||
32 | return; | ||
33 | } | ||
34 | |||
35 | cnt++; | ||
36 | |||
37 | if (cnt == MAX_CNT) { | ||
38 | printf("recv %lld events per sec\n", | ||
39 | MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); | ||
40 | return; | ||
41 | } | ||
42 | } | ||
43 | |||
44 | int main(int argc, char **argv) | ||
45 | { | ||
46 | struct perf_buffer_opts pb_opts = {}; | ||
47 | struct bpf_link *link = NULL; | ||
48 | struct bpf_program *prog; | ||
49 | struct perf_buffer *pb; | ||
50 | struct bpf_object *obj; | ||
51 | int map_fd, ret = 0; | ||
52 | char filename[256]; | ||
53 | FILE *f; | ||
54 | |||
55 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
56 | obj = bpf_object__open_file(filename, NULL); | ||
57 | if (libbpf_get_error(obj)) { | ||
58 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
59 | return 0; | ||
60 | } | ||
61 | |||
62 | /* load BPF program */ | ||
63 | if (bpf_object__load(obj)) { | ||
64 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
65 | goto cleanup; | ||
66 | } | ||
67 | |||
68 | map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); | ||
69 | if (map_fd < 0) { | ||
70 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
71 | goto cleanup; | ||
72 | } | ||
73 | |||
74 | prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); | ||
75 | if (libbpf_get_error(prog)) { | ||
76 | fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); | ||
77 | goto cleanup; | ||
78 | } | ||
79 | |||
80 | link = bpf_program__attach(prog); | ||
81 | if (libbpf_get_error(link)) { | ||
82 | fprintf(stderr, "ERROR: bpf_program__attach failed\n"); | ||
83 | link = NULL; | ||
84 | goto cleanup; | ||
85 | } | ||
86 | |||
87 | pb_opts.sample_cb = print_bpf_output; | ||
88 | pb = perf_buffer__new(map_fd, 8, &pb_opts); | ||
89 | ret = libbpf_get_error(pb); | ||
90 | if (ret) { | ||
91 | printf("failed to setup perf_buffer: %d\n", ret); | ||
92 | return 1; | ||
93 | } | ||
94 | |||
95 | f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r"); | ||
96 | (void) f; | ||
97 | |||
98 | start_time = time_get_ns(); | ||
99 | while ((ret = perf_buffer__poll(pb, 1000)) >= 0 && cnt < MAX_CNT) { | ||
100 | } | ||
101 | kill(0, SIGINT); | ||
102 | |||
103 | cleanup: | ||
104 | bpf_link__destroy(link); | ||
105 | bpf_object__close(obj); | ||
106 | return ret; | ||
107 | } | ||
diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c new file mode 100644 index 000000000..ef30d2b35 --- /dev/null +++ b/samples/bpf/tracex1_kern.c | |||
@@ -0,0 +1,54 @@ | |||
1 | /* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <linux/skbuff.h> | ||
8 | #include <linux/netdevice.h> | ||
9 | #include <uapi/linux/bpf.h> | ||
10 | #include <linux/version.h> | ||
11 | #include <bpf/bpf_helpers.h> | ||
12 | #include <bpf/bpf_tracing.h> | ||
13 | |||
14 | #define _(P) \ | ||
15 | ({ \ | ||
16 | typeof(P) val = 0; \ | ||
17 | bpf_probe_read_kernel(&val, sizeof(val), &(P)); \ | ||
18 | val; \ | ||
19 | }) | ||
20 | |||
21 | /* kprobe is NOT a stable ABI | ||
22 | * kernel functions can be removed, renamed or completely change semantics. | ||
23 | * Number of arguments and their positions can change, etc. | ||
24 | * In such cases this bpf+kprobe example will no longer be meaningful | ||
25 | */ | ||
26 | SEC("kprobe/__netif_receive_skb_core") | ||
27 | int bpf_prog1(struct pt_regs *ctx) | ||
28 | { | ||
29 | /* attaches to kprobe __netif_receive_skb_core, | ||
30 | * looks for packets on loobpack device and prints them | ||
31 | */ | ||
32 | char devname[IFNAMSIZ]; | ||
33 | struct net_device *dev; | ||
34 | struct sk_buff *skb; | ||
35 | int len; | ||
36 | |||
37 | /* non-portable! works for the given kernel only */ | ||
38 | bpf_probe_read_kernel(&skb, sizeof(skb), (void *)PT_REGS_PARM1(ctx)); | ||
39 | dev = _(skb->dev); | ||
40 | len = _(skb->len); | ||
41 | |||
42 | bpf_probe_read_kernel(devname, sizeof(devname), dev->name); | ||
43 | |||
44 | if (devname[0] == 'l' && devname[1] == 'o') { | ||
45 | char fmt[] = "skb %p len %d\n"; | ||
46 | /* using bpf_trace_printk() for DEBUG ONLY */ | ||
47 | bpf_trace_printk(fmt, sizeof(fmt), skb, len); | ||
48 | } | ||
49 | |||
50 | return 0; | ||
51 | } | ||
52 | |||
53 | char _license[] SEC("license") = "GPL"; | ||
54 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c new file mode 100644 index 000000000..9d4adb7fd --- /dev/null +++ b/samples/bpf/tracex1_user.c | |||
@@ -0,0 +1,50 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <stdio.h> | ||
3 | #include <unistd.h> | ||
4 | #include <bpf/libbpf.h> | ||
5 | #include "trace_helpers.h" | ||
6 | |||
7 | int main(int ac, char **argv) | ||
8 | { | ||
9 | struct bpf_link *link = NULL; | ||
10 | struct bpf_program *prog; | ||
11 | struct bpf_object *obj; | ||
12 | char filename[256]; | ||
13 | FILE *f; | ||
14 | |||
15 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
16 | obj = bpf_object__open_file(filename, NULL); | ||
17 | if (libbpf_get_error(obj)) { | ||
18 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
19 | return 0; | ||
20 | } | ||
21 | |||
22 | prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); | ||
23 | if (!prog) { | ||
24 | fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); | ||
25 | goto cleanup; | ||
26 | } | ||
27 | |||
28 | /* load BPF program */ | ||
29 | if (bpf_object__load(obj)) { | ||
30 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
31 | goto cleanup; | ||
32 | } | ||
33 | |||
34 | link = bpf_program__attach(prog); | ||
35 | if (libbpf_get_error(link)) { | ||
36 | fprintf(stderr, "ERROR: bpf_program__attach failed\n"); | ||
37 | link = NULL; | ||
38 | goto cleanup; | ||
39 | } | ||
40 | |||
41 | f = popen("taskset 1 ping -c5 localhost", "r"); | ||
42 | (void) f; | ||
43 | |||
44 | read_trace_pipe(); | ||
45 | |||
46 | cleanup: | ||
47 | bpf_link__destroy(link); | ||
48 | bpf_object__close(obj); | ||
49 | return 0; | ||
50 | } | ||
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c new file mode 100644 index 000000000..5bc696bac --- /dev/null +++ b/samples/bpf/tracex2_kern.c | |||
@@ -0,0 +1,102 @@ | |||
1 | /* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <linux/skbuff.h> | ||
8 | #include <linux/netdevice.h> | ||
9 | #include <linux/version.h> | ||
10 | #include <uapi/linux/bpf.h> | ||
11 | #include <bpf/bpf_helpers.h> | ||
12 | #include <bpf/bpf_tracing.h> | ||
13 | #include "trace_common.h" | ||
14 | |||
15 | struct { | ||
16 | __uint(type, BPF_MAP_TYPE_HASH); | ||
17 | __type(key, long); | ||
18 | __type(value, long); | ||
19 | __uint(max_entries, 1024); | ||
20 | } my_map SEC(".maps"); | ||
21 | |||
22 | /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe | ||
23 | * example will no longer be meaningful | ||
24 | */ | ||
25 | SEC("kprobe/kfree_skb") | ||
26 | int bpf_prog2(struct pt_regs *ctx) | ||
27 | { | ||
28 | long loc = 0; | ||
29 | long init_val = 1; | ||
30 | long *value; | ||
31 | |||
32 | /* read ip of kfree_skb caller. | ||
33 | * non-portable version of __builtin_return_address(0) | ||
34 | */ | ||
35 | BPF_KPROBE_READ_RET_IP(loc, ctx); | ||
36 | |||
37 | value = bpf_map_lookup_elem(&my_map, &loc); | ||
38 | if (value) | ||
39 | *value += 1; | ||
40 | else | ||
41 | bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY); | ||
42 | return 0; | ||
43 | } | ||
44 | |||
45 | static unsigned int log2(unsigned int v) | ||
46 | { | ||
47 | unsigned int r; | ||
48 | unsigned int shift; | ||
49 | |||
50 | r = (v > 0xFFFF) << 4; v >>= r; | ||
51 | shift = (v > 0xFF) << 3; v >>= shift; r |= shift; | ||
52 | shift = (v > 0xF) << 2; v >>= shift; r |= shift; | ||
53 | shift = (v > 0x3) << 1; v >>= shift; r |= shift; | ||
54 | r |= (v >> 1); | ||
55 | return r; | ||
56 | } | ||
57 | |||
58 | static unsigned int log2l(unsigned long v) | ||
59 | { | ||
60 | unsigned int hi = v >> 32; | ||
61 | if (hi) | ||
62 | return log2(hi) + 32; | ||
63 | else | ||
64 | return log2(v); | ||
65 | } | ||
66 | |||
67 | struct hist_key { | ||
68 | char comm[16]; | ||
69 | u64 pid_tgid; | ||
70 | u64 uid_gid; | ||
71 | u64 index; | ||
72 | }; | ||
73 | |||
74 | struct { | ||
75 | __uint(type, BPF_MAP_TYPE_PERCPU_HASH); | ||
76 | __uint(key_size, sizeof(struct hist_key)); | ||
77 | __uint(value_size, sizeof(long)); | ||
78 | __uint(max_entries, 1024); | ||
79 | } my_hist_map SEC(".maps"); | ||
80 | |||
81 | SEC("kprobe/" SYSCALL(sys_write)) | ||
82 | int bpf_prog3(struct pt_regs *ctx) | ||
83 | { | ||
84 | long write_size = PT_REGS_PARM3(ctx); | ||
85 | long init_val = 1; | ||
86 | long *value; | ||
87 | struct hist_key key; | ||
88 | |||
89 | key.index = log2l(write_size); | ||
90 | key.pid_tgid = bpf_get_current_pid_tgid(); | ||
91 | key.uid_gid = bpf_get_current_uid_gid(); | ||
92 | bpf_get_current_comm(&key.comm, sizeof(key.comm)); | ||
93 | |||
94 | value = bpf_map_lookup_elem(&my_hist_map, &key); | ||
95 | if (value) | ||
96 | __sync_fetch_and_add(value, 1); | ||
97 | else | ||
98 | bpf_map_update_elem(&my_hist_map, &key, &init_val, BPF_ANY); | ||
99 | return 0; | ||
100 | } | ||
101 | char _license[] SEC("license") = "GPL"; | ||
102 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
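log2()/log2l() above compute the power-of-two bucket index that bpf_prog3() uses as key.index. A hand-derived worked example (treat it as a sketch) for a 4096-byte write:

    /* log2(4096):
     *   v = 4096, r = 0
     *   v > 0xFFFF ? no                          -> r stays 0
     *   v > 0xFF   ? yes: shift = 8, v = 16      -> r = 8
     *   v > 0xF    ? yes: shift = 4, v = 1       -> r = 12
     *   v > 0x3    ? no
     *   r |= v >> 1 = 0                          -> r = 12
     * so a 4096-byte write() increments my_hist_map bucket index 12.
     */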
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c new file mode 100644 index 000000000..3d6eab711 --- /dev/null +++ b/samples/bpf/tracex2_user.c | |||
@@ -0,0 +1,193 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <stdio.h> | ||
3 | #include <unistd.h> | ||
4 | #include <stdlib.h> | ||
5 | #include <signal.h> | ||
6 | #include <string.h> | ||
7 | #include <sys/resource.h> | ||
8 | |||
9 | #include <bpf/bpf.h> | ||
10 | #include <bpf/libbpf.h> | ||
11 | #include "bpf_util.h" | ||
12 | |||
13 | #define MAX_INDEX 64 | ||
14 | #define MAX_STARS 38 | ||
15 | |||
16 | /* my_map, my_hist_map */ | ||
17 | static int map_fd[2]; | ||
18 | |||
19 | static void stars(char *str, long val, long max, int width) | ||
20 | { | ||
21 | int i; | ||
22 | |||
23 | for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++) | ||
24 | str[i] = '*'; | ||
25 | if (val > max) | ||
26 | str[i - 1] = '+'; | ||
27 | str[i] = '\0'; | ||
28 | } | ||
29 | |||
30 | struct task { | ||
31 | char comm[16]; | ||
32 | __u64 pid_tgid; | ||
33 | __u64 uid_gid; | ||
34 | }; | ||
35 | |||
36 | struct hist_key { | ||
37 | struct task t; | ||
38 | __u32 index; | ||
39 | }; | ||
40 | |||
41 | #define SIZE sizeof(struct task) | ||
42 | |||
43 | static void print_hist_for_pid(int fd, void *task) | ||
44 | { | ||
45 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
46 | struct hist_key key = {}, next_key; | ||
47 | long values[nr_cpus]; | ||
48 | char starstr[MAX_STARS]; | ||
49 | long value; | ||
50 | long data[MAX_INDEX] = {}; | ||
51 | int max_ind = -1; | ||
52 | long max_value = 0; | ||
53 | int i, ind; | ||
54 | |||
55 | while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { | ||
56 | if (memcmp(&next_key, task, SIZE)) { | ||
57 | key = next_key; | ||
58 | continue; | ||
59 | } | ||
60 | bpf_map_lookup_elem(fd, &next_key, values); | ||
61 | value = 0; | ||
62 | for (i = 0; i < nr_cpus; i++) | ||
63 | value += values[i]; | ||
64 | ind = next_key.index; | ||
65 | data[ind] = value; | ||
66 | if (value && ind > max_ind) | ||
67 | max_ind = ind; | ||
68 | if (value > max_value) | ||
69 | max_value = value; | ||
70 | key = next_key; | ||
71 | } | ||
72 | |||
73 | printf(" syscall write() stats\n"); | ||
74 | printf(" byte_size : count distribution\n"); | ||
75 | for (i = 1; i <= max_ind + 1; i++) { | ||
76 | stars(starstr, data[i - 1], max_value, MAX_STARS); | ||
77 | printf("%8ld -> %-8ld : %-8ld |%-*s|\n", | ||
78 | (1l << i) >> 1, (1l << i) - 1, data[i - 1], | ||
79 | MAX_STARS, starstr); | ||
80 | } | ||
81 | } | ||
82 | |||
83 | static void print_hist(int fd) | ||
84 | { | ||
85 | struct hist_key key = {}, next_key; | ||
86 | static struct task tasks[1024]; | ||
87 | int task_cnt = 0; | ||
88 | int i; | ||
89 | |||
90 | while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { | ||
91 | int found = 0; | ||
92 | |||
93 | for (i = 0; i < task_cnt; i++) | ||
94 | if (memcmp(&tasks[i], &next_key, SIZE) == 0) | ||
95 | found = 1; | ||
96 | if (!found) | ||
97 | memcpy(&tasks[task_cnt++], &next_key, SIZE); | ||
98 | key = next_key; | ||
99 | } | ||
100 | |||
101 | for (i = 0; i < task_cnt; i++) { | ||
102 | printf("\npid %d cmd %s uid %d\n", | ||
103 | (__u32) tasks[i].pid_tgid, | ||
104 | tasks[i].comm, | ||
105 | (__u32) tasks[i].uid_gid); | ||
106 | print_hist_for_pid(fd, &tasks[i]); | ||
107 | } | ||
108 | |||
109 | } | ||
110 | |||
111 | static void int_exit(int sig) | ||
112 | { | ||
113 | print_hist(map_fd[1]); | ||
114 | exit(0); | ||
115 | } | ||
116 | |||
117 | int main(int ac, char **argv) | ||
118 | { | ||
119 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
120 | long key, next_key, value; | ||
121 | struct bpf_link *links[2]; | ||
122 | struct bpf_program *prog; | ||
123 | struct bpf_object *obj; | ||
124 | char filename[256]; | ||
125 | int i, j = 0; | ||
126 | FILE *f; | ||
127 | |||
128 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
129 | perror("setrlimit(RLIMIT_MEMLOCK)"); | ||
130 | return 1; | ||
131 | } | ||
132 | |||
133 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
134 | obj = bpf_object__open_file(filename, NULL); | ||
135 | if (libbpf_get_error(obj)) { | ||
136 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
137 | return 0; | ||
138 | } | ||
139 | |||
140 | /* load BPF program */ | ||
141 | if (bpf_object__load(obj)) { | ||
142 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
143 | goto cleanup; | ||
144 | } | ||
145 | |||
146 | map_fd[0] = bpf_object__find_map_fd_by_name(obj, "my_map"); | ||
147 | map_fd[1] = bpf_object__find_map_fd_by_name(obj, "my_hist_map"); | ||
148 | if (map_fd[0] < 0 || map_fd[1] < 0) { | ||
149 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
150 | goto cleanup; | ||
151 | } | ||
152 | |||
153 | signal(SIGINT, int_exit); | ||
154 | signal(SIGTERM, int_exit); | ||
155 | |||
156 | /* start 'ping' in the background to have some kfree_skb events */ | ||
157 | f = popen("ping -4 -c5 localhost", "r"); | ||
158 | (void) f; | ||
159 | |||
160 | /* start 'dd' in the background to have plenty of 'write' syscalls */ | ||
161 | f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r"); | ||
162 | (void) f; | ||
163 | |||
164 | bpf_object__for_each_program(prog, obj) { | ||
165 | links[j] = bpf_program__attach(prog); | ||
166 | if (libbpf_get_error(links[j])) { | ||
167 | fprintf(stderr, "ERROR: bpf_program__attach failed\n"); | ||
168 | links[j] = NULL; | ||
169 | goto cleanup; | ||
170 | } | ||
171 | j++; | ||
172 | } | ||
173 | |||
174 | for (i = 0; i < 5; i++) { | ||
175 | key = 0; | ||
176 | while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) { | ||
177 | bpf_map_lookup_elem(map_fd[0], &next_key, &value); | ||
178 | printf("location 0x%lx count %ld\n", next_key, value); | ||
179 | key = next_key; | ||
180 | } | ||
181 | if (key) | ||
182 | printf("\n"); | ||
183 | sleep(1); | ||
184 | } | ||
185 | print_hist(map_fd[1]); | ||
186 | |||
187 | cleanup: | ||
188 | for (j--; j >= 0; j--) | ||
189 | bpf_link__destroy(links[j]); | ||
190 | |||
191 | bpf_object__close(obj); | ||
192 | return 0; | ||
193 | } | ||
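A note on the bar rendering above: stars() scales a bucket count into a fixed-width ASCII bar and appends '+' when the value overflows the scale. The stand-alone sketch below only illustrates that scaling outside the sample; the counts and the scale maximum are invented for the example.

/* Illustration only: same stars() scaling as tracex2_user.c, driven by made-up counts. */
#include <stdio.h>

#define MAX_STARS 38

static void stars(char *str, long val, long max, int width)
{
	int i;

	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
		str[i] = '*';
	if (val > max)
		str[i - 1] = '+';	/* bar saturated: mark the overflow */
	str[i] = '\0';
}

int main(void)
{
	long samples[] = { 10, 50, 100, 250 };	/* invented bucket counts */
	long max = 100;				/* invented scale maximum */
	char buf[MAX_STARS];

	for (int k = 0; k < 4; k++) {
		stars(buf, samples[k], max, MAX_STARS);
		printf("%4ld : |%-*s|\n", samples[k], MAX_STARS, buf);
	}
	return 0;
}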
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c new file mode 100644 index 000000000..710a4410b --- /dev/null +++ b/samples/bpf/tracex3_kern.c | |||
@@ -0,0 +1,90 @@ | |||
1 | /* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <linux/skbuff.h> | ||
8 | #include <linux/netdevice.h> | ||
9 | #include <linux/version.h> | ||
10 | #include <uapi/linux/bpf.h> | ||
11 | #include <bpf/bpf_helpers.h> | ||
12 | #include <bpf/bpf_tracing.h> | ||
13 | |||
14 | struct { | ||
15 | __uint(type, BPF_MAP_TYPE_HASH); | ||
16 | __type(key, long); | ||
17 | __type(value, u64); | ||
18 | __uint(max_entries, 4096); | ||
19 | } my_map SEC(".maps"); | ||
20 | |||
21 | /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe | ||
22 | * example will no longer be meaningful | ||
23 | */ | ||
24 | SEC("kprobe/blk_mq_start_request") | ||
25 | int bpf_prog1(struct pt_regs *ctx) | ||
26 | { | ||
27 | long rq = PT_REGS_PARM1(ctx); | ||
28 | u64 val = bpf_ktime_get_ns(); | ||
29 | |||
30 | bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY); | ||
31 | return 0; | ||
32 | } | ||
33 | |||
34 | static unsigned int log2l(unsigned long long n) | ||
35 | { | ||
36 | #define S(k) if (n >= (1ull << k)) { i += k; n >>= k; } | ||
37 | int i = -(n == 0); | ||
38 | S(32); S(16); S(8); S(4); S(2); S(1); | ||
39 | return i; | ||
40 | #undef S | ||
41 | } | ||
42 | |||
43 | #define SLOTS 100 | ||
44 | |||
45 | struct { | ||
46 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
47 | __uint(key_size, sizeof(u32)); | ||
48 | __uint(value_size, sizeof(u64)); | ||
49 | __uint(max_entries, SLOTS); | ||
50 | } lat_map SEC(".maps"); | ||
51 | |||
52 | SEC("kprobe/blk_account_io_done") | ||
53 | int bpf_prog2(struct pt_regs *ctx) | ||
54 | { | ||
55 | long rq = PT_REGS_PARM1(ctx); | ||
56 | u64 *value, l, base; | ||
57 | u32 index; | ||
58 | |||
59 | value = bpf_map_lookup_elem(&my_map, &rq); | ||
60 | if (!value) | ||
61 | return 0; | ||
62 | |||
63 | u64 cur_time = bpf_ktime_get_ns(); | ||
64 | u64 delta = cur_time - *value; | ||
65 | |||
66 | bpf_map_delete_elem(&my_map, &rq); | ||
67 | |||
68 | /* the lines below are computing index = log10(delta)*10 | ||
69 | * using integer arithmetic | ||
70 | * index = 29 ~ 1 usec | ||
71 | * index = 59 ~ 1 msec | ||
72 | * index = 89 ~ 1 sec | ||
73 | * index = 99 ~ 10sec or more | ||
74 | * log10(x)*10 = log2(x)*10/log2(10) ~= log2(x)*3 | ||
75 | */ | ||
76 | l = log2l(delta); | ||
77 | base = 1ll << l; | ||
78 | index = (l * 64 + (delta - base) * 64 / base) * 3 / 64; | ||
79 | |||
80 | if (index >= SLOTS) | ||
81 | index = SLOTS - 1; | ||
82 | |||
83 | value = bpf_map_lookup_elem(&lat_map, &index); | ||
84 | if (value) | ||
85 | *value += 1; | ||
86 | |||
87 | return 0; | ||
88 | } | ||
89 | char _license[] SEC("license") = "GPL"; | ||
90 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
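The fixed-point bucket computation in bpf_prog2() above can be checked in plain user space. The sketch below is only an illustration (not part of the sample): it mirrors log2l() under a different name and reproduces the slot values quoted in the comment (about 29 for 1 us, 59 for 1 ms, 89 for 1 s).

/* Illustration only: user-space re-derivation of the lat_map slot index. */
#include <stdio.h>

/* same bit-chopping integer log2 as log2l() in tracex3_kern.c */
static unsigned int slot_log2(unsigned long long n)
{
#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
	int i = -(n == 0);
	S(32); S(16); S(8); S(4); S(2); S(1);
	return i;
#undef S
}

int main(void)
{
	unsigned long long deltas[] = { 1000ull, 1000000ull, 1000000000ull }; /* ~1us, ~1ms, ~1s */

	for (int k = 0; k < 3; k++) {
		unsigned long long delta = deltas[k];
		unsigned long long l = slot_log2(delta);
		unsigned long long base = 1ull << l;
		/* index = log10(delta) * 10, approximated in integer math */
		unsigned int index = (l * 64 + (delta - base) * 64 / base) * 3 / 64;

		printf("delta = %10llu ns -> slot %u\n", delta, index);
	}
	return 0;
}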
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c new file mode 100644 index 000000000..83e0fecbb --- /dev/null +++ b/samples/bpf/tracex3_user.c | |||
@@ -0,0 +1,190 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com | ||
3 | */ | ||
4 | #include <stdio.h> | ||
5 | #include <stdlib.h> | ||
6 | #include <signal.h> | ||
7 | #include <unistd.h> | ||
8 | #include <stdbool.h> | ||
9 | #include <string.h> | ||
10 | #include <sys/resource.h> | ||
11 | |||
12 | #include <bpf/bpf.h> | ||
13 | #include <bpf/libbpf.h> | ||
14 | #include "bpf_util.h" | ||
15 | |||
16 | #define SLOTS 100 | ||
17 | |||
18 | static void clear_stats(int fd) | ||
19 | { | ||
20 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
21 | __u64 values[nr_cpus]; | ||
22 | __u32 key; | ||
23 | |||
24 | memset(values, 0, sizeof(values)); | ||
25 | for (key = 0; key < SLOTS; key++) | ||
26 | bpf_map_update_elem(fd, &key, values, BPF_ANY); | ||
27 | } | ||
28 | |||
29 | const char *color[] = { | ||
30 | "\033[48;5;255m", | ||
31 | "\033[48;5;252m", | ||
32 | "\033[48;5;250m", | ||
33 | "\033[48;5;248m", | ||
34 | "\033[48;5;246m", | ||
35 | "\033[48;5;244m", | ||
36 | "\033[48;5;242m", | ||
37 | "\033[48;5;240m", | ||
38 | "\033[48;5;238m", | ||
39 | "\033[48;5;236m", | ||
40 | "\033[48;5;234m", | ||
41 | "\033[48;5;232m", | ||
42 | }; | ||
43 | const int num_colors = ARRAY_SIZE(color); | ||
44 | |||
45 | const char nocolor[] = "\033[00m"; | ||
46 | |||
47 | const char *sym[] = { | ||
48 | " ", | ||
49 | " ", | ||
50 | ".", | ||
51 | ".", | ||
52 | "*", | ||
53 | "*", | ||
54 | "o", | ||
55 | "o", | ||
56 | "O", | ||
57 | "O", | ||
58 | "#", | ||
59 | "#", | ||
60 | }; | ||
61 | |||
62 | bool full_range = false; | ||
63 | bool text_only = false; | ||
64 | |||
65 | static void print_banner(void) | ||
66 | { | ||
67 | if (full_range) | ||
68 | printf("|1ns |10ns |100ns |1us |10us |100us" | ||
69 | " |1ms |10ms |100ms |1s |10s\n"); | ||
70 | else | ||
71 | printf("|1us |10us |100us |1ms |10ms " | ||
72 | "|100ms |1s |10s\n"); | ||
73 | } | ||
74 | |||
75 | static void print_hist(int fd) | ||
76 | { | ||
77 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
78 | __u64 total_events = 0; | ||
79 | long values[nr_cpus]; | ||
80 | __u64 max_cnt = 0; | ||
81 | __u64 cnt[SLOTS]; | ||
82 | __u64 value; | ||
83 | __u32 key; | ||
84 | int i; | ||
85 | |||
86 | for (key = 0; key < SLOTS; key++) { | ||
87 | bpf_map_lookup_elem(fd, &key, values); | ||
88 | value = 0; | ||
89 | for (i = 0; i < nr_cpus; i++) | ||
90 | value += values[i]; | ||
91 | cnt[key] = value; | ||
92 | total_events += value; | ||
93 | if (value > max_cnt) | ||
94 | max_cnt = value; | ||
95 | } | ||
96 | clear_stats(fd); | ||
97 | for (key = full_range ? 0 : 29; key < SLOTS; key++) { /* slot 29 ~ 1 usec, see tracex3_kern.c */ | ||
98 | int c = num_colors * cnt[key] / (max_cnt + 1); /* map 0..max_cnt onto 0..num_colors-1 */ | ||
99 | |||
100 | if (text_only) | ||
101 | printf("%s", sym[c]); | ||
102 | else | ||
103 | printf("%s %s", color[c], nocolor); | ||
104 | } | ||
105 | printf(" # %lld\n", total_events); | ||
106 | } | ||
107 | |||
108 | int main(int ac, char **argv) | ||
109 | { | ||
110 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
111 | struct bpf_link *links[2]; | ||
112 | struct bpf_program *prog; | ||
113 | struct bpf_object *obj; | ||
114 | char filename[256]; | ||
115 | int map_fd, i, j = 0; | ||
116 | |||
117 | for (i = 1; i < ac; i++) { | ||
118 | if (strcmp(argv[i], "-a") == 0) { | ||
119 | full_range = true; | ||
120 | } else if (strcmp(argv[i], "-t") == 0) { | ||
121 | text_only = true; | ||
122 | } else if (strcmp(argv[i], "-h") == 0) { | ||
123 | printf("Usage:\n" | ||
124 | " -a display wider latency range\n" | ||
125 | " -t text only\n"); | ||
126 | return 1; | ||
127 | } | ||
128 | } | ||
129 | |||
130 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
131 | perror("setrlimit(RLIMIT_MEMLOCK)"); | ||
132 | return 1; | ||
133 | } | ||
134 | |||
135 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
136 | obj = bpf_object__open_file(filename, NULL); | ||
137 | if (libbpf_get_error(obj)) { | ||
138 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
139 | return 0; | ||
140 | } | ||
141 | |||
142 | /* load BPF program */ | ||
143 | if (bpf_object__load(obj)) { | ||
144 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
145 | goto cleanup; | ||
146 | } | ||
147 | |||
148 | map_fd = bpf_object__find_map_fd_by_name(obj, "lat_map"); | ||
149 | if (map_fd < 0) { | ||
150 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
151 | goto cleanup; | ||
152 | } | ||
153 | |||
154 | bpf_object__for_each_program(prog, obj) { | ||
155 | links[j] = bpf_program__attach(prog); | ||
156 | if (libbpf_get_error(links[j])) { | ||
157 | fprintf(stderr, "ERROR: bpf_program__attach failed\n"); | ||
158 | links[j] = NULL; | ||
159 | goto cleanup; | ||
160 | } | ||
161 | j++; | ||
162 | } | ||
163 | |||
164 | printf(" heatmap of IO latency\n"); | ||
165 | if (text_only) | ||
166 | printf(" %s", sym[num_colors - 1]); | ||
167 | else | ||
168 | printf(" %s %s", color[num_colors - 1], nocolor); | ||
169 | printf(" - many events with this latency\n"); | ||
170 | |||
171 | if (text_only) | ||
172 | printf(" %s", sym[0]); | ||
173 | else | ||
174 | printf(" %s %s", color[0], nocolor); | ||
175 | printf(" - few events\n"); | ||
176 | |||
177 | for (i = 0; ; i++) { | ||
178 | if (i % 20 == 0) | ||
179 | print_banner(); | ||
180 | print_hist(map_fd); | ||
181 | sleep(2); | ||
182 | } | ||
183 | |||
184 | cleanup: | ||
185 | for (j--; j >= 0; j--) | ||
186 | bpf_link__destroy(links[j]); | ||
187 | |||
188 | bpf_object__close(obj); | ||
189 | return 0; | ||
190 | } | ||
diff --git a/samples/bpf/tracex4_kern.c b/samples/bpf/tracex4_kern.c new file mode 100644 index 000000000..eb0f8fdd1 --- /dev/null +++ b/samples/bpf/tracex4_kern.c | |||
@@ -0,0 +1,55 @@ | |||
1 | /* Copyright (c) 2015 PLUMgrid, http://plumgrid.com | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <linux/ptrace.h> | ||
8 | #include <linux/version.h> | ||
9 | #include <uapi/linux/bpf.h> | ||
10 | #include <bpf/bpf_helpers.h> | ||
11 | #include <bpf/bpf_tracing.h> | ||
12 | |||
13 | struct pair { | ||
14 | u64 val; | ||
15 | u64 ip; | ||
16 | }; | ||
17 | |||
18 | struct { | ||
19 | __uint(type, BPF_MAP_TYPE_HASH); | ||
20 | __type(key, long); | ||
21 | __type(value, struct pair); | ||
22 | __uint(max_entries, 1000000); | ||
23 | } my_map SEC(".maps"); | ||
24 | |||
25 | /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe | ||
26 | * example will no longer be meaningful | ||
27 | */ | ||
28 | SEC("kprobe/kmem_cache_free") | ||
29 | int bpf_prog1(struct pt_regs *ctx) | ||
30 | { | ||
31 | long ptr = PT_REGS_PARM2(ctx); | ||
32 | |||
33 | bpf_map_delete_elem(&my_map, &ptr); | ||
34 | return 0; | ||
35 | } | ||
36 | |||
37 | SEC("kretprobe/kmem_cache_alloc_node") | ||
38 | int bpf_prog2(struct pt_regs *ctx) | ||
39 | { | ||
40 | long ptr = PT_REGS_RC(ctx); | ||
41 | long ip = 0; | ||
42 | |||
43 | /* get ip address of kmem_cache_alloc_node() caller */ | ||
44 | BPF_KRETPROBE_READ_RET_IP(ip, ctx); | ||
45 | |||
46 | struct pair v = { | ||
47 | .val = bpf_ktime_get_ns(), | ||
48 | .ip = ip, | ||
49 | }; | ||
50 | |||
51 | bpf_map_update_elem(&my_map, &ptr, &v, BPF_ANY); | ||
52 | return 0; | ||
53 | } | ||
54 | char _license[] SEC("license") = "GPL"; | ||
55 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c new file mode 100644 index 000000000..e8faf8f18 --- /dev/null +++ b/samples/bpf/tracex4_user.c | |||
@@ -0,0 +1,103 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2015 PLUMgrid, http://plumgrid.com | ||
3 | */ | ||
4 | #include <stdio.h> | ||
5 | #include <stdlib.h> | ||
6 | #include <signal.h> | ||
7 | #include <unistd.h> | ||
8 | #include <stdbool.h> | ||
9 | #include <string.h> | ||
10 | #include <time.h> | ||
11 | #include <sys/resource.h> | ||
12 | |||
13 | #include <bpf/bpf.h> | ||
14 | #include <bpf/libbpf.h> | ||
15 | |||
16 | struct pair { | ||
17 | long long val; | ||
18 | __u64 ip; | ||
19 | }; | ||
20 | |||
21 | static __u64 time_get_ns(void) | ||
22 | { | ||
23 | struct timespec ts; | ||
24 | |||
25 | clock_gettime(CLOCK_MONOTONIC, &ts); | ||
26 | return ts.tv_sec * 1000000000ull + ts.tv_nsec; | ||
27 | } | ||
28 | |||
29 | static void print_old_objects(int fd) | ||
30 | { | ||
31 | long long val = time_get_ns(); | ||
32 | __u64 key, next_key; | ||
33 | struct pair v; | ||
34 | |||
35 | key = write(1, "\e[1;1H\e[2J", 10); /* clear screen */ | ||
36 | |||
37 | key = -1; | ||
38 | while (bpf_map_get_next_key(fd, &key, &next_key) == 0) { | ||
39 | bpf_map_lookup_elem(fd, &next_key, &v); | ||
40 | key = next_key; | ||
41 | if (val - v.val < 1000000000ll) | ||
42 | /* skip objects allocated less than 1 sec ago */ | ||
43 | continue; | ||
44 | printf("obj 0x%llx is %2lldsec old was allocated at ip %llx\n", | ||
45 | next_key, (val - v.val) / 1000000000ll, v.ip); | ||
46 | } | ||
47 | } | ||
48 | |||
49 | int main(int ac, char **argv) | ||
50 | { | ||
51 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
52 | struct bpf_link *links[2]; | ||
53 | struct bpf_program *prog; | ||
54 | struct bpf_object *obj; | ||
55 | char filename[256]; | ||
56 | int map_fd, i, j = 0; | ||
57 | |||
58 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
59 | perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); | ||
60 | return 1; | ||
61 | } | ||
62 | |||
63 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
64 | obj = bpf_object__open_file(filename, NULL); | ||
65 | if (libbpf_get_error(obj)) { | ||
66 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
67 | return 0; | ||
68 | } | ||
69 | |||
70 | /* load BPF program */ | ||
71 | if (bpf_object__load(obj)) { | ||
72 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
73 | goto cleanup; | ||
74 | } | ||
75 | |||
76 | map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); | ||
77 | if (map_fd < 0) { | ||
78 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
79 | goto cleanup; | ||
80 | } | ||
81 | |||
82 | bpf_object__for_each_program(prog, obj) { | ||
83 | links[j] = bpf_program__attach(prog); | ||
84 | if (libbpf_get_error(links[j])) { | ||
85 | fprintf(stderr, "ERROR: bpf_program__attach failed\n"); | ||
86 | links[j] = NULL; | ||
87 | goto cleanup; | ||
88 | } | ||
89 | j++; | ||
90 | } | ||
91 | |||
92 | for (i = 0; ; i++) { | ||
93 | print_old_objects(map_fd); | ||
94 | sleep(1); | ||
95 | } | ||
96 | |||
97 | cleanup: | ||
98 | for (j--; j >= 0; j--) | ||
99 | bpf_link__destroy(links[j]); | ||
100 | |||
101 | bpf_object__close(obj); | ||
102 | return 0; | ||
103 | } | ||
diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c new file mode 100644 index 000000000..64a1f7550 --- /dev/null +++ b/samples/bpf/tracex5_kern.c | |||
@@ -0,0 +1,93 @@ | |||
1 | /* Copyright (c) 2015 PLUMgrid, http://plumgrid.com | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <linux/ptrace.h> | ||
8 | #include <linux/version.h> | ||
9 | #include <uapi/linux/bpf.h> | ||
10 | #include <uapi/linux/seccomp.h> | ||
11 | #include <uapi/linux/unistd.h> | ||
12 | #include "syscall_nrs.h" | ||
13 | #include <bpf/bpf_helpers.h> | ||
14 | #include <bpf/bpf_tracing.h> | ||
15 | |||
16 | #define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F | ||
17 | |||
18 | struct { | ||
19 | __uint(type, BPF_MAP_TYPE_PROG_ARRAY); | ||
20 | __uint(key_size, sizeof(u32)); | ||
21 | __uint(value_size, sizeof(u32)); | ||
22 | #ifdef __mips__ | ||
23 | __uint(max_entries, 6000); /* MIPS n64 syscalls start at 5000 */ | ||
24 | #else | ||
25 | __uint(max_entries, 1024); | ||
26 | #endif | ||
27 | } progs SEC(".maps"); | ||
28 | |||
29 | SEC("kprobe/__seccomp_filter") | ||
30 | int bpf_prog1(struct pt_regs *ctx) | ||
31 | { | ||
32 | int sc_nr = (int)PT_REGS_PARM1(ctx); | ||
33 | |||
34 | /* dispatch into next BPF program depending on syscall number */ | ||
35 | bpf_tail_call(ctx, &progs, sc_nr); | ||
36 | |||
37 | /* fall through -> unknown syscall */ | ||
38 | if (sc_nr >= __NR_getuid && sc_nr <= __NR_getsid) { | ||
39 | char fmt[] = "syscall=%d (one of get/set uid/pid/gid)\n"; | ||
40 | bpf_trace_printk(fmt, sizeof(fmt), sc_nr); | ||
41 | } | ||
42 | return 0; | ||
43 | } | ||
44 | |||
45 | /* we jump here when syscall number == __NR_write */ | ||
46 | PROG(SYS__NR_write)(struct pt_regs *ctx) | ||
47 | { | ||
48 | struct seccomp_data sd; | ||
49 | |||
50 | bpf_probe_read_kernel(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); | ||
51 | if (sd.args[2] == 512) { | ||
52 | char fmt[] = "write(fd=%d, buf=%p, size=%d)\n"; | ||
53 | bpf_trace_printk(fmt, sizeof(fmt), | ||
54 | sd.args[0], sd.args[1], sd.args[2]); | ||
55 | } | ||
56 | return 0; | ||
57 | } | ||
58 | |||
59 | PROG(SYS__NR_read)(struct pt_regs *ctx) | ||
60 | { | ||
61 | struct seccomp_data sd; | ||
62 | |||
63 | bpf_probe_read_kernel(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx)); | ||
64 | if (sd.args[2] > 128 && sd.args[2] <= 1024) { | ||
65 | char fmt[] = "read(fd=%d, buf=%p, size=%d)\n"; | ||
66 | bpf_trace_printk(fmt, sizeof(fmt), | ||
67 | sd.args[0], sd.args[1], sd.args[2]); | ||
68 | } | ||
69 | return 0; | ||
70 | } | ||
71 | |||
72 | #ifdef __NR_mmap2 | ||
73 | PROG(SYS__NR_mmap2)(struct pt_regs *ctx) | ||
74 | { | ||
75 | char fmt[] = "mmap2\n"; | ||
76 | |||
77 | bpf_trace_printk(fmt, sizeof(fmt)); | ||
78 | return 0; | ||
79 | } | ||
80 | #endif | ||
81 | |||
82 | #ifdef __NR_mmap | ||
83 | PROG(SYS__NR_mmap)(struct pt_regs *ctx) | ||
84 | { | ||
85 | char fmt[] = "mmap\n"; | ||
86 | |||
87 | bpf_trace_printk(fmt, sizeof(fmt)); | ||
88 | return 0; | ||
89 | } | ||
90 | #endif | ||
91 | |||
92 | char _license[] SEC("license") = "GPL"; | ||
93 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
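The PROG() macro above builds a section name of the form "kprobe/<syscall number>", which is what lets tracex5_user.c (next file) recover the PROG_ARRAY key with sscanf("kprobe/%d", ...). The sketch below only illustrates that expansion in user space; the value chosen for SYS__NR_write is hypothetical, the real one is generated into syscall_nrs.h.

/* Illustration only: how a numeric syscall id becomes the "kprobe/<nr>" section name. */
#include <stdio.h>

#define __stringify_1(x)	#x
#define __stringify(x)		__stringify_1(x)
#define SEC_NAME(F)		"kprobe/" __stringify(F)

#define SYS__NR_write 1		/* hypothetical; taken from syscall_nrs.h in the real sample */

int main(void)
{
	int key;

	printf("section: %s\n", SEC_NAME(SYS__NR_write));	/* prints "kprobe/1" */
	if (sscanf(SEC_NAME(SYS__NR_write), "kprobe/%d", &key) == 1)
		printf("PROG_ARRAY key: %d\n", key);		/* the tail-call slot */
	return 0;
}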
diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c new file mode 100644 index 000000000..c17d3fb5f --- /dev/null +++ b/samples/bpf/tracex5_user.c | |||
@@ -0,0 +1,101 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <stdio.h> | ||
3 | #include <stdlib.h> | ||
4 | #include <unistd.h> | ||
5 | #include <linux/filter.h> | ||
6 | #include <linux/seccomp.h> | ||
7 | #include <sys/prctl.h> | ||
8 | #include <bpf/bpf.h> | ||
9 | #include <bpf/libbpf.h> | ||
10 | #include <sys/resource.h> | ||
11 | #include "trace_helpers.h" | ||
12 | |||
13 | #ifdef __mips__ | ||
14 | #define MAX_ENTRIES 6000 /* MIPS n64 syscalls start at 5000 */ | ||
15 | #else | ||
16 | #define MAX_ENTRIES 1024 | ||
17 | #endif | ||
18 | |||
19 | /* install fake seccomp program to enable seccomp code path inside the kernel, | ||
20 | * so that our kprobe attached to __seccomp_filter() can be triggered | ||
21 | */ | ||
22 | static void install_accept_all_seccomp(void) | ||
23 | { | ||
24 | struct sock_filter filter[] = { | ||
25 | BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), | ||
26 | }; | ||
27 | struct sock_fprog prog = { | ||
28 | .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), | ||
29 | .filter = filter, | ||
30 | }; | ||
31 | if (prctl(PR_SET_SECCOMP, 2, &prog)) /* 2 == SECCOMP_MODE_FILTER */ | ||
32 | perror("prctl"); | ||
33 | } | ||
34 | |||
35 | int main(int ac, char **argv) | ||
36 | { | ||
37 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
38 | struct bpf_link *link = NULL; | ||
39 | struct bpf_program *prog; | ||
40 | struct bpf_object *obj; | ||
41 | int key, fd, progs_fd; | ||
42 | const char *section; | ||
43 | char filename[256]; | ||
44 | FILE *f; | ||
45 | |||
46 | setrlimit(RLIMIT_MEMLOCK, &r); | ||
47 | |||
48 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
49 | obj = bpf_object__open_file(filename, NULL); | ||
50 | if (libbpf_get_error(obj)) { | ||
51 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
52 | return 0; | ||
53 | } | ||
54 | |||
55 | prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); | ||
56 | if (!prog) { | ||
57 | printf("finding a prog in obj file failed\n"); | ||
58 | goto cleanup; | ||
59 | } | ||
60 | |||
61 | /* load BPF program */ | ||
62 | if (bpf_object__load(obj)) { | ||
63 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
64 | goto cleanup; | ||
65 | } | ||
66 | |||
67 | link = bpf_program__attach(prog); | ||
68 | if (libbpf_get_error(link)) { | ||
69 | fprintf(stderr, "ERROR: bpf_program__attach failed\n"); | ||
70 | link = NULL; | ||
71 | goto cleanup; | ||
72 | } | ||
73 | |||
74 | progs_fd = bpf_object__find_map_fd_by_name(obj, "progs"); | ||
75 | if (progs_fd < 0) { | ||
76 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
77 | goto cleanup; | ||
78 | } | ||
79 | |||
80 | bpf_object__for_each_program(prog, obj) { | ||
81 | section = bpf_program__section_name(prog); | ||
82 | /* register only syscalls to PROG_ARRAY */ | ||
83 | if (sscanf(section, "kprobe/%d", &key) != 1) | ||
84 | continue; | ||
85 | |||
86 | fd = bpf_program__fd(prog); | ||
87 | bpf_map_update_elem(progs_fd, &key, &fd, BPF_ANY); | ||
88 | } | ||
89 | |||
90 | install_accept_all_seccomp(); | ||
91 | |||
92 | f = popen("dd if=/dev/zero of=/dev/null count=5", "r"); | ||
93 | (void) f; | ||
94 | |||
95 | read_trace_pipe(); | ||
96 | |||
97 | cleanup: | ||
98 | bpf_link__destroy(link); | ||
99 | bpf_object__close(obj); | ||
100 | return 0; | ||
101 | } | ||
diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c new file mode 100644 index 000000000..acad5712d --- /dev/null +++ b/samples/bpf/tracex6_kern.c | |||
@@ -0,0 +1,69 @@ | |||
1 | #include <linux/ptrace.h> | ||
2 | #include <linux/version.h> | ||
3 | #include <uapi/linux/bpf.h> | ||
4 | #include <bpf/bpf_helpers.h> | ||
5 | |||
6 | struct { | ||
7 | __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); | ||
8 | __uint(key_size, sizeof(int)); | ||
9 | __uint(value_size, sizeof(u32)); | ||
10 | __uint(max_entries, 64); | ||
11 | } counters SEC(".maps"); | ||
12 | |||
13 | struct { | ||
14 | __uint(type, BPF_MAP_TYPE_HASH); | ||
15 | __type(key, int); | ||
16 | __type(value, u64); | ||
17 | __uint(max_entries, 64); | ||
18 | } values SEC(".maps"); | ||
19 | |||
20 | struct { | ||
21 | __uint(type, BPF_MAP_TYPE_HASH); | ||
22 | __type(key, int); | ||
23 | __type(value, struct bpf_perf_event_value); | ||
24 | __uint(max_entries, 64); | ||
25 | } values2 SEC(".maps"); | ||
26 | |||
27 | SEC("kprobe/htab_map_get_next_key") | ||
28 | int bpf_prog1(struct pt_regs *ctx) | ||
29 | { | ||
30 | u32 key = bpf_get_smp_processor_id(); | ||
31 | u64 count, *val; | ||
32 | s64 error; | ||
33 | |||
34 | count = bpf_perf_event_read(&counters, key); | ||
35 | error = (s64)count; | ||
36 | if (error <= -2 && error >= -22) /* bpf_perf_event_read() returned an errno (-ENOENT..-EINVAL) */ | ||
37 | return 0; | ||
38 | |||
39 | val = bpf_map_lookup_elem(&values, &key); | ||
40 | if (val) | ||
41 | *val = count; | ||
42 | else | ||
43 | bpf_map_update_elem(&values, &key, &count, BPF_NOEXIST); | ||
44 | |||
45 | return 0; | ||
46 | } | ||
47 | |||
48 | SEC("kprobe/htab_map_lookup_elem") | ||
49 | int bpf_prog2(struct pt_regs *ctx) | ||
50 | { | ||
51 | u32 key = bpf_get_smp_processor_id(); | ||
52 | struct bpf_perf_event_value *val, buf; | ||
53 | int error; | ||
54 | |||
55 | error = bpf_perf_event_read_value(&counters, key, &buf, sizeof(buf)); | ||
56 | if (error) | ||
57 | return 0; | ||
58 | |||
59 | val = bpf_map_lookup_elem(&values2, &key); | ||
60 | if (val) | ||
61 | *val = buf; | ||
62 | else | ||
63 | bpf_map_update_elem(&values2, &key, &buf, BPF_NOEXIST); | ||
64 | |||
65 | return 0; | ||
66 | } | ||
67 | |||
68 | char _license[] SEC("license") = "GPL"; | ||
69 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c new file mode 100644 index 000000000..33df97847 --- /dev/null +++ b/samples/bpf/tracex6_user.c | |||
@@ -0,0 +1,226 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #define _GNU_SOURCE | ||
3 | |||
4 | #include <assert.h> | ||
5 | #include <fcntl.h> | ||
6 | #include <linux/perf_event.h> | ||
7 | #include <sched.h> | ||
8 | #include <stdio.h> | ||
9 | #include <stdlib.h> | ||
10 | #include <sys/ioctl.h> | ||
11 | #include <sys/resource.h> | ||
12 | #include <sys/time.h> | ||
13 | #include <sys/types.h> | ||
14 | #include <sys/wait.h> | ||
15 | #include <unistd.h> | ||
16 | |||
17 | #include <bpf/bpf.h> | ||
18 | #include <bpf/libbpf.h> | ||
19 | #include "perf-sys.h" | ||
20 | |||
21 | #define SAMPLE_PERIOD 0x7fffffffffffffffULL | ||
22 | |||
23 | /* counters, values, values2 */ | ||
24 | static int map_fd[3]; | ||
25 | |||
26 | static void check_on_cpu(int cpu, struct perf_event_attr *attr) | ||
27 | { | ||
28 | struct bpf_perf_event_value value2; | ||
29 | int pmu_fd, error = 0; | ||
30 | cpu_set_t set; | ||
31 | __u64 value; | ||
32 | |||
33 | /* Move to target CPU */ | ||
34 | CPU_ZERO(&set); | ||
35 | CPU_SET(cpu, &set); | ||
36 | assert(sched_setaffinity(0, sizeof(set), &set) == 0); | ||
37 | /* Open perf event and attach to the perf_event_array */ | ||
38 | pmu_fd = sys_perf_event_open(attr, -1/*pid*/, cpu/*cpu*/, -1/*group_fd*/, 0); | ||
39 | if (pmu_fd < 0) { | ||
40 | fprintf(stderr, "sys_perf_event_open failed on CPU %d\n", cpu); | ||
41 | error = 1; | ||
42 | goto on_exit; | ||
43 | } | ||
44 | assert(bpf_map_update_elem(map_fd[0], &cpu, &pmu_fd, BPF_ANY) == 0); | ||
45 | assert(ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0) == 0); | ||
46 | /* Trigger the kprobe */ | ||
47 | bpf_map_get_next_key(map_fd[1], &cpu, NULL); | ||
48 | /* Check the value */ | ||
49 | if (bpf_map_lookup_elem(map_fd[1], &cpu, &value)) { | ||
50 | fprintf(stderr, "Value missing for CPU %d\n", cpu); | ||
51 | error = 1; | ||
52 | goto on_exit; | ||
53 | } else { | ||
54 | fprintf(stderr, "CPU %d: %llu\n", cpu, value); | ||
55 | } | ||
56 | /* The above bpf_map_lookup_elem should trigger the second kprobe */ | ||
57 | if (bpf_map_lookup_elem(map_fd[2], &cpu, &value2)) { | ||
58 | fprintf(stderr, "Value2 missing for CPU %d\n", cpu); | ||
59 | error = 1; | ||
60 | goto on_exit; | ||
61 | } else { | ||
62 | fprintf(stderr, "CPU %d: counter: %llu, enabled: %llu, running: %llu\n", cpu, | ||
63 | value2.counter, value2.enabled, value2.running); | ||
64 | } | ||
65 | |||
66 | on_exit: | ||
67 | assert(bpf_map_delete_elem(map_fd[0], &cpu) == 0 || error); | ||
68 | assert(ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE, 0) == 0 || error); | ||
69 | assert(close(pmu_fd) == 0 || error); | ||
70 | assert(bpf_map_delete_elem(map_fd[1], &cpu) == 0 || error); | ||
71 | exit(error); | ||
72 | } | ||
73 | |||
74 | static void test_perf_event_array(struct perf_event_attr *attr, | ||
75 | const char *name) | ||
76 | { | ||
77 | int i, status, nr_cpus = sysconf(_SC_NPROCESSORS_CONF); | ||
78 | pid_t pid[nr_cpus]; | ||
79 | int err = 0; | ||
80 | |||
81 | printf("Test reading %s counters\n", name); | ||
82 | |||
83 | for (i = 0; i < nr_cpus; i++) { | ||
84 | pid[i] = fork(); | ||
85 | assert(pid[i] >= 0); | ||
86 | if (pid[i] == 0) { | ||
87 | check_on_cpu(i, attr); | ||
88 | exit(1); | ||
89 | } | ||
90 | } | ||
91 | |||
92 | for (i = 0; i < nr_cpus; i++) { | ||
93 | assert(waitpid(pid[i], &status, 0) == pid[i]); | ||
94 | err |= status; | ||
95 | } | ||
96 | |||
97 | if (err) | ||
98 | printf("Test: %s FAILED\n", name); | ||
99 | } | ||
100 | |||
101 | static void test_bpf_perf_event(void) | ||
102 | { | ||
103 | struct perf_event_attr attr_cycles = { | ||
104 | .freq = 0, | ||
105 | .sample_period = SAMPLE_PERIOD, | ||
106 | .inherit = 0, | ||
107 | .type = PERF_TYPE_HARDWARE, | ||
108 | .read_format = 0, | ||
109 | .sample_type = 0, | ||
110 | .config = PERF_COUNT_HW_CPU_CYCLES, | ||
111 | }; | ||
112 | struct perf_event_attr attr_clock = { | ||
113 | .freq = 0, | ||
114 | .sample_period = SAMPLE_PERIOD, | ||
115 | .inherit = 0, | ||
116 | .type = PERF_TYPE_SOFTWARE, | ||
117 | .read_format = 0, | ||
118 | .sample_type = 0, | ||
119 | .config = PERF_COUNT_SW_CPU_CLOCK, | ||
120 | }; | ||
121 | struct perf_event_attr attr_raw = { | ||
122 | .freq = 0, | ||
123 | .sample_period = SAMPLE_PERIOD, | ||
124 | .inherit = 0, | ||
125 | .type = PERF_TYPE_RAW, | ||
126 | .read_format = 0, | ||
127 | .sample_type = 0, | ||
128 | /* Intel Instruction Retired */ | ||
129 | .config = 0xc0, | ||
130 | }; | ||
131 | struct perf_event_attr attr_l1d_load = { | ||
132 | .freq = 0, | ||
133 | .sample_period = SAMPLE_PERIOD, | ||
134 | .inherit = 0, | ||
135 | .type = PERF_TYPE_HW_CACHE, | ||
136 | .read_format = 0, | ||
137 | .sample_type = 0, | ||
138 | .config = | ||
139 | PERF_COUNT_HW_CACHE_L1D | | ||
140 | (PERF_COUNT_HW_CACHE_OP_READ << 8) | | ||
141 | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), | ||
142 | }; | ||
143 | struct perf_event_attr attr_llc_miss = { | ||
144 | .freq = 0, | ||
145 | .sample_period = SAMPLE_PERIOD, | ||
146 | .inherit = 0, | ||
147 | .type = PERF_TYPE_HW_CACHE, | ||
148 | .read_format = 0, | ||
149 | .sample_type = 0, | ||
150 | .config = | ||
151 | PERF_COUNT_HW_CACHE_LL | | ||
152 | (PERF_COUNT_HW_CACHE_OP_READ << 8) | | ||
153 | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), | ||
154 | }; | ||
155 | struct perf_event_attr attr_msr_tsc = { | ||
156 | .freq = 0, | ||
157 | .sample_period = 0, | ||
158 | .inherit = 0, | ||
159 | /* From /sys/bus/event_source/devices/msr/ */ | ||
160 | .type = 7, | ||
161 | .read_format = 0, | ||
162 | .sample_type = 0, | ||
163 | .config = 0, | ||
164 | }; | ||
165 | |||
166 | test_perf_event_array(&attr_cycles, "HARDWARE-cycles"); | ||
167 | test_perf_event_array(&attr_clock, "SOFTWARE-clock"); | ||
168 | test_perf_event_array(&attr_raw, "RAW-instruction-retired"); | ||
169 | test_perf_event_array(&attr_l1d_load, "HW_CACHE-L1D-load"); | ||
170 | |||
171 | /* below tests may fail in qemu */ | ||
172 | test_perf_event_array(&attr_llc_miss, "HW_CACHE-LLC-miss"); | ||
173 | test_perf_event_array(&attr_msr_tsc, "Dynamic-msr-tsc"); | ||
174 | } | ||
175 | |||
176 | int main(int argc, char **argv) | ||
177 | { | ||
178 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
179 | struct bpf_link *links[2]; | ||
180 | struct bpf_program *prog; | ||
181 | struct bpf_object *obj; | ||
182 | char filename[256]; | ||
183 | int i = 0; | ||
184 | |||
185 | setrlimit(RLIMIT_MEMLOCK, &r); | ||
186 | |||
187 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
188 | obj = bpf_object__open_file(filename, NULL); | ||
189 | if (libbpf_get_error(obj)) { | ||
190 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
191 | return 0; | ||
192 | } | ||
193 | |||
194 | /* load BPF program */ | ||
195 | if (bpf_object__load(obj)) { | ||
196 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
197 | goto cleanup; | ||
198 | } | ||
199 | |||
200 | map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counters"); | ||
201 | map_fd[1] = bpf_object__find_map_fd_by_name(obj, "values"); | ||
202 | map_fd[2] = bpf_object__find_map_fd_by_name(obj, "values2"); | ||
203 | if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) { | ||
204 | fprintf(stderr, "ERROR: finding a map in obj file failed\n"); | ||
205 | goto cleanup; | ||
206 | } | ||
207 | |||
208 | bpf_object__for_each_program(prog, obj) { | ||
209 | links[i] = bpf_program__attach(prog); | ||
210 | if (libbpf_get_error(links[i])) { | ||
211 | fprintf(stderr, "ERROR: bpf_program__attach failed\n"); | ||
212 | links[i] = NULL; | ||
213 | goto cleanup; | ||
214 | } | ||
215 | i++; | ||
216 | } | ||
217 | |||
218 | test_bpf_perf_event(); | ||
219 | |||
220 | cleanup: | ||
221 | for (i--; i >= 0; i--) | ||
222 | bpf_link__destroy(links[i]); | ||
223 | |||
224 | bpf_object__close(obj); | ||
225 | return 0; | ||
226 | } | ||
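The PERF_TYPE_HW_CACHE attributes above pack three fields into attr.config: the cache id in bits 0-7, the operation in bits 8-15 and the result in bits 16-23 (as documented in linux/perf_event.h). A minimal sketch, just to show the resulting config words for the two cache events used here:

/* Illustration only: composing PERF_TYPE_HW_CACHE config words. */
#include <stdio.h>
#include <linux/perf_event.h>

int main(void)
{
	unsigned long long l1d_load_access =
		PERF_COUNT_HW_CACHE_L1D |
		(PERF_COUNT_HW_CACHE_OP_READ << 8) |
		(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16);
	unsigned long long llc_load_miss =
		PERF_COUNT_HW_CACHE_LL |
		(PERF_COUNT_HW_CACHE_OP_READ << 8) |
		(PERF_COUNT_HW_CACHE_RESULT_MISS << 16);

	printf("HW_CACHE-L1D-load config: 0x%llx\n", l1d_load_access);
	printf("HW_CACHE-LLC-miss config: 0x%llx\n", llc_load_miss);
	return 0;
}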
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c new file mode 100644 index 000000000..c5a92df8a --- /dev/null +++ b/samples/bpf/tracex7_kern.c | |||
@@ -0,0 +1,16 @@ | |||
1 | #include <uapi/linux/ptrace.h> | ||
2 | #include <uapi/linux/bpf.h> | ||
3 | #include <linux/version.h> | ||
4 | #include <bpf/bpf_helpers.h> | ||
5 | |||
6 | SEC("kprobe/open_ctree") | ||
7 | int bpf_prog1(struct pt_regs *ctx) | ||
8 | { | ||
9 | unsigned long rc = -12; /* -ENOMEM */ | ||
10 | |||
11 | bpf_override_return(ctx, rc); | ||
12 | return 0; | ||
13 | } | ||
14 | |||
15 | char _license[] SEC("license") = "GPL"; | ||
16 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c new file mode 100644 index 000000000..8be7ce18d --- /dev/null +++ b/samples/bpf/tracex7_user.c | |||
@@ -0,0 +1,56 @@ | |||
1 | #define _GNU_SOURCE | ||
2 | |||
3 | #include <stdio.h> | ||
4 | #include <unistd.h> | ||
5 | #include <bpf/libbpf.h> | ||
6 | |||
7 | int main(int argc, char **argv) | ||
8 | { | ||
9 | struct bpf_link *link = NULL; | ||
10 | struct bpf_program *prog; | ||
11 | struct bpf_object *obj; | ||
12 | char filename[256]; | ||
13 | char command[256]; | ||
14 | int ret = 0; | ||
15 | FILE *f; | ||
16 | |||
17 | if (!argv[1]) { | ||
18 | fprintf(stderr, "ERROR: Run with the btrfs device argument!\n"); | ||
19 | return 0; | ||
20 | } | ||
21 | |||
22 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
23 | obj = bpf_object__open_file(filename, NULL); | ||
24 | if (libbpf_get_error(obj)) { | ||
25 | fprintf(stderr, "ERROR: opening BPF object file failed\n"); | ||
26 | return 0; | ||
27 | } | ||
28 | |||
29 | prog = bpf_object__find_program_by_name(obj, "bpf_prog1"); | ||
30 | if (!prog) { | ||
31 | fprintf(stderr, "ERROR: finding a prog in obj file failed\n"); | ||
32 | goto cleanup; | ||
33 | } | ||
34 | |||
35 | /* load BPF program */ | ||
36 | if (bpf_object__load(obj)) { | ||
37 | fprintf(stderr, "ERROR: loading BPF object file failed\n"); | ||
38 | goto cleanup; | ||
39 | } | ||
40 | |||
41 | link = bpf_program__attach(prog); | ||
42 | if (libbpf_get_error(link)) { | ||
43 | fprintf(stderr, "ERROR: bpf_program__attach failed\n"); | ||
44 | link = NULL; | ||
45 | goto cleanup; | ||
46 | } | ||
47 | |||
48 | snprintf(command, 256, "mount %s tmpmnt/", argv[1]); | ||
49 | f = popen(command, "r"); | ||
50 | ret = pclose(f); | ||
51 | |||
52 | cleanup: | ||
53 | bpf_link__destroy(link); | ||
54 | bpf_object__close(obj); | ||
55 | return ret ? 0 : 1; /* mount is expected to fail: open_ctree was forced to return -ENOMEM */ | ||
56 | } | ||
diff --git a/samples/bpf/xdp1_kern.c b/samples/bpf/xdp1_kern.c new file mode 100644 index 000000000..34b64394e --- /dev/null +++ b/samples/bpf/xdp1_kern.c | |||
@@ -0,0 +1,93 @@ | |||
1 | /* Copyright (c) 2016 PLUMgrid | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #define KBUILD_MODNAME "foo" | ||
8 | #include <uapi/linux/bpf.h> | ||
9 | #include <linux/in.h> | ||
10 | #include <linux/if_ether.h> | ||
11 | #include <linux/if_packet.h> | ||
12 | #include <linux/if_vlan.h> | ||
13 | #include <linux/ip.h> | ||
14 | #include <linux/ipv6.h> | ||
15 | #include <bpf/bpf_helpers.h> | ||
16 | |||
17 | struct { | ||
18 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
19 | __type(key, u32); | ||
20 | __type(value, long); | ||
21 | __uint(max_entries, 256); | ||
22 | } rxcnt SEC(".maps"); | ||
23 | |||
24 | static int parse_ipv4(void *data, u64 nh_off, void *data_end) | ||
25 | { | ||
26 | struct iphdr *iph = data + nh_off; | ||
27 | |||
28 | if (iph + 1 > data_end) | ||
29 | return 0; | ||
30 | return iph->protocol; | ||
31 | } | ||
32 | |||
33 | static int parse_ipv6(void *data, u64 nh_off, void *data_end) | ||
34 | { | ||
35 | struct ipv6hdr *ip6h = data + nh_off; | ||
36 | |||
37 | if (ip6h + 1 > data_end) | ||
38 | return 0; | ||
39 | return ip6h->nexthdr; | ||
40 | } | ||
41 | |||
42 | SEC("xdp1") | ||
43 | int xdp_prog1(struct xdp_md *ctx) | ||
44 | { | ||
45 | void *data_end = (void *)(long)ctx->data_end; | ||
46 | void *data = (void *)(long)ctx->data; | ||
47 | struct ethhdr *eth = data; | ||
48 | int rc = XDP_DROP; | ||
49 | long *value; | ||
50 | u16 h_proto; | ||
51 | u64 nh_off; | ||
52 | u32 ipproto; | ||
53 | |||
54 | nh_off = sizeof(*eth); | ||
55 | if (data + nh_off > data_end) | ||
56 | return rc; | ||
57 | |||
58 | h_proto = eth->h_proto; | ||
59 | /* peel off up to two stacked VLAN tags (802.1Q / 802.1ad) */ | ||
60 | if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { | ||
61 | struct vlan_hdr *vhdr; | ||
62 | |||
63 | vhdr = data + nh_off; | ||
64 | nh_off += sizeof(struct vlan_hdr); | ||
65 | if (data + nh_off > data_end) | ||
66 | return rc; | ||
67 | h_proto = vhdr->h_vlan_encapsulated_proto; | ||
68 | } | ||
69 | if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { | ||
70 | struct vlan_hdr *vhdr; | ||
71 | |||
72 | vhdr = data + nh_off; | ||
73 | nh_off += sizeof(struct vlan_hdr); | ||
74 | if (data + nh_off > data_end) | ||
75 | return rc; | ||
76 | h_proto = vhdr->h_vlan_encapsulated_proto; | ||
77 | } | ||
78 | |||
79 | if (h_proto == htons(ETH_P_IP)) | ||
80 | ipproto = parse_ipv4(data, nh_off, data_end); | ||
81 | else if (h_proto == htons(ETH_P_IPV6)) | ||
82 | ipproto = parse_ipv6(data, nh_off, data_end); | ||
83 | else | ||
84 | ipproto = 0; | ||
85 | |||
86 | value = bpf_map_lookup_elem(&rxcnt, &ipproto); | ||
87 | if (value) | ||
88 | *value += 1; | ||
89 | |||
90 | return rc; | ||
91 | } | ||
92 | |||
93 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c new file mode 100644 index 000000000..c447ad9e3 --- /dev/null +++ b/samples/bpf/xdp1_user.c | |||
@@ -0,0 +1,167 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2016 PLUMgrid | ||
3 | */ | ||
4 | #include <linux/bpf.h> | ||
5 | #include <linux/if_link.h> | ||
6 | #include <assert.h> | ||
7 | #include <errno.h> | ||
8 | #include <signal.h> | ||
9 | #include <stdio.h> | ||
10 | #include <stdlib.h> | ||
11 | #include <string.h> | ||
12 | #include <unistd.h> | ||
13 | #include <libgen.h> | ||
14 | #include <sys/resource.h> | ||
15 | #include <net/if.h> | ||
16 | |||
17 | #include "bpf_util.h" | ||
18 | #include <bpf/bpf.h> | ||
19 | #include <bpf/libbpf.h> | ||
20 | |||
21 | static int ifindex; | ||
22 | static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
23 | static __u32 prog_id; | ||
24 | |||
25 | static void int_exit(int sig) | ||
26 | { | ||
27 | __u32 curr_prog_id = 0; | ||
28 | |||
29 | if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) { | ||
30 | printf("bpf_get_link_xdp_id failed\n"); | ||
31 | exit(1); | ||
32 | } | ||
33 | if (prog_id == curr_prog_id) | ||
34 | bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); | ||
35 | else if (!curr_prog_id) | ||
36 | printf("couldn't find a prog id on a given interface\n"); | ||
37 | else | ||
38 | printf("program on interface changed, not removing\n"); | ||
39 | exit(0); | ||
40 | } | ||
41 | |||
42 | /* simple per-protocol drop counter | ||
43 | */ | ||
44 | static void poll_stats(int map_fd, int interval) | ||
45 | { | ||
46 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
47 | __u64 values[nr_cpus], prev[UINT8_MAX + 1] = { 0 }; | ||
48 | int i; | ||
49 | |||
50 | while (1) { | ||
51 | __u32 key = UINT32_MAX; | ||
52 | |||
53 | sleep(interval); | ||
54 | |||
55 | while (bpf_map_get_next_key(map_fd, &key, &key) != -1) { | ||
56 | __u64 sum = 0; | ||
57 | |||
58 | assert(bpf_map_lookup_elem(map_fd, &key, values) == 0); | ||
59 | for (i = 0; i < nr_cpus; i++) | ||
60 | sum += values[i]; | ||
61 | if (sum > prev[key]) | ||
62 | printf("proto %u: %10llu pkt/s\n", | ||
63 | key, (sum - prev[key]) / interval); | ||
64 | prev[key] = sum; | ||
65 | } | ||
66 | } | ||
67 | } | ||
68 | |||
69 | static void usage(const char *prog) | ||
70 | { | ||
71 | fprintf(stderr, | ||
72 | "usage: %s [OPTS] IFACE\n\n" | ||
73 | "OPTS:\n" | ||
74 | " -S use skb-mode\n" | ||
75 | " -N enforce native mode\n" | ||
76 | " -F force loading prog\n", | ||
77 | prog); | ||
78 | } | ||
79 | |||
80 | int main(int argc, char **argv) | ||
81 | { | ||
82 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
83 | struct bpf_prog_load_attr prog_load_attr = { | ||
84 | .prog_type = BPF_PROG_TYPE_XDP, | ||
85 | }; | ||
86 | struct bpf_prog_info info = {}; | ||
87 | __u32 info_len = sizeof(info); | ||
88 | const char *optstr = "FSN"; | ||
89 | int prog_fd, map_fd, opt; | ||
90 | struct bpf_object *obj; | ||
91 | struct bpf_map *map; | ||
92 | char filename[256]; | ||
93 | int err; | ||
94 | |||
95 | while ((opt = getopt(argc, argv, optstr)) != -1) { | ||
96 | switch (opt) { | ||
97 | case 'S': | ||
98 | xdp_flags |= XDP_FLAGS_SKB_MODE; | ||
99 | break; | ||
100 | case 'N': | ||
101 | /* default, set below */ | ||
102 | break; | ||
103 | case 'F': | ||
104 | xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
105 | break; | ||
106 | default: | ||
107 | usage(basename(argv[0])); | ||
108 | return 1; | ||
109 | } | ||
110 | } | ||
111 | |||
112 | if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) | ||
113 | xdp_flags |= XDP_FLAGS_DRV_MODE; | ||
114 | |||
115 | if (optind == argc) { | ||
116 | usage(basename(argv[0])); | ||
117 | return 1; | ||
118 | } | ||
119 | |||
120 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
121 | perror("setrlimit(RLIMIT_MEMLOCK)"); | ||
122 | return 1; | ||
123 | } | ||
124 | |||
125 | ifindex = if_nametoindex(argv[optind]); | ||
126 | if (!ifindex) { | ||
127 | perror("if_nametoindex"); | ||
128 | return 1; | ||
129 | } | ||
130 | |||
131 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
132 | prog_load_attr.file = filename; | ||
133 | |||
134 | if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) | ||
135 | return 1; | ||
136 | |||
137 | map = bpf_map__next(NULL, obj); | ||
138 | if (!map) { | ||
139 | printf("finding a map in obj file failed\n"); | ||
140 | return 1; | ||
141 | } | ||
142 | map_fd = bpf_map__fd(map); | ||
143 | |||
144 | if (!prog_fd) { | ||
145 | printf("bpf_prog_load_xattr: %s\n", strerror(errno)); | ||
146 | return 1; | ||
147 | } | ||
148 | |||
149 | signal(SIGINT, int_exit); | ||
150 | signal(SIGTERM, int_exit); | ||
151 | |||
152 | if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { | ||
153 | printf("link set xdp fd failed\n"); | ||
154 | return 1; | ||
155 | } | ||
156 | |||
157 | err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); | ||
158 | if (err) { | ||
159 | printf("can't get prog info - %s\n", strerror(errno)); | ||
160 | return err; | ||
161 | } | ||
162 | prog_id = info.id; | ||
163 | |||
164 | poll_stats(map_fd, 2); | ||
165 | |||
166 | return 0; | ||
167 | } | ||
diff --git a/samples/bpf/xdp2_kern.c b/samples/bpf/xdp2_kern.c new file mode 100644 index 000000000..c787f4b49 --- /dev/null +++ b/samples/bpf/xdp2_kern.c | |||
@@ -0,0 +1,114 @@ | |||
1 | /* Copyright (c) 2016 PLUMgrid | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #define KBUILD_MODNAME "foo" | ||
8 | #include <uapi/linux/bpf.h> | ||
9 | #include <linux/in.h> | ||
10 | #include <linux/if_ether.h> | ||
11 | #include <linux/if_packet.h> | ||
12 | #include <linux/if_vlan.h> | ||
13 | #include <linux/ip.h> | ||
14 | #include <linux/ipv6.h> | ||
15 | #include <bpf/bpf_helpers.h> | ||
16 | |||
17 | struct { | ||
18 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
19 | __type(key, u32); | ||
20 | __type(value, long); | ||
21 | __uint(max_entries, 256); | ||
22 | } rxcnt SEC(".maps"); | ||
23 | |||
24 | static void swap_src_dst_mac(void *data) | ||
25 | { | ||
26 | unsigned short *p = data; | ||
27 | unsigned short dst[3]; | ||
28 | |||
29 | dst[0] = p[0]; | ||
30 | dst[1] = p[1]; | ||
31 | dst[2] = p[2]; | ||
32 | p[0] = p[3]; | ||
33 | p[1] = p[4]; | ||
34 | p[2] = p[5]; | ||
35 | p[3] = dst[0]; | ||
36 | p[4] = dst[1]; | ||
37 | p[5] = dst[2]; | ||
38 | } | ||
39 | |||
40 | static int parse_ipv4(void *data, u64 nh_off, void *data_end) | ||
41 | { | ||
42 | struct iphdr *iph = data + nh_off; | ||
43 | |||
44 | if (iph + 1 > data_end) | ||
45 | return 0; | ||
46 | return iph->protocol; | ||
47 | } | ||
48 | |||
49 | static int parse_ipv6(void *data, u64 nh_off, void *data_end) | ||
50 | { | ||
51 | struct ipv6hdr *ip6h = data + nh_off; | ||
52 | |||
53 | if (ip6h + 1 > data_end) | ||
54 | return 0; | ||
55 | return ip6h->nexthdr; | ||
56 | } | ||
57 | |||
58 | SEC("xdp1") | ||
59 | int xdp_prog1(struct xdp_md *ctx) | ||
60 | { | ||
61 | void *data_end = (void *)(long)ctx->data_end; | ||
62 | void *data = (void *)(long)ctx->data; | ||
63 | struct ethhdr *eth = data; | ||
64 | int rc = XDP_DROP; | ||
65 | long *value; | ||
66 | u16 h_proto; | ||
67 | u64 nh_off; | ||
68 | u32 ipproto; | ||
69 | |||
70 | nh_off = sizeof(*eth); | ||
71 | if (data + nh_off > data_end) | ||
72 | return rc; | ||
73 | |||
74 | h_proto = eth->h_proto; | ||
75 | /* peel off up to two stacked VLAN tags (802.1Q / 802.1ad) */ | ||
76 | if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { | ||
77 | struct vlan_hdr *vhdr; | ||
78 | |||
79 | vhdr = data + nh_off; | ||
80 | nh_off += sizeof(struct vlan_hdr); | ||
81 | if (data + nh_off > data_end) | ||
82 | return rc; | ||
83 | h_proto = vhdr->h_vlan_encapsulated_proto; | ||
84 | } | ||
85 | if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { | ||
86 | struct vlan_hdr *vhdr; | ||
87 | |||
88 | vhdr = data + nh_off; | ||
89 | nh_off += sizeof(struct vlan_hdr); | ||
90 | if (data + nh_off > data_end) | ||
91 | return rc; | ||
92 | h_proto = vhdr->h_vlan_encapsulated_proto; | ||
93 | } | ||
94 | |||
95 | if (h_proto == htons(ETH_P_IP)) | ||
96 | ipproto = parse_ipv4(data, nh_off, data_end); | ||
97 | else if (h_proto == htons(ETH_P_IPV6)) | ||
98 | ipproto = parse_ipv6(data, nh_off, data_end); | ||
99 | else | ||
100 | ipproto = 0; | ||
101 | |||
102 | value = bpf_map_lookup_elem(&rxcnt, &ipproto); | ||
103 | if (value) | ||
104 | *value += 1; | ||
105 | |||
106 | if (ipproto == IPPROTO_UDP) { | ||
107 | swap_src_dst_mac(data); | ||
108 | rc = XDP_TX; | ||
109 | } | ||
110 | |||
111 | return rc; | ||
112 | } | ||
113 | |||
114 | char _license[] SEC("license") = "GPL"; | ||
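For readability, here is a stand-alone sketch (illustration only, not the XDP program itself) of what swap_src_dst_mac() above does to the first 12 bytes of the Ethernet header before the packet is bounced back with XDP_TX; it uses memcpy instead of the 16-bit word swaps, which is behaviourally equivalent.

/* Illustration only: swapping source and destination MAC addresses. */
#include <stdio.h>
#include <string.h>

static void swap_src_dst_mac(void *data)
{
	unsigned char *p = data;
	unsigned char tmp[6];

	memcpy(tmp, p, 6);		/* save destination MAC */
	memcpy(p, p + 6, 6);		/* destination <- source */
	memcpy(p + 6, tmp, 6);		/* source <- old destination */
}

int main(void)
{
	unsigned char eth[12] = {
		0x11, 0x22, 0x33, 0x44, 0x55, 0x66,	/* dst MAC */
		0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,	/* src MAC */
	};

	swap_src_dst_mac(eth);
	for (int i = 0; i < 12; i++)
		printf("%02x%s", eth[i], i == 5 ? "  " : " ");
	printf("\n");	/* aa bb cc dd ee ff  11 22 33 44 55 66 */
	return 0;
}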
diff --git a/samples/bpf/xdp2skb_meta.sh b/samples/bpf/xdp2skb_meta.sh new file mode 100755 index 000000000..4bde9d066 --- /dev/null +++ b/samples/bpf/xdp2skb_meta.sh | |||
@@ -0,0 +1,220 @@ | |||
1 | #!/bin/bash | ||
2 | # | ||
3 | # SPDX-License-Identifier: GPL-2.0 | ||
4 | # Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc. | ||
5 | # | ||
6 | # Bash-shell example on using iproute2 tools 'tc' and 'ip' to load | ||
7 | # eBPF programs, both for XDP and clsbpf. Shell script function | ||
8 | # wrappers and even long options parsing is illustrated, for ease of | ||
9 | # use. | ||
10 | # | ||
11 | # Related to samples/bpf/xdp2skb_meta_kern.c, which contains BPF-progs | ||
12 | # that need to collaborate between XDP and TC hooks. Thus, it is | ||
13 | # convenient that the same tool load both programs that need to work | ||
14 | # together. | ||
15 | # | ||
16 | BPF_FILE=xdp2skb_meta_kern.o | ||
17 | DIR=$(dirname $0) | ||
18 | |||
19 | [ -z "$TC" ] && TC=tc | ||
20 | [ -z "$IP" ] && IP=ip | ||
21 | |||
22 | function usage() { | ||
23 | echo "" | ||
24 | echo "Usage: $0 [-vfh] --dev ethX" | ||
25 | echo " -d | --dev : Network device (required)" | ||
26 | echo " --flush : Cleanup flush TC and XDP progs" | ||
27 | echo " --list : (\$LIST) List TC and XDP progs" | ||
28 | echo " -v | --verbose : (\$VERBOSE) Verbose" | ||
29 | echo " --dry-run : (\$DRYRUN) Dry-run only (echo commands)" | ||
30 | echo "" | ||
31 | } | ||
32 | |||
33 | ## -- General shell logging cmds -- | ||
34 | function err() { | ||
35 | local exitcode=$1 | ||
36 | shift | ||
37 | echo "ERROR: $@" >&2 | ||
38 | exit $exitcode | ||
39 | } | ||
40 | |||
41 | function info() { | ||
42 | if [[ -n "$VERBOSE" ]]; then | ||
43 | echo "# $@" | ||
44 | fi | ||
45 | } | ||
46 | |||
47 | ## -- Helper function calls -- | ||
48 | |||
49 | # Wrapper call for TC and IP | ||
50 | # - Will display the offending command on failure | ||
51 | function _call_cmd() { | ||
52 | local cmd="$1" | ||
53 | local allow_fail="$2" | ||
54 | shift 2 | ||
55 | if [[ -n "$VERBOSE" ]]; then | ||
56 | echo "$cmd $@" | ||
57 | fi | ||
58 | if [[ -n "$DRYRUN" ]]; then | ||
59 | return | ||
60 | fi | ||
61 | $cmd "$@" | ||
62 | local status=$? | ||
63 | if (( $status != 0 )); then | ||
64 | if [[ "$allow_fail" == "" ]]; then | ||
65 | err 2 "Exec error($status) occurred cmd: \"$cmd $@\"" | ||
66 | fi | ||
67 | fi | ||
68 | } | ||
69 | function call_tc() { | ||
70 | _call_cmd "$TC" "" "$@" | ||
71 | } | ||
72 | function call_tc_allow_fail() { | ||
73 | _call_cmd "$TC" "allow_fail" "$@" | ||
74 | } | ||
75 | function call_ip() { | ||
76 | _call_cmd "$IP" "" "$@" | ||
77 | } | ||
78 | |||
79 | ## --- Parse command line arguments / parameters --- | ||
80 | # Using external program "getopt" to get --long-options | ||
81 | OPTIONS=$(getopt -o vfhd: \ | ||
82 | --long verbose,flush,help,list,dev:,dry-run -- "$@") | ||
83 | if (( $? != 0 )); then | ||
84 | err 4 "Error calling getopt" | ||
85 | fi | ||
86 | eval set -- "$OPTIONS" | ||
87 | |||
88 | unset DEV | ||
89 | unset FLUSH | ||
90 | while true; do | ||
91 | case "$1" in | ||
92 | -d | --dev ) # device | ||
93 | DEV=$2 | ||
94 | info "Device set to: DEV=$DEV" >&2 | ||
95 | shift 2 | ||
96 | ;; | ||
97 | -v | --verbose) | ||
98 | VERBOSE=yes | ||
99 | # info "Verbose mode: VERBOSE=$VERBOSE" >&2 | ||
100 | shift | ||
101 | ;; | ||
102 | --dry-run ) | ||
103 | DRYRUN=yes | ||
104 | VERBOSE=yes | ||
105 | info "Dry-run mode: enable VERBOSE and don't call TC+IP" >&2 | ||
106 | shift | ||
107 | ;; | ||
108 | -f | --flush ) | ||
109 | FLUSH=yes | ||
110 | shift | ||
111 | ;; | ||
112 | --list ) | ||
113 | LIST=yes | ||
114 | shift | ||
115 | ;; | ||
116 | -- ) | ||
117 | shift | ||
118 | break | ||
119 | ;; | ||
120 | -h | --help ) | ||
121 | usage; | ||
122 | exit 0 | ||
123 | ;; | ||
124 | * ) | ||
125 | shift | ||
126 | break | ||
127 | ;; | ||
128 | esac | ||
129 | done | ||
130 | |||
131 | FILE="$DIR/$BPF_FILE" | ||
132 | if [[ ! -e $FILE ]]; then | ||
133 | err 3 "Missing BPF object file ($FILE)" | ||
134 | fi | ||
135 | |||
136 | if [[ -z $DEV ]]; then | ||
137 | usage | ||
138 | err 2 "Please specify network device -- required option --dev" | ||
139 | fi | ||
140 | |||
141 | ## -- Function calls -- | ||
142 | |||
143 | function list_tc() | ||
144 | { | ||
145 | local device="$1" | ||
146 | shift | ||
147 | info "Listing current TC ingress rules" | ||
148 | call_tc filter show dev $device ingress | ||
149 | } | ||
150 | |||
151 | function list_xdp() | ||
152 | { | ||
153 | local device="$1" | ||
154 | shift | ||
155 | info "Listing current XDP device($device) setting" | ||
156 | call_ip link show dev $device | grep --color=auto xdp | ||
157 | } | ||
158 | |||
159 | function flush_tc() | ||
160 | { | ||
161 | local device="$1" | ||
162 | shift | ||
163 | info "Flush TC on device: $device" | ||
164 | call_tc_allow_fail filter del dev $device ingress | ||
165 | call_tc_allow_fail qdisc del dev $device clsact | ||
166 | } | ||
167 | |||
168 | function flush_xdp() | ||
169 | { | ||
170 | local device="$1" | ||
171 | shift | ||
172 | info "Flush XDP on device: $device" | ||
173 | call_ip link set dev $device xdp off | ||
174 | } | ||
175 | |||
176 | function attach_tc_mark() | ||
177 | { | ||
178 | local device="$1" | ||
179 | local file="$2" | ||
180 | local prog="tc_mark" | ||
181 | shift 2 | ||
182 | |||
183 | # Re-attach clsact to clear/flush existing rules | ||
184 | call_tc_allow_fail qdisc del dev $device clsact 2> /dev/null | ||
185 | call_tc qdisc add dev $device clsact | ||
186 | |||
187 | # Attach BPF prog | ||
188 | call_tc filter add dev $device ingress \ | ||
189 | prio 1 handle 1 bpf da obj $file sec $prog | ||
190 | } | ||
191 | |||
192 | function attach_xdp_mark() | ||
193 | { | ||
194 | local device="$1" | ||
195 | local file="$2" | ||
196 | local prog="xdp_mark" | ||
197 | shift 2 | ||
198 | |||
199 | # Remove XDP prog in case it's already loaded | ||
200 | # TODO: Need ip-link option to override/replace existing XDP prog | ||
201 | flush_xdp $device | ||
202 | |||
203 | # Attach XDP/BPF prog | ||
204 | call_ip link set dev $device xdp obj $file sec $prog | ||
205 | } | ||
206 | |||
207 | if [[ -n $FLUSH ]]; then | ||
208 | flush_tc $DEV | ||
209 | flush_xdp $DEV | ||
210 | exit 0 | ||
211 | fi | ||
212 | |||
213 | if [[ -n $LIST ]]; then | ||
214 | list_tc $DEV | ||
215 | list_xdp $DEV | ||
216 | exit 0 | ||
217 | fi | ||
218 | |||
219 | attach_tc_mark $DEV $FILE | ||
220 | attach_xdp_mark $DEV $FILE | ||
diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c new file mode 100644 index 000000000..9b783316e --- /dev/null +++ b/samples/bpf/xdp2skb_meta_kern.c | |||
@@ -0,0 +1,105 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 | ||
2 | * Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc. | ||
3 | * | ||
4 | * Example of how to transfer info from XDP to SKB, e.g. skb->mark | ||
5 | * ----------------------------------------------------------- | ||
6 | * This uses the XDP data_meta infrastructure, and is a cooperation | ||
7 | * between two bpf-programs (1) XDP and (2) clsact at TC-ingress hook. | ||
8 | * | ||
9 | * Notice: This example does not use the BPF C-loader (bpf_load.c), | ||
10 | * but instead relies on the iproute2 TC tool for loading BPF-objects. | ||
11 | */ | ||
12 | #include <uapi/linux/bpf.h> | ||
13 | #include <uapi/linux/pkt_cls.h> | ||
14 | |||
15 | #include <bpf/bpf_helpers.h> | ||
16 | |||
17 | /* | ||
18 | * This struct is stored in the XDP 'data_meta' area, which is located | ||
19 | * just in-front-of the raw packet payload data. The meaning is | ||
20 | * specific to these two BPF programs that use it as a communication | ||
21 | * channel. XDP adjusts/increases the area via a bpf-helper, and TC uses | ||
22 | * boundary checks to see if data has been provided. | ||
23 | * | ||
24 | * The struct must be 4 byte aligned, which here is enforced by the | ||
25 | * struct __attribute__((aligned(4))). | ||
26 | */ | ||
27 | struct meta_info { | ||
28 | __u32 mark; | ||
29 | } __attribute__((aligned(4))); | ||
30 | |||
31 | SEC("xdp_mark") | ||
32 | int _xdp_mark(struct xdp_md *ctx) | ||
33 | { | ||
34 | struct meta_info *meta; | ||
35 | void *data, *data_end; | ||
36 | int ret; | ||
37 | |||
38 | /* Reserve space in-front of data pointer for our meta info. | ||
39 | * (Notice drivers not supporting data_meta will fail here!) | ||
40 | */ | ||
41 | ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)); | ||
42 | if (ret < 0) | ||
43 | return XDP_ABORTED; | ||
44 | |||
45 | /* Notice: Kernel-side verifier requires that loading of | ||
46 | * ctx->data MUST happen _after_ helper bpf_xdp_adjust_meta(), | ||
47 | * as pkt-data pointers are invalidated. Helpers that require | ||
48 | * this are determined/marked by bpf_helper_changes_pkt_data() | ||
49 | */ | ||
50 | data = (void *)(unsigned long)ctx->data; | ||
51 | |||
52 | /* Check data_meta has room for the meta_info struct */ | ||
53 | meta = (void *)(unsigned long)ctx->data_meta; | ||
54 | if (meta + 1 > data) | ||
55 | return XDP_ABORTED; | ||
56 | |||
57 | meta->mark = 42; | ||
58 | |||
59 | return XDP_PASS; | ||
60 | } | ||
61 | |||
62 | SEC("tc_mark") | ||
63 | int _tc_mark(struct __sk_buff *ctx) | ||
64 | { | ||
65 | void *data = (void *)(unsigned long)ctx->data; | ||
66 | void *data_end = (void *)(unsigned long)ctx->data_end; | ||
67 | void *data_meta = (void *)(unsigned long)ctx->data_meta; | ||
68 | struct meta_info *meta = data_meta; | ||
69 | |||
70 | /* Check XDP gave us some data_meta */ | ||
71 | if (meta + 1 > data) { | ||
72 | ctx->mark = 41; | ||
73 | /* Skip "accept" if no data_meta is avail */ | ||
74 | return TC_ACT_OK; | ||
75 | } | ||
76 | |||
77 | /* Hint: See func tc_cls_act_is_valid_access() for BPF_WRITE access */ | ||
78 | ctx->mark = meta->mark; /* Transfer XDP-mark to SKB-mark */ | ||
79 | |||
80 | return TC_ACT_OK; | ||
81 | } | ||
82 | |||
83 | /* Manually attaching these programs: | ||
84 | export DEV=ixgbe2 | ||
85 | export FILE=xdp2skb_meta_kern.o | ||
86 | |||
87 | # via TC command | ||
88 | tc qdisc del dev $DEV clsact 2> /dev/null | ||
89 | tc qdisc add dev $DEV clsact | ||
90 | tc filter add dev $DEV ingress prio 1 handle 1 bpf da obj $FILE sec tc_mark | ||
91 | tc filter show dev $DEV ingress | ||
92 | |||
93 | # XDP via IP command: | ||
94 | ip link set dev $DEV xdp off | ||
95 | ip link set dev $DEV xdp obj $FILE sec xdp_mark | ||
96 | |||
97 | # Use iptable to "see" if SKBs are marked | ||
98 | iptables -I INPUT -p icmp -m mark --mark 41 # == 0x29 | ||
99 | iptables -I INPUT -p icmp -m mark --mark 42 # == 0x2a | ||
100 | |||
101 | # Hint: catch XDP_ABORTED errors via | ||
102 | perf record -e xdp:* | ||
103 | perf script | ||
104 | |||
105 | */ | ||
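The alignment requirement spelled out in the struct comment above is easy to break when meta_info is later extended. Below is an editor-added sketch (a userspace mirror of the struct, not part of the sample) that turns the 4-byte rule into a compile-time check with C11 _Static_assert; the 32-byte cap reflects the limit the kernel applies to the data_meta area at the time of writing.

#include <stdint.h>

/* Userspace mirror of the struct shared via the XDP data_meta area */
struct meta_info {
	uint32_t mark;
} __attribute__((aligned(4)));

/* bpf_xdp_adjust_meta() rejects metadata sizes that are not a multiple
 * of 4, so catch any future field addition that breaks this at build time. */
_Static_assert(sizeof(struct meta_info) % 4 == 0,
	       "meta_info size must stay a multiple of 4 bytes");
_Static_assert(_Alignof(struct meta_info) == 4,
	       "meta_info must be 4-byte aligned");
_Static_assert(sizeof(struct meta_info) <= 32,
	       "data_meta area is capped (32 bytes in current kernels)");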
diff --git a/samples/bpf/xdp_adjust_tail_kern.c b/samples/bpf/xdp_adjust_tail_kern.c new file mode 100644 index 000000000..ffdd54862 --- /dev/null +++ b/samples/bpf/xdp_adjust_tail_kern.c | |||
@@ -0,0 +1,155 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 | ||
2 | * Copyright (c) 2018 Facebook | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of version 2 of the GNU General Public | ||
6 | * License as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program shows how to use bpf_xdp_adjust_tail() by | ||
9 | * generating ICMPv4 "packet too big" (strictly: destination unreachable, | ||
10 | * fragmentation needed and DF set, to be more precise in the v4 case) when | ||
11 | * receiving packets bigger than 600 bytes. | ||
12 | */ | ||
13 | #define KBUILD_MODNAME "foo" | ||
14 | #include <uapi/linux/bpf.h> | ||
15 | #include <linux/in.h> | ||
16 | #include <linux/if_ether.h> | ||
17 | #include <linux/if_packet.h> | ||
18 | #include <linux/if_vlan.h> | ||
19 | #include <linux/ip.h> | ||
20 | #include <linux/icmp.h> | ||
21 | #include <bpf/bpf_helpers.h> | ||
22 | |||
23 | #define DEFAULT_TTL 64 | ||
24 | #define MAX_PCKT_SIZE 600 | ||
25 | #define ICMP_TOOBIG_SIZE 98 | ||
26 | #define ICMP_TOOBIG_PAYLOAD_SIZE 92 | ||
27 | |||
28 | /* volatile to prevent compiler optimizations */ | ||
29 | static volatile __u32 max_pcktsz = MAX_PCKT_SIZE; | ||
30 | |||
31 | struct { | ||
32 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
33 | __type(key, __u32); | ||
34 | __type(value, __u64); | ||
35 | __uint(max_entries, 1); | ||
36 | } icmpcnt SEC(".maps"); | ||
37 | |||
38 | static __always_inline void count_icmp(void) | ||
39 | { | ||
40 | u64 key = 0; | ||
41 | u64 *icmp_count; | ||
42 | |||
43 | icmp_count = bpf_map_lookup_elem(&icmpcnt, &key); | ||
44 | if (icmp_count) | ||
45 | *icmp_count += 1; | ||
46 | } | ||
47 | |||
48 | static __always_inline void swap_mac(void *data, struct ethhdr *orig_eth) | ||
49 | { | ||
50 | struct ethhdr *eth; | ||
51 | |||
52 | eth = data; | ||
53 | memcpy(eth->h_source, orig_eth->h_dest, ETH_ALEN); | ||
54 | memcpy(eth->h_dest, orig_eth->h_source, ETH_ALEN); | ||
55 | eth->h_proto = orig_eth->h_proto; | ||
56 | } | ||
57 | |||
58 | static __always_inline __u16 csum_fold_helper(__u32 csum) | ||
59 | { | ||
60 | return ~((csum & 0xffff) + (csum >> 16)); | ||
61 | } | ||
62 | |||
63 | static __always_inline void ipv4_csum(void *data_start, int data_size, | ||
64 | __u32 *csum) | ||
65 | { | ||
66 | *csum = bpf_csum_diff(0, 0, data_start, data_size, *csum); | ||
67 | *csum = csum_fold_helper(*csum); | ||
68 | } | ||
69 | |||
70 | static __always_inline int send_icmp4_too_big(struct xdp_md *xdp) | ||
71 | { | ||
72 | int headroom = (int)sizeof(struct iphdr) + (int)sizeof(struct icmphdr); | ||
73 | |||
74 | if (bpf_xdp_adjust_head(xdp, 0 - headroom)) | ||
75 | return XDP_DROP; | ||
76 | void *data = (void *)(long)xdp->data; | ||
77 | void *data_end = (void *)(long)xdp->data_end; | ||
78 | |||
79 | if (data + (ICMP_TOOBIG_SIZE + headroom) > data_end) | ||
80 | return XDP_DROP; | ||
81 | |||
82 | struct iphdr *iph, *orig_iph; | ||
83 | struct icmphdr *icmp_hdr; | ||
84 | struct ethhdr *orig_eth; | ||
85 | __u32 csum = 0; | ||
86 | __u64 off = 0; | ||
87 | |||
88 | orig_eth = data + headroom; | ||
89 | swap_mac(data, orig_eth); | ||
90 | off += sizeof(struct ethhdr); | ||
91 | iph = data + off; | ||
92 | off += sizeof(struct iphdr); | ||
93 | icmp_hdr = data + off; | ||
94 | off += sizeof(struct icmphdr); | ||
95 | orig_iph = data + off; | ||
96 | icmp_hdr->type = ICMP_DEST_UNREACH; | ||
97 | icmp_hdr->code = ICMP_FRAG_NEEDED; | ||
98 | icmp_hdr->un.frag.mtu = htons(max_pcktsz - sizeof(struct ethhdr)); | ||
99 | icmp_hdr->checksum = 0; | ||
100 | ipv4_csum(icmp_hdr, ICMP_TOOBIG_PAYLOAD_SIZE, &csum); | ||
101 | icmp_hdr->checksum = csum; | ||
102 | iph->ttl = DEFAULT_TTL; | ||
103 | iph->daddr = orig_iph->saddr; | ||
104 | iph->saddr = orig_iph->daddr; | ||
105 | iph->version = 4; | ||
106 | iph->ihl = 5; | ||
107 | iph->protocol = IPPROTO_ICMP; | ||
108 | iph->tos = 0; | ||
109 | iph->tot_len = htons( | ||
110 | ICMP_TOOBIG_SIZE + headroom - sizeof(struct ethhdr)); | ||
111 | iph->check = 0; | ||
112 | csum = 0; | ||
113 | ipv4_csum(iph, sizeof(struct iphdr), &csum); | ||
114 | iph->check = csum; | ||
115 | count_icmp(); | ||
116 | return XDP_TX; | ||
117 | } | ||
118 | |||
119 | |||
120 | static __always_inline int handle_ipv4(struct xdp_md *xdp) | ||
121 | { | ||
122 | void *data_end = (void *)(long)xdp->data_end; | ||
123 | void *data = (void *)(long)xdp->data; | ||
124 | int pckt_size = data_end - data; | ||
125 | int offset; | ||
126 | |||
127 | if (pckt_size > max(max_pcktsz, ICMP_TOOBIG_SIZE)) { | ||
128 | offset = pckt_size - ICMP_TOOBIG_SIZE; | ||
129 | if (bpf_xdp_adjust_tail(xdp, 0 - offset)) | ||
130 | return XDP_PASS; | ||
131 | return send_icmp4_too_big(xdp); | ||
132 | } | ||
133 | return XDP_PASS; | ||
134 | } | ||
135 | |||
136 | SEC("xdp_icmp") | ||
137 | int _xdp_icmp(struct xdp_md *xdp) | ||
138 | { | ||
139 | void *data_end = (void *)(long)xdp->data_end; | ||
140 | void *data = (void *)(long)xdp->data; | ||
141 | struct ethhdr *eth = data; | ||
142 | __u16 h_proto; | ||
143 | |||
144 | if (eth + 1 > data_end) | ||
145 | return XDP_DROP; | ||
146 | |||
147 | h_proto = eth->h_proto; | ||
148 | |||
149 | if (h_proto == htons(ETH_P_IP)) | ||
150 | return handle_ipv4(xdp); | ||
151 | else | ||
152 | return XDP_PASS; | ||
153 | } | ||
154 | |||
155 | char _license[] SEC("license") = "GPL"; | ||
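ipv4_csum() above feeds the 32-bit partial sum from bpf_csum_diff() through csum_fold_helper() to get the final 16-bit Internet checksum. The editor-added userspace sketch below (function names and packet bytes are illustrative, not part of the sample) reproduces that fold-and-invert step with a trivial summer and verifies the classic property that re-summing a buffer with its checksum in place folds to zero. The sketch folds in a loop for generality, whereas the sample's csum_fold_helper() folds once, which is enough for the small buffers it handles.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 16-bit one's-complement accumulation, like bpf_csum_diff() over a buffer */
static uint32_t sum16(const void *buf, int len, uint32_t csum)
{
	const uint16_t *p = buf;

	while (len > 1) {
		csum += *p++;
		len -= 2;
	}
	if (len)			/* trailing odd byte */
		csum += *(const uint8_t *)p;
	return csum;
}

/* Fold all carries into 16 bits and invert */
static uint16_t csum_fold(uint32_t csum)
{
	while (csum >> 16)
		csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}

int main(void)
{
	/* Illustrative ICMP "frag needed" header + payload, checksum zeroed */
	uint8_t pkt[16] __attribute__((aligned(2))) = {
		3, 4, 0, 0,		/* type, code, checksum */
		0, 0, 0x02, 0x4a,	/* unused, next-hop MTU */
		0x45, 0, 0, 0x54, 0, 0, 0, 0 };
	uint16_t csum = csum_fold(sum16(pkt, sizeof(pkt), 0));

	memcpy(&pkt[2], &csum, sizeof(csum));

	/* With the checksum in place the whole buffer must fold to zero */
	printf("verify: 0x%04x (expect 0x0000)\n",
	       csum_fold(sum16(pkt, sizeof(pkt), 0)));
	return 0;
}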
diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c new file mode 100644 index 000000000..ba482dc3d --- /dev/null +++ b/samples/bpf/xdp_adjust_tail_user.c | |||
@@ -0,0 +1,198 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 | ||
2 | * Copyright (c) 2018 Facebook | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of version 2 of the GNU General Public | ||
6 | * License as published by the Free Software Foundation. | ||
7 | */ | ||
8 | #include <linux/bpf.h> | ||
9 | #include <linux/if_link.h> | ||
10 | #include <assert.h> | ||
11 | #include <errno.h> | ||
12 | #include <signal.h> | ||
13 | #include <stdio.h> | ||
14 | #include <stdlib.h> | ||
15 | #include <string.h> | ||
16 | #include <net/if.h> | ||
17 | #include <sys/resource.h> | ||
18 | #include <arpa/inet.h> | ||
19 | #include <netinet/ether.h> | ||
20 | #include <unistd.h> | ||
21 | #include <time.h> | ||
22 | #include <bpf/bpf.h> | ||
23 | #include <bpf/libbpf.h> | ||
24 | |||
25 | #define STATS_INTERVAL_S 2U | ||
26 | #define MAX_PCKT_SIZE 600 | ||
27 | |||
28 | static int ifindex = -1; | ||
29 | static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
30 | static __u32 prog_id; | ||
31 | |||
32 | static void int_exit(int sig) | ||
33 | { | ||
34 | __u32 curr_prog_id = 0; | ||
35 | |||
36 | if (ifindex > -1) { | ||
37 | if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) { | ||
38 | printf("bpf_get_link_xdp_id failed\n"); | ||
39 | exit(1); | ||
40 | } | ||
41 | if (prog_id == curr_prog_id) | ||
42 | bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); | ||
43 | else if (!curr_prog_id) | ||
44 | printf("couldn't find a prog id on a given iface\n"); | ||
45 | else | ||
46 | printf("program on interface changed, not removing\n"); | ||
47 | } | ||
48 | exit(0); | ||
49 | } | ||
50 | |||
51 | /* simple "icmp packet too big sent" counter | ||
52 | */ | ||
53 | static void poll_stats(unsigned int map_fd, unsigned int kill_after_s) | ||
54 | { | ||
55 | time_t started_at = time(NULL); | ||
56 | __u64 value = 0; | ||
57 | int key = 0; | ||
58 | |||
59 | |||
60 | while (!kill_after_s || time(NULL) - started_at <= kill_after_s) { | ||
61 | sleep(STATS_INTERVAL_S); | ||
62 | |||
63 | assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); | ||
64 | |||
65 | printf("icmp \"packet too big\" sent: %10llu pkts\n", value); | ||
66 | } | ||
67 | } | ||
68 | |||
69 | static void usage(const char *cmd) | ||
70 | { | ||
71 | printf("Start a XDP prog which send ICMP \"packet too big\" \n" | ||
72 | "messages if ingress packet is bigger then MAX_SIZE bytes\n"); | ||
73 | printf("Usage: %s [...]\n", cmd); | ||
74 | printf(" -i <ifname|ifindex> Interface\n"); | ||
75 | printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n"); | ||
76 | printf(" -P <MAX_PCKT_SIZE> Default: %u\n", MAX_PCKT_SIZE); | ||
77 | printf(" -S use skb-mode\n"); | ||
78 | printf(" -N enforce native mode\n"); | ||
79 | printf(" -F force loading prog\n"); | ||
80 | printf(" -h Display this help\n"); | ||
81 | } | ||
82 | |||
83 | int main(int argc, char **argv) | ||
84 | { | ||
85 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
86 | struct bpf_prog_load_attr prog_load_attr = { | ||
87 | .prog_type = BPF_PROG_TYPE_XDP, | ||
88 | }; | ||
89 | unsigned char opt_flags[256] = {}; | ||
90 | const char *optstr = "i:T:P:SNFh"; | ||
91 | struct bpf_prog_info info = {}; | ||
92 | __u32 info_len = sizeof(info); | ||
93 | unsigned int kill_after_s = 0; | ||
94 | int i, prog_fd, map_fd, opt; | ||
95 | struct bpf_object *obj; | ||
96 | __u32 max_pckt_size = 0; | ||
97 | __u32 key = 0; | ||
98 | char filename[256]; | ||
99 | int err; | ||
100 | |||
101 | for (i = 0; i < strlen(optstr); i++) | ||
102 | if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z') | ||
103 | opt_flags[(unsigned char)optstr[i]] = 1; | ||
104 | |||
105 | while ((opt = getopt(argc, argv, optstr)) != -1) { | ||
106 | |||
107 | switch (opt) { | ||
108 | case 'i': | ||
109 | ifindex = if_nametoindex(optarg); | ||
110 | if (!ifindex) | ||
111 | ifindex = atoi(optarg); | ||
112 | break; | ||
113 | case 'T': | ||
114 | kill_after_s = atoi(optarg); | ||
115 | break; | ||
116 | case 'P': | ||
117 | max_pckt_size = atoi(optarg); | ||
118 | break; | ||
119 | case 'S': | ||
120 | xdp_flags |= XDP_FLAGS_SKB_MODE; | ||
121 | break; | ||
122 | case 'N': | ||
123 | /* default, set below */ | ||
124 | break; | ||
125 | case 'F': | ||
126 | xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
127 | break; | ||
128 | default: | ||
129 | usage(argv[0]); | ||
130 | return 1; | ||
131 | } | ||
132 | opt_flags[opt] = 0; | ||
133 | } | ||
134 | |||
135 | if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) | ||
136 | xdp_flags |= XDP_FLAGS_DRV_MODE; | ||
137 | |||
138 | for (i = 0; i < strlen(optstr); i++) { | ||
139 | if (opt_flags[(unsigned int)optstr[i]]) { | ||
140 | fprintf(stderr, "Missing argument -%c\n", optstr[i]); | ||
141 | usage(argv[0]); | ||
142 | return 1; | ||
143 | } | ||
144 | } | ||
145 | |||
146 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
147 | perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); | ||
148 | return 1; | ||
149 | } | ||
150 | |||
151 | if (!ifindex) { | ||
152 | fprintf(stderr, "Invalid ifname\n"); | ||
153 | return 1; | ||
154 | } | ||
155 | |||
156 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
157 | prog_load_attr.file = filename; | ||
158 | |||
159 | if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) | ||
160 | return 1; | ||
161 | |||
162 | /* static global var 'max_pcktsz' is accessible from .data section */ | ||
163 | if (max_pckt_size) { | ||
164 | map_fd = bpf_object__find_map_fd_by_name(obj, "xdp_adju.data"); | ||
165 | if (map_fd < 0) { | ||
166 | printf("finding a max_pcktsz map in obj file failed\n"); | ||
167 | return 1; | ||
168 | } | ||
169 | bpf_map_update_elem(map_fd, &key, &max_pckt_size, BPF_ANY); | ||
170 | } | ||
171 | |||
172 | /* fetch icmpcnt map */ | ||
173 | map_fd = bpf_object__find_map_fd_by_name(obj, "icmpcnt"); | ||
174 | if (map_fd < 0) { | ||
175 | printf("finding a icmpcnt map in obj file failed\n"); | ||
176 | return 1; | ||
177 | } | ||
178 | |||
179 | signal(SIGINT, int_exit); | ||
180 | signal(SIGTERM, int_exit); | ||
181 | |||
182 | if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { | ||
183 | printf("link set xdp fd failed\n"); | ||
184 | return 1; | ||
185 | } | ||
186 | |||
187 | err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); | ||
188 | if (err) { | ||
189 | printf("can't get prog info - %s\n", strerror(errno)); | ||
190 | return 1; | ||
191 | } | ||
192 | prog_id = info.id; | ||
193 | |||
194 | poll_stats(map_fd, kill_after_s); | ||
195 | int_exit(0); | ||
196 | |||
197 | return 0; | ||
198 | } | ||
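The hard-coded map name "xdp_adju.data" depends on libbpf's convention of naming the internal .data map after a truncated (8-character) prefix of the object file name. The editor-added sketch below shows a more robust lookup under the assumption that the same libbpf version as the rest of the sample is in use; the helper name itself is made up.

#include <string.h>
#include <bpf/libbpf.h>
#include <bpf/bpf.h>

/* Walk the object's maps and return the fd of its internal ".data" map,
 * instead of hard-coding the truncated name libbpf generated for it. */
static int find_data_map_fd(struct bpf_object *obj)
{
	struct bpf_map *map;

	bpf_object__for_each_map(map, obj) {
		const char *name = bpf_map__name(map);
		size_t len = strlen(name);

		if (len > 5 && strcmp(name + len - 5, ".data") == 0)
			return bpf_map__fd(map);
	}
	return -1;
}

With such a helper the max_pckt_size override above would obtain map_fd = find_data_map_fd(obj) and keep the same bpf_map_update_elem() call.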
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c new file mode 100644 index 000000000..54c099cbd --- /dev/null +++ b/samples/bpf/xdp_fwd_kern.c | |||
@@ -0,0 +1,158 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of version 2 of the GNU General Public | ||
6 | * License as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | */ | ||
13 | #define KBUILD_MODNAME "foo" | ||
14 | #include <uapi/linux/bpf.h> | ||
15 | #include <linux/in.h> | ||
16 | #include <linux/if_ether.h> | ||
17 | #include <linux/if_packet.h> | ||
18 | #include <linux/if_vlan.h> | ||
19 | #include <linux/ip.h> | ||
20 | #include <linux/ipv6.h> | ||
21 | |||
22 | #include <bpf/bpf_helpers.h> | ||
23 | |||
24 | #define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF) | ||
25 | |||
26 | struct { | ||
27 | __uint(type, BPF_MAP_TYPE_DEVMAP); | ||
28 | __uint(key_size, sizeof(int)); | ||
29 | __uint(value_size, sizeof(int)); | ||
30 | __uint(max_entries, 64); | ||
31 | } xdp_tx_ports SEC(".maps"); | ||
32 | |||
33 | /* from include/net/ip.h */ | ||
34 | static __always_inline int ip_decrease_ttl(struct iphdr *iph) | ||
35 | { | ||
36 | u32 check = (__force u32)iph->check; | ||
37 | |||
38 | check += (__force u32)htons(0x0100); | ||
39 | iph->check = (__force __sum16)(check + (check >= 0xFFFF)); | ||
40 | return --iph->ttl; | ||
41 | } | ||
42 | |||
43 | static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) | ||
44 | { | ||
45 | void *data_end = (void *)(long)ctx->data_end; | ||
46 | void *data = (void *)(long)ctx->data; | ||
47 | struct bpf_fib_lookup fib_params; | ||
48 | struct ethhdr *eth = data; | ||
49 | struct ipv6hdr *ip6h; | ||
50 | struct iphdr *iph; | ||
51 | u16 h_proto; | ||
52 | u64 nh_off; | ||
53 | int rc; | ||
54 | |||
55 | nh_off = sizeof(*eth); | ||
56 | if (data + nh_off > data_end) | ||
57 | return XDP_DROP; | ||
58 | |||
59 | __builtin_memset(&fib_params, 0, sizeof(fib_params)); | ||
60 | |||
61 | h_proto = eth->h_proto; | ||
62 | if (h_proto == htons(ETH_P_IP)) { | ||
63 | iph = data + nh_off; | ||
64 | |||
65 | if (iph + 1 > data_end) | ||
66 | return XDP_DROP; | ||
67 | |||
68 | if (iph->ttl <= 1) | ||
69 | return XDP_PASS; | ||
70 | |||
71 | fib_params.family = AF_INET; | ||
72 | fib_params.tos = iph->tos; | ||
73 | fib_params.l4_protocol = iph->protocol; | ||
74 | fib_params.sport = 0; | ||
75 | fib_params.dport = 0; | ||
76 | fib_params.tot_len = ntohs(iph->tot_len); | ||
77 | fib_params.ipv4_src = iph->saddr; | ||
78 | fib_params.ipv4_dst = iph->daddr; | ||
79 | } else if (h_proto == htons(ETH_P_IPV6)) { | ||
80 | struct in6_addr *src = (struct in6_addr *) fib_params.ipv6_src; | ||
81 | struct in6_addr *dst = (struct in6_addr *) fib_params.ipv6_dst; | ||
82 | |||
83 | ip6h = data + nh_off; | ||
84 | if (ip6h + 1 > data_end) | ||
85 | return XDP_DROP; | ||
86 | |||
87 | if (ip6h->hop_limit <= 1) | ||
88 | return XDP_PASS; | ||
89 | |||
90 | fib_params.family = AF_INET6; | ||
91 | fib_params.flowinfo = *(__be32 *)ip6h & IPV6_FLOWINFO_MASK; | ||
92 | fib_params.l4_protocol = ip6h->nexthdr; | ||
93 | fib_params.sport = 0; | ||
94 | fib_params.dport = 0; | ||
95 | fib_params.tot_len = ntohs(ip6h->payload_len); | ||
96 | *src = ip6h->saddr; | ||
97 | *dst = ip6h->daddr; | ||
98 | } else { | ||
99 | return XDP_PASS; | ||
100 | } | ||
101 | |||
102 | fib_params.ifindex = ctx->ingress_ifindex; | ||
103 | |||
104 | rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags); | ||
105 | /* | ||
106 | * Some rc (return codes) from bpf_fib_lookup() are important | ||
107 | * for understanding how this XDP prog interacts with the network stack. | ||
108 | * | ||
109 | * BPF_FIB_LKUP_RET_NO_NEIGH: | ||
110 | * Even if the route lookup was a success, the MAC addresses are also | ||
111 | * needed. These come from the arp/neighbour table, but if that table is | ||
112 | * (still) empty then BPF_FIB_LKUP_RET_NO_NEIGH is returned. To avoid | ||
113 | * doing the ARP lookup directly from XDP, send the packet to the normal | ||
114 | * network stack via XDP_PASS and expect it to do the ARP resolution. | ||
115 | * | ||
116 | * BPF_FIB_LKUP_RET_FWD_DISABLED: | ||
117 | * bpf_fib_lookup() respects the sysctl net.ipv{4,6}.conf.all.forwarding | ||
118 | * setting, and will return BPF_FIB_LKUP_RET_FWD_DISABLED if forwarding | ||
119 | * is not enabled on the ingress device. | ||
120 | */ | ||
121 | if (rc == BPF_FIB_LKUP_RET_SUCCESS) { | ||
122 | /* Verify egress index has been configured as TX-port. | ||
123 | * (Note: User can still have inserted an egress ifindex that | ||
124 | * doesn't support XDP xmit, which will result in packet drops). | ||
125 | * | ||
126 | * Note: lookup in devmap supported since 0cdbb4b09a0. | ||
127 | * If not supported will fail with: | ||
128 | * cannot pass map_type 14 into func bpf_map_lookup_elem#1: | ||
129 | */ | ||
130 | if (!bpf_map_lookup_elem(&xdp_tx_ports, &fib_params.ifindex)) | ||
131 | return XDP_PASS; | ||
132 | |||
133 | if (h_proto == htons(ETH_P_IP)) | ||
134 | ip_decrease_ttl(iph); | ||
135 | else if (h_proto == htons(ETH_P_IPV6)) | ||
136 | ip6h->hop_limit--; | ||
137 | |||
138 | memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); | ||
139 | memcpy(eth->h_source, fib_params.smac, ETH_ALEN); | ||
140 | return bpf_redirect_map(&xdp_tx_ports, fib_params.ifindex, 0); | ||
141 | } | ||
142 | |||
143 | return XDP_PASS; | ||
144 | } | ||
145 | |||
146 | SEC("xdp_fwd") | ||
147 | int xdp_fwd_prog(struct xdp_md *ctx) | ||
148 | { | ||
149 | return xdp_fwd_flags(ctx, 0); | ||
150 | } | ||
151 | |||
152 | SEC("xdp_fwd_direct") | ||
153 | int xdp_fwd_direct_prog(struct xdp_md *ctx) | ||
154 | { | ||
155 | return xdp_fwd_flags(ctx, BPF_FIB_LOOKUP_DIRECT); | ||
156 | } | ||
157 | |||
158 | char _license[] SEC("license") = "GPL"; | ||
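ip_decrease_ttl() above avoids recomputing the IPv4 header checksum: it adds htons(0x0100) with an end-around carry, the incremental-update trick from RFC 1141/1624. A hedged, editor-added userspace sketch that checks the shortcut against a full recompute for one example header (all header values are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

/* Full 16-bit one's-complement checksum over 'words' 16-bit words */
static uint16_t ip_csum(const uint16_t *hdr, int words)
{
	uint32_t sum = 0;

	while (words--)
		sum += *hdr++;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* 20-byte IPv4 header: version/ihl, tos, tot_len, id, frag_off,
	 * ttl=64, proto=ICMP, checksum (filled below), saddr, daddr */
	uint8_t hdr[20] __attribute__((aligned(2))) = {
		0x45, 0x00, 0x00, 0x54, 0x12, 0x34, 0x40, 0x00,
		64, 1, 0, 0, 192, 0, 2, 1, 192, 0, 2, 2 };
	uint16_t *csum = (uint16_t *)&hdr[10];
	uint16_t incremental;
	uint32_t check;

	*csum = ip_csum((uint16_t *)hdr, 10);

	/* Incremental update, same arithmetic as ip_decrease_ttl() */
	check = *csum;
	check += htons(0x0100);
	incremental = (uint16_t)(check + (check >= 0xffff));

	/* Full recompute after actually decrementing the TTL */
	hdr[8]--;
	*csum = 0;
	*csum = ip_csum((uint16_t *)hdr, 10);

	printf("incremental=0x%04x full=0x%04x %s\n", incremental, *csum,
	       incremental == *csum ? "(match)" : "(MISMATCH)");
	return 0;
}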
diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c new file mode 100644 index 000000000..74a4583d0 --- /dev/null +++ b/samples/bpf/xdp_fwd_user.c | |||
@@ -0,0 +1,170 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of version 2 of the GNU General Public | ||
6 | * License as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | */ | ||
13 | |||
14 | #include <linux/bpf.h> | ||
15 | #include <linux/if_link.h> | ||
16 | #include <linux/limits.h> | ||
17 | #include <net/if.h> | ||
18 | #include <errno.h> | ||
19 | #include <stdio.h> | ||
20 | #include <stdlib.h> | ||
21 | #include <stdbool.h> | ||
22 | #include <string.h> | ||
23 | #include <unistd.h> | ||
24 | #include <fcntl.h> | ||
25 | #include <libgen.h> | ||
26 | |||
27 | #include <bpf/libbpf.h> | ||
28 | #include <bpf/bpf.h> | ||
29 | |||
30 | static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
31 | |||
32 | static int do_attach(int idx, int prog_fd, int map_fd, const char *name) | ||
33 | { | ||
34 | int err; | ||
35 | |||
36 | err = bpf_set_link_xdp_fd(idx, prog_fd, xdp_flags); | ||
37 | if (err < 0) { | ||
38 | printf("ERROR: failed to attach program to %s\n", name); | ||
39 | return err; | ||
40 | } | ||
41 | |||
42 | /* Adding ifindex as a possible egress TX port */ | ||
43 | err = bpf_map_update_elem(map_fd, &idx, &idx, 0); | ||
44 | if (err) | ||
45 | printf("ERROR: failed using device %s as TX-port\n", name); | ||
46 | |||
47 | return err; | ||
48 | } | ||
49 | |||
50 | static int do_detach(int idx, const char *name) | ||
51 | { | ||
52 | int err; | ||
53 | |||
54 | err = bpf_set_link_xdp_fd(idx, -1, xdp_flags); | ||
55 | if (err < 0) | ||
56 | printf("ERROR: failed to detach program from %s\n", name); | ||
57 | |||
58 | /* TODO: Remember to cleanup map, when adding use of shared map | ||
59 | * bpf_map_delete_elem(map_fd, &idx); | ||
60 | */ | ||
61 | return err; | ||
62 | } | ||
63 | |||
64 | static void usage(const char *prog) | ||
65 | { | ||
66 | fprintf(stderr, | ||
67 | "usage: %s [OPTS] interface-list\n" | ||
68 | "\nOPTS:\n" | ||
69 | " -d detach program\n" | ||
70 | " -D direct table lookups (skip fib rules)\n", | ||
71 | prog); | ||
72 | } | ||
73 | |||
74 | int main(int argc, char **argv) | ||
75 | { | ||
76 | struct bpf_prog_load_attr prog_load_attr = { | ||
77 | .prog_type = BPF_PROG_TYPE_XDP, | ||
78 | }; | ||
79 | const char *prog_name = "xdp_fwd"; | ||
80 | struct bpf_program *prog; | ||
81 | int prog_fd, map_fd = -1; | ||
82 | char filename[PATH_MAX]; | ||
83 | struct bpf_object *obj; | ||
84 | int opt, i, idx, err; | ||
85 | int attach = 1; | ||
86 | int ret = 0; | ||
87 | |||
88 | while ((opt = getopt(argc, argv, ":dDSF")) != -1) { | ||
89 | switch (opt) { | ||
90 | case 'd': | ||
91 | attach = 0; | ||
92 | break; | ||
93 | case 'S': | ||
94 | xdp_flags |= XDP_FLAGS_SKB_MODE; | ||
95 | break; | ||
96 | case 'F': | ||
97 | xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
98 | break; | ||
99 | case 'D': | ||
100 | prog_name = "xdp_fwd_direct"; | ||
101 | break; | ||
102 | default: | ||
103 | usage(basename(argv[0])); | ||
104 | return 1; | ||
105 | } | ||
106 | } | ||
107 | |||
108 | if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) | ||
109 | xdp_flags |= XDP_FLAGS_DRV_MODE; | ||
110 | |||
111 | if (optind == argc) { | ||
112 | usage(basename(argv[0])); | ||
113 | return 1; | ||
114 | } | ||
115 | |||
116 | if (attach) { | ||
117 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
118 | prog_load_attr.file = filename; | ||
119 | |||
120 | if (access(filename, O_RDONLY) < 0) { | ||
121 | printf("error accessing file %s: %s\n", | ||
122 | filename, strerror(errno)); | ||
123 | return 1; | ||
124 | } | ||
125 | |||
126 | err = bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd); | ||
127 | if (err) { | ||
128 | printf("Does kernel support devmap lookup?\n"); | ||
129 | /* If not, the error message will be: | ||
130 | * "cannot pass map_type 14 into func bpf_map_lookup_elem#1" | ||
131 | */ | ||
132 | return 1; | ||
133 | } | ||
134 | |||
135 | prog = bpf_object__find_program_by_title(obj, prog_name); | ||
136 | prog_fd = bpf_program__fd(prog); | ||
137 | if (prog_fd < 0) { | ||
138 | printf("program not found: %s\n", strerror(prog_fd)); | ||
139 | return 1; | ||
140 | } | ||
141 | map_fd = bpf_map__fd(bpf_object__find_map_by_name(obj, | ||
142 | "xdp_tx_ports")); | ||
143 | if (map_fd < 0) { | ||
144 | printf("map not found: %s\n", strerror(map_fd)); | ||
145 | return 1; | ||
146 | } | ||
147 | } | ||
148 | |||
149 | for (i = optind; i < argc; ++i) { | ||
150 | idx = if_nametoindex(argv[i]); | ||
151 | if (!idx) | ||
152 | idx = strtoul(argv[i], NULL, 0); | ||
153 | |||
154 | if (!idx) { | ||
155 | fprintf(stderr, "Invalid arg\n"); | ||
156 | return 1; | ||
157 | } | ||
158 | if (!attach) { | ||
159 | err = do_detach(idx, argv[i]); | ||
160 | if (err) | ||
161 | ret = err; | ||
162 | } else { | ||
163 | err = do_attach(idx, prog_fd, map_fd, argv[i]); | ||
164 | if (err) | ||
165 | ret = err; | ||
166 | } | ||
167 | } | ||
168 | |||
169 | return ret; | ||
170 | } | ||
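The interface arguments above fall back to strtoul(), which silently returns 0 for non-numeric input, so a typo is only reported as a generic "Invalid arg". A small editor-added parsing helper that an alternative implementation could use (the helper name and messages are illustrative, not part of the sample):

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <net/if.h>

/* Resolve an interface name, or a strictly numeric ifindex, to an int */
static int parse_ifindex(const char *arg)
{
	unsigned long val;
	char *end;
	int idx = if_nametoindex(arg);

	if (idx)
		return idx;

	errno = 0;
	val = strtoul(arg, &end, 0);
	if (errno || *end != '\0' || val == 0 || val > INT_MAX) {
		fprintf(stderr,
			"ERROR: '%s' is neither an ifname nor an ifindex\n",
			arg);
		return -1;
	}
	return (int)val;
}

With this helper the per-interface loop in main() would reduce to idx = parse_ifindex(argv[i]) followed by a check for a negative return.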
diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c new file mode 100644 index 000000000..5c955b812 --- /dev/null +++ b/samples/bpf/xdp_monitor_kern.c | |||
@@ -0,0 +1,257 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 | ||
2 | * Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc. | ||
3 | * | ||
4 | * XDP monitor tool, based on tracepoints | ||
5 | */ | ||
6 | #include <uapi/linux/bpf.h> | ||
7 | #include <bpf/bpf_helpers.h> | ||
8 | |||
9 | struct { | ||
10 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
11 | __type(key, u32); | ||
12 | __type(value, u64); | ||
13 | __uint(max_entries, 2); | ||
14 | /* TODO: have entries for all possible errno's */ | ||
15 | } redirect_err_cnt SEC(".maps"); | ||
16 | |||
17 | #define XDP_UNKNOWN XDP_REDIRECT + 1 | ||
18 | struct { | ||
19 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
20 | __type(key, u32); | ||
21 | __type(value, u64); | ||
22 | __uint(max_entries, XDP_UNKNOWN + 1); | ||
23 | } exception_cnt SEC(".maps"); | ||
24 | |||
25 | /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format | ||
26 | * Code in: kernel/include/trace/events/xdp.h | ||
27 | */ | ||
28 | struct xdp_redirect_ctx { | ||
29 | u64 __pad; // First 8 bytes are not accessible by bpf code | ||
30 | int prog_id; // offset:8; size:4; signed:1; | ||
31 | u32 act; // offset:12 size:4; signed:0; | ||
32 | int ifindex; // offset:16 size:4; signed:1; | ||
33 | int err; // offset:20 size:4; signed:1; | ||
34 | int to_ifindex; // offset:24 size:4; signed:1; | ||
35 | u32 map_id; // offset:28 size:4; signed:0; | ||
36 | int map_index; // offset:32 size:4; signed:1; | ||
37 | }; // offset:36 | ||
38 | |||
39 | enum { | ||
40 | XDP_REDIRECT_SUCCESS = 0, | ||
41 | XDP_REDIRECT_ERROR = 1 | ||
42 | }; | ||
43 | |||
44 | static __always_inline | ||
45 | int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) | ||
46 | { | ||
47 | u32 key = XDP_REDIRECT_ERROR; | ||
48 | int err = ctx->err; | ||
49 | u64 *cnt; | ||
50 | |||
51 | if (!err) | ||
52 | key = XDP_REDIRECT_SUCCESS; | ||
53 | |||
54 | cnt = bpf_map_lookup_elem(&redirect_err_cnt, &key); | ||
55 | if (!cnt) | ||
56 | return 1; | ||
57 | *cnt += 1; | ||
58 | |||
59 | return 0; /* Indicate event was filtered (no further processing)*/ | ||
60 | /* | ||
61 | * Returning 1 here would allow e.g. a perf-record tracepoint | ||
62 | * to see and record these events, but it doesn't work well | ||
63 | * in practice, as stopping perf-record also unloads this | ||
64 | * bpf_prog. Plus, there is the additional overhead of doing so. | ||
65 | */ | ||
66 | } | ||
67 | |||
68 | SEC("tracepoint/xdp/xdp_redirect_err") | ||
69 | int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx) | ||
70 | { | ||
71 | return xdp_redirect_collect_stat(ctx); | ||
72 | } | ||
73 | |||
74 | |||
75 | SEC("tracepoint/xdp/xdp_redirect_map_err") | ||
76 | int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx) | ||
77 | { | ||
78 | return xdp_redirect_collect_stat(ctx); | ||
79 | } | ||
80 | |||
81 | /* Likely unloaded when prog starts */ | ||
82 | SEC("tracepoint/xdp/xdp_redirect") | ||
83 | int trace_xdp_redirect(struct xdp_redirect_ctx *ctx) | ||
84 | { | ||
85 | return xdp_redirect_collect_stat(ctx); | ||
86 | } | ||
87 | |||
88 | /* Likely unloaded when prog starts */ | ||
89 | SEC("tracepoint/xdp/xdp_redirect_map") | ||
90 | int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx) | ||
91 | { | ||
92 | return xdp_redirect_collect_stat(ctx); | ||
93 | } | ||
94 | |||
95 | /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format | ||
96 | * Code in: kernel/include/trace/events/xdp.h | ||
97 | */ | ||
98 | struct xdp_exception_ctx { | ||
99 | u64 __pad; // First 8 bytes are not accessible by bpf code | ||
100 | int prog_id; // offset:8; size:4; signed:1; | ||
101 | u32 act; // offset:12; size:4; signed:0; | ||
102 | int ifindex; // offset:16; size:4; signed:1; | ||
103 | }; | ||
104 | |||
105 | SEC("tracepoint/xdp/xdp_exception") | ||
106 | int trace_xdp_exception(struct xdp_exception_ctx *ctx) | ||
107 | { | ||
108 | u64 *cnt; | ||
109 | u32 key; | ||
110 | |||
111 | key = ctx->act; | ||
112 | if (key > XDP_REDIRECT) | ||
113 | key = XDP_UNKNOWN; | ||
114 | |||
115 | cnt = bpf_map_lookup_elem(&exception_cnt, &key); | ||
116 | if (!cnt) | ||
117 | return 1; | ||
118 | *cnt += 1; | ||
119 | |||
120 | return 0; | ||
121 | } | ||
122 | |||
123 | /* Common stats data record shared with _user.c */ | ||
124 | struct datarec { | ||
125 | u64 processed; | ||
126 | u64 dropped; | ||
127 | u64 info; | ||
128 | u64 err; | ||
129 | }; | ||
130 | #define MAX_CPUS 64 | ||
131 | |||
132 | struct { | ||
133 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
134 | __type(key, u32); | ||
135 | __type(value, struct datarec); | ||
136 | __uint(max_entries, MAX_CPUS); | ||
137 | } cpumap_enqueue_cnt SEC(".maps"); | ||
138 | |||
139 | struct { | ||
140 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
141 | __type(key, u32); | ||
142 | __type(value, struct datarec); | ||
143 | __uint(max_entries, 1); | ||
144 | } cpumap_kthread_cnt SEC(".maps"); | ||
145 | |||
146 | /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format | ||
147 | * Code in: kernel/include/trace/events/xdp.h | ||
148 | */ | ||
149 | struct cpumap_enqueue_ctx { | ||
150 | u64 __pad; // First 8 bytes are not accessible by bpf code | ||
151 | int map_id; // offset:8; size:4; signed:1; | ||
152 | u32 act; // offset:12; size:4; signed:0; | ||
153 | int cpu; // offset:16; size:4; signed:1; | ||
154 | unsigned int drops; // offset:20; size:4; signed:0; | ||
155 | unsigned int processed; // offset:24; size:4; signed:0; | ||
156 | int to_cpu; // offset:28; size:4; signed:1; | ||
157 | }; | ||
158 | |||
159 | SEC("tracepoint/xdp/xdp_cpumap_enqueue") | ||
160 | int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) | ||
161 | { | ||
162 | u32 to_cpu = ctx->to_cpu; | ||
163 | struct datarec *rec; | ||
164 | |||
165 | if (to_cpu >= MAX_CPUS) | ||
166 | return 1; | ||
167 | |||
168 | rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); | ||
169 | if (!rec) | ||
170 | return 0; | ||
171 | rec->processed += ctx->processed; | ||
172 | rec->dropped += ctx->drops; | ||
173 | |||
174 | /* Record bulk events, then userspace can calc average bulk size */ | ||
175 | if (ctx->processed > 0) | ||
176 | rec->info += 1; | ||
177 | |||
178 | return 0; | ||
179 | } | ||
180 | |||
181 | /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format | ||
182 | * Code in: kernel/include/trace/events/xdp.h | ||
183 | */ | ||
184 | struct cpumap_kthread_ctx { | ||
185 | u64 __pad; // First 8 bytes are not accessible by bpf code | ||
186 | int map_id; // offset:8; size:4; signed:1; | ||
187 | u32 act; // offset:12; size:4; signed:0; | ||
188 | int cpu; // offset:16; size:4; signed:1; | ||
189 | unsigned int drops; // offset:20; size:4; signed:0; | ||
190 | unsigned int processed; // offset:24; size:4; signed:0; | ||
191 | int sched; // offset:28; size:4; signed:1; | ||
192 | }; | ||
193 | |||
194 | SEC("tracepoint/xdp/xdp_cpumap_kthread") | ||
195 | int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) | ||
196 | { | ||
197 | struct datarec *rec; | ||
198 | u32 key = 0; | ||
199 | |||
200 | rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); | ||
201 | if (!rec) | ||
202 | return 0; | ||
203 | rec->processed += ctx->processed; | ||
204 | rec->dropped += ctx->drops; | ||
205 | |||
206 | /* Count times kthread yielded CPU via schedule call */ | ||
207 | if (ctx->sched) | ||
208 | rec->info++; | ||
209 | |||
210 | return 0; | ||
211 | } | ||
212 | |||
213 | struct { | ||
214 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
215 | __type(key, u32); | ||
216 | __type(value, struct datarec); | ||
217 | __uint(max_entries, 1); | ||
218 | } devmap_xmit_cnt SEC(".maps"); | ||
219 | |||
220 | /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format | ||
221 | * Code in: kernel/include/trace/events/xdp.h | ||
222 | */ | ||
223 | struct devmap_xmit_ctx { | ||
224 | u64 __pad; // First 8 bytes are not accessible by bpf code | ||
225 | int from_ifindex; // offset:8; size:4; signed:1; | ||
226 | u32 act; // offset:12; size:4; signed:0; | ||
227 | int to_ifindex; // offset:16; size:4; signed:1; | ||
228 | int drops; // offset:20; size:4; signed:1; | ||
229 | int sent; // offset:24; size:4; signed:1; | ||
230 | int err; // offset:28; size:4; signed:1; | ||
231 | }; | ||
232 | |||
233 | SEC("tracepoint/xdp/xdp_devmap_xmit") | ||
234 | int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx) | ||
235 | { | ||
236 | struct datarec *rec; | ||
237 | u32 key = 0; | ||
238 | |||
239 | rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key); | ||
240 | if (!rec) | ||
241 | return 0; | ||
242 | rec->processed += ctx->sent; | ||
243 | rec->dropped += ctx->drops; | ||
244 | |||
245 | /* Record bulk events, then userspace can calc average bulk size */ | ||
246 | rec->info += 1; | ||
247 | |||
248 | /* Record error cases, where no frames were sent */ | ||
249 | if (ctx->err) | ||
250 | rec->err++; | ||
251 | |||
252 | /* Catch the API error where drv ndo_xdp_xmit sent more than count */ | ||
253 | if (ctx->drops < 0) | ||
254 | rec->err++; | ||
255 | |||
256 | return 1; | ||
257 | } | ||
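The hand-written tracepoint context structs above must keep matching the offsets printed in /sys/kernel/debug/tracing/events/xdp/*/format, which is what their trailing comments record. An editor-added compile-time sketch, using a userspace mirror of xdp_redirect_ctx, shows how offsetof() asserts can turn a layout drift into a build error rather than a silently misread field:

#include <stddef.h>
#include <linux/types.h>

/* Userspace mirror of the tracepoint context declared above */
struct xdp_redirect_ctx {
	__u64 __pad;		/* first 8 bytes are not accessible */
	int prog_id;		/* offset:8 */
	__u32 act;		/* offset:12 */
	int ifindex;		/* offset:16 */
	int err;		/* offset:20 */
	int to_ifindex;		/* offset:24 */
	__u32 map_id;		/* offset:28 */
	int map_index;		/* offset:32 */
};

_Static_assert(offsetof(struct xdp_redirect_ctx, prog_id) == 8, "prog_id");
_Static_assert(offsetof(struct xdp_redirect_ctx, act) == 12, "act");
_Static_assert(offsetof(struct xdp_redirect_ctx, err) == 20, "err");
_Static_assert(offsetof(struct xdp_redirect_ctx, map_index) == 32, "map_index");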
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c new file mode 100644 index 000000000..03d0a1829 --- /dev/null +++ b/samples/bpf/xdp_monitor_user.c | |||
@@ -0,0 +1,792 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 | ||
2 | * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. | ||
3 | */ | ||
4 | static const char *__doc__= | ||
5 | "XDP monitor tool, based on tracepoints\n" | ||
6 | ; | ||
7 | |||
8 | static const char *__doc_err_only__= | ||
9 | " NOTICE: Only tracking XDP redirect errors\n" | ||
10 | " Enable TX success stats via '--stats'\n" | ||
11 | " (which comes with a per packet processing overhead)\n" | ||
12 | ; | ||
13 | |||
14 | #include <errno.h> | ||
15 | #include <stdio.h> | ||
16 | #include <stdlib.h> | ||
17 | #include <stdbool.h> | ||
18 | #include <stdint.h> | ||
19 | #include <string.h> | ||
20 | #include <ctype.h> | ||
21 | #include <unistd.h> | ||
22 | #include <locale.h> | ||
23 | |||
24 | #include <sys/resource.h> | ||
25 | #include <getopt.h> | ||
26 | #include <net/if.h> | ||
27 | #include <time.h> | ||
28 | |||
29 | #include <signal.h> | ||
30 | #include <bpf/bpf.h> | ||
31 | #include <bpf/libbpf.h> | ||
32 | #include "bpf_util.h" | ||
33 | |||
34 | enum map_type { | ||
35 | REDIRECT_ERR_CNT, | ||
36 | EXCEPTION_CNT, | ||
37 | CPUMAP_ENQUEUE_CNT, | ||
38 | CPUMAP_KTHREAD_CNT, | ||
39 | DEVMAP_XMIT_CNT, | ||
40 | }; | ||
41 | |||
42 | static const char *const map_type_strings[] = { | ||
43 | [REDIRECT_ERR_CNT] = "redirect_err_cnt", | ||
44 | [EXCEPTION_CNT] = "exception_cnt", | ||
45 | [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt", | ||
46 | [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt", | ||
47 | [DEVMAP_XMIT_CNT] = "devmap_xmit_cnt", | ||
48 | }; | ||
49 | |||
50 | #define NUM_MAP 5 | ||
51 | #define NUM_TP 8 | ||
52 | |||
53 | static int tp_cnt; | ||
54 | static int map_cnt; | ||
55 | static int verbose = 1; | ||
56 | static bool debug = false; | ||
57 | struct bpf_map *map_data[NUM_MAP] = {}; | ||
58 | struct bpf_link *tp_links[NUM_TP] = {}; | ||
59 | struct bpf_object *obj; | ||
60 | |||
61 | static const struct option long_options[] = { | ||
62 | {"help", no_argument, NULL, 'h' }, | ||
63 | {"debug", no_argument, NULL, 'D' }, | ||
64 | {"stats", no_argument, NULL, 'S' }, | ||
65 | {"sec", required_argument, NULL, 's' }, | ||
66 | {0, 0, NULL, 0 } | ||
67 | }; | ||
68 | |||
69 | static void int_exit(int sig) | ||
70 | { | ||
71 | /* Detach tracepoints */ | ||
72 | while (tp_cnt) | ||
73 | bpf_link__destroy(tp_links[--tp_cnt]); | ||
74 | |||
75 | bpf_object__close(obj); | ||
76 | exit(0); | ||
77 | } | ||
78 | |||
79 | /* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */ | ||
80 | #define EXIT_FAIL_MEM 5 | ||
81 | |||
82 | static void usage(char *argv[]) | ||
83 | { | ||
84 | int i; | ||
85 | printf("\nDOCUMENTATION:\n%s\n", __doc__); | ||
86 | printf("\n"); | ||
87 | printf(" Usage: %s (options-see-below)\n", | ||
88 | argv[0]); | ||
89 | printf(" Listing options:\n"); | ||
90 | for (i = 0; long_options[i].name != 0; i++) { | ||
91 | printf(" --%-15s", long_options[i].name); | ||
92 | if (long_options[i].flag != NULL) | ||
93 | printf(" flag (internal value:%d)", | ||
94 | *long_options[i].flag); | ||
95 | else | ||
96 | printf("short-option: -%c", | ||
97 | long_options[i].val); | ||
98 | printf("\n"); | ||
99 | } | ||
100 | printf("\n"); | ||
101 | } | ||
102 | |||
103 | #define NANOSEC_PER_SEC 1000000000 /* 10^9 */ | ||
104 | static __u64 gettime(void) | ||
105 | { | ||
106 | struct timespec t; | ||
107 | int res; | ||
108 | |||
109 | res = clock_gettime(CLOCK_MONOTONIC, &t); | ||
110 | if (res < 0) { | ||
111 | fprintf(stderr, "Error with gettimeofday! (%i)\n", res); | ||
112 | exit(EXIT_FAILURE); | ||
113 | } | ||
114 | return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; | ||
115 | } | ||
116 | |||
117 | enum { | ||
118 | REDIR_SUCCESS = 0, | ||
119 | REDIR_ERROR = 1, | ||
120 | }; | ||
121 | #define REDIR_RES_MAX 2 | ||
122 | static const char *redir_names[REDIR_RES_MAX] = { | ||
123 | [REDIR_SUCCESS] = "Success", | ||
124 | [REDIR_ERROR] = "Error", | ||
125 | }; | ||
126 | static const char *err2str(int err) | ||
127 | { | ||
128 | if (err < REDIR_RES_MAX) | ||
129 | return redir_names[err]; | ||
130 | return NULL; | ||
131 | } | ||
132 | /* enum xdp_action */ | ||
133 | #define XDP_UNKNOWN XDP_REDIRECT + 1 | ||
134 | #define XDP_ACTION_MAX (XDP_UNKNOWN + 1) | ||
135 | static const char *xdp_action_names[XDP_ACTION_MAX] = { | ||
136 | [XDP_ABORTED] = "XDP_ABORTED", | ||
137 | [XDP_DROP] = "XDP_DROP", | ||
138 | [XDP_PASS] = "XDP_PASS", | ||
139 | [XDP_TX] = "XDP_TX", | ||
140 | [XDP_REDIRECT] = "XDP_REDIRECT", | ||
141 | [XDP_UNKNOWN] = "XDP_UNKNOWN", | ||
142 | }; | ||
143 | static const char *action2str(int action) | ||
144 | { | ||
145 | if (action < XDP_ACTION_MAX) | ||
146 | return xdp_action_names[action]; | ||
147 | return NULL; | ||
148 | } | ||
149 | |||
150 | /* Common stats data record shared with _kern.c */ | ||
151 | struct datarec { | ||
152 | __u64 processed; | ||
153 | __u64 dropped; | ||
154 | __u64 info; | ||
155 | __u64 err; | ||
156 | }; | ||
157 | #define MAX_CPUS 64 | ||
158 | |||
159 | /* Userspace structs for collection of stats from maps */ | ||
160 | struct record { | ||
161 | __u64 timestamp; | ||
162 | struct datarec total; | ||
163 | struct datarec *cpu; | ||
164 | }; | ||
165 | struct u64rec { | ||
166 | __u64 processed; | ||
167 | }; | ||
168 | struct record_u64 { | ||
169 | /* record for _kern side __u64 values */ | ||
170 | __u64 timestamp; | ||
171 | struct u64rec total; | ||
172 | struct u64rec *cpu; | ||
173 | }; | ||
174 | |||
175 | struct stats_record { | ||
176 | struct record_u64 xdp_redirect[REDIR_RES_MAX]; | ||
177 | struct record_u64 xdp_exception[XDP_ACTION_MAX]; | ||
178 | struct record xdp_cpumap_kthread; | ||
179 | struct record xdp_cpumap_enqueue[MAX_CPUS]; | ||
180 | struct record xdp_devmap_xmit; | ||
181 | }; | ||
182 | |||
183 | static bool map_collect_record(int fd, __u32 key, struct record *rec) | ||
184 | { | ||
185 | /* For percpu maps, userspace gets a value per possible CPU */ | ||
186 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
187 | struct datarec values[nr_cpus]; | ||
188 | __u64 sum_processed = 0; | ||
189 | __u64 sum_dropped = 0; | ||
190 | __u64 sum_info = 0; | ||
191 | __u64 sum_err = 0; | ||
192 | int i; | ||
193 | |||
194 | if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { | ||
195 | fprintf(stderr, | ||
196 | "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); | ||
197 | return false; | ||
198 | } | ||
199 | /* Get time as close as possible to reading map contents */ | ||
200 | rec->timestamp = gettime(); | ||
201 | |||
202 | /* Record and sum values from each CPU */ | ||
203 | for (i = 0; i < nr_cpus; i++) { | ||
204 | rec->cpu[i].processed = values[i].processed; | ||
205 | sum_processed += values[i].processed; | ||
206 | rec->cpu[i].dropped = values[i].dropped; | ||
207 | sum_dropped += values[i].dropped; | ||
208 | rec->cpu[i].info = values[i].info; | ||
209 | sum_info += values[i].info; | ||
210 | rec->cpu[i].err = values[i].err; | ||
211 | sum_err += values[i].err; | ||
212 | } | ||
213 | rec->total.processed = sum_processed; | ||
214 | rec->total.dropped = sum_dropped; | ||
215 | rec->total.info = sum_info; | ||
216 | rec->total.err = sum_err; | ||
217 | return true; | ||
218 | } | ||
219 | |||
220 | static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec) | ||
221 | { | ||
222 | /* For percpu maps, userspace gets a value per possible CPU */ | ||
223 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
224 | struct u64rec values[nr_cpus]; | ||
225 | __u64 sum_total = 0; | ||
226 | int i; | ||
227 | |||
228 | if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { | ||
229 | fprintf(stderr, | ||
230 | "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); | ||
231 | return false; | ||
232 | } | ||
233 | /* Get time as close as possible to reading map contents */ | ||
234 | rec->timestamp = gettime(); | ||
235 | |||
236 | /* Record and sum values from each CPU */ | ||
237 | for (i = 0; i < nr_cpus; i++) { | ||
238 | rec->cpu[i].processed = values[i].processed; | ||
239 | sum_total += values[i].processed; | ||
240 | } | ||
241 | rec->total.processed = sum_total; | ||
242 | return true; | ||
243 | } | ||
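Both collectors above rely on the per-CPU map semantics: a single lookup on a BPF_MAP_TYPE_PERCPU_ARRAY returns one value slot per possible CPU, which the caller then sums. The editor-added standalone sketch below illustrates that round trip; it needs root/CAP_BPF to run, uses the same legacy libbpf calls as the rest of this sample, and the map it creates is ad hoc, purely for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int main(void)
{
	int nr_cpus = libbpf_num_possible_cpus();
	__u64 *values, sum = 0;
	__u32 key = 0;
	int fd, i;

	if (nr_cpus < 0)
		return 1;

	/* One __u64 counter, but the kernel keeps a copy per possible CPU */
	fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY,
			    sizeof(__u32), sizeof(__u64), 1, 0);
	if (fd < 0) {
		perror("bpf_create_map");
		return 1;
	}

	values = calloc(nr_cpus, sizeof(*values));
	if (!values)
		return 1;

	/* Update and lookup both move nr_cpus values in one call */
	for (i = 0; i < nr_cpus; i++)
		values[i] = i;
	bpf_map_update_elem(fd, &key, values, BPF_ANY);
	bpf_map_lookup_elem(fd, &key, values);

	for (i = 0; i < nr_cpus; i++)
		sum += values[i];
	printf("possible CPUs: %d, summed per-CPU values: %llu\n",
	       nr_cpus, (unsigned long long)sum);

	free(values);
	return 0;
}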
244 | |||
245 | static double calc_period(struct record *r, struct record *p) | ||
246 | { | ||
247 | double period_ = 0; | ||
248 | __u64 period = 0; | ||
249 | |||
250 | period = r->timestamp - p->timestamp; | ||
251 | if (period > 0) | ||
252 | period_ = ((double) period / NANOSEC_PER_SEC); | ||
253 | |||
254 | return period_; | ||
255 | } | ||
256 | |||
257 | static double calc_period_u64(struct record_u64 *r, struct record_u64 *p) | ||
258 | { | ||
259 | double period_ = 0; | ||
260 | __u64 period = 0; | ||
261 | |||
262 | period = r->timestamp - p->timestamp; | ||
263 | if (period > 0) | ||
264 | period_ = ((double) period / NANOSEC_PER_SEC); | ||
265 | |||
266 | return period_; | ||
267 | } | ||
268 | |||
269 | static double calc_pps(struct datarec *r, struct datarec *p, double period) | ||
270 | { | ||
271 | __u64 packets = 0; | ||
272 | double pps = 0; | ||
273 | |||
274 | if (period > 0) { | ||
275 | packets = r->processed - p->processed; | ||
276 | pps = packets / period; | ||
277 | } | ||
278 | return pps; | ||
279 | } | ||
280 | |||
281 | static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period) | ||
282 | { | ||
283 | __u64 packets = 0; | ||
284 | double pps = 0; | ||
285 | |||
286 | if (period > 0) { | ||
287 | packets = r->processed - p->processed; | ||
288 | pps = packets / period; | ||
289 | } | ||
290 | return pps; | ||
291 | } | ||
292 | |||
293 | static double calc_drop(struct datarec *r, struct datarec *p, double period) | ||
294 | { | ||
295 | __u64 packets = 0; | ||
296 | double pps = 0; | ||
297 | |||
298 | if (period > 0) { | ||
299 | packets = r->dropped - p->dropped; | ||
300 | pps = packets / period; | ||
301 | } | ||
302 | return pps; | ||
303 | } | ||
304 | |||
305 | static double calc_info(struct datarec *r, struct datarec *p, double period) | ||
306 | { | ||
307 | __u64 packets = 0; | ||
308 | double pps = 0; | ||
309 | |||
310 | if (period > 0) { | ||
311 | packets = r->info - p->info; | ||
312 | pps = packets / period; | ||
313 | } | ||
314 | return pps; | ||
315 | } | ||
316 | |||
317 | static double calc_err(struct datarec *r, struct datarec *p, double period) | ||
318 | { | ||
319 | __u64 packets = 0; | ||
320 | double pps = 0; | ||
321 | |||
322 | if (period > 0) { | ||
323 | packets = r->err - p->err; | ||
324 | pps = packets / period; | ||
325 | } | ||
326 | return pps; | ||
327 | } | ||
328 | |||
329 | static void stats_print(struct stats_record *stats_rec, | ||
330 | struct stats_record *stats_prev, | ||
331 | bool err_only) | ||
332 | { | ||
333 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
334 | int rec_i = 0, i, to_cpu; | ||
335 | double t = 0, pps = 0; | ||
336 | |||
337 | /* Header */ | ||
338 | printf("%-15s %-7s %-12s %-12s %-9s\n", | ||
339 | "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info"); | ||
340 | |||
341 | /* tracepoint: xdp:xdp_redirect_* */ | ||
342 | if (err_only) | ||
343 | rec_i = REDIR_ERROR; | ||
344 | |||
345 | for (; rec_i < REDIR_RES_MAX; rec_i++) { | ||
346 | struct record_u64 *rec, *prev; | ||
347 | char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; | ||
348 | char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; | ||
349 | |||
350 | rec = &stats_rec->xdp_redirect[rec_i]; | ||
351 | prev = &stats_prev->xdp_redirect[rec_i]; | ||
352 | t = calc_period_u64(rec, prev); | ||
353 | |||
354 | for (i = 0; i < nr_cpus; i++) { | ||
355 | struct u64rec *r = &rec->cpu[i]; | ||
356 | struct u64rec *p = &prev->cpu[i]; | ||
357 | |||
358 | pps = calc_pps_u64(r, p, t); | ||
359 | if (pps > 0) | ||
360 | printf(fmt1, "XDP_REDIRECT", i, | ||
361 | rec_i ? 0.0: pps, rec_i ? pps : 0.0, | ||
362 | err2str(rec_i)); | ||
363 | } | ||
364 | pps = calc_pps_u64(&rec->total, &prev->total, t); | ||
365 | printf(fmt2, "XDP_REDIRECT", "total", | ||
366 | rec_i ? 0.0: pps, rec_i ? pps : 0.0, err2str(rec_i)); | ||
367 | } | ||
368 | |||
369 | /* tracepoint: xdp:xdp_exception */ | ||
370 | for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) { | ||
371 | struct record_u64 *rec, *prev; | ||
372 | char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; | ||
373 | char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; | ||
374 | |||
375 | rec = &stats_rec->xdp_exception[rec_i]; | ||
376 | prev = &stats_prev->xdp_exception[rec_i]; | ||
377 | t = calc_period_u64(rec, prev); | ||
378 | |||
379 | for (i = 0; i < nr_cpus; i++) { | ||
380 | struct u64rec *r = &rec->cpu[i]; | ||
381 | struct u64rec *p = &prev->cpu[i]; | ||
382 | |||
383 | pps = calc_pps_u64(r, p, t); | ||
384 | if (pps > 0) | ||
385 | printf(fmt1, "Exception", i, | ||
386 | 0.0, pps, action2str(rec_i)); | ||
387 | } | ||
388 | pps = calc_pps_u64(&rec->total, &prev->total, t); | ||
389 | if (pps > 0) | ||
390 | printf(fmt2, "Exception", "total", | ||
391 | 0.0, pps, action2str(rec_i)); | ||
392 | } | ||
393 | |||
394 | /* cpumap enqueue stats */ | ||
395 | for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) { | ||
396 | char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; | ||
397 | char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; | ||
398 | struct record *rec, *prev; | ||
399 | char *info_str = ""; | ||
400 | double drop, info; | ||
401 | |||
402 | rec = &stats_rec->xdp_cpumap_enqueue[to_cpu]; | ||
403 | prev = &stats_prev->xdp_cpumap_enqueue[to_cpu]; | ||
404 | t = calc_period(rec, prev); | ||
405 | for (i = 0; i < nr_cpus; i++) { | ||
406 | struct datarec *r = &rec->cpu[i]; | ||
407 | struct datarec *p = &prev->cpu[i]; | ||
408 | |||
409 | pps = calc_pps(r, p, t); | ||
410 | drop = calc_drop(r, p, t); | ||
411 | info = calc_info(r, p, t); | ||
412 | if (info > 0) { | ||
413 | info_str = "bulk-average"; | ||
414 | info = pps / info; /* calc average bulk size */ | ||
415 | } | ||
416 | if (pps > 0) | ||
417 | printf(fmt1, "cpumap-enqueue", | ||
418 | i, to_cpu, pps, drop, info, info_str); | ||
419 | } | ||
420 | pps = calc_pps(&rec->total, &prev->total, t); | ||
421 | if (pps > 0) { | ||
422 | drop = calc_drop(&rec->total, &prev->total, t); | ||
423 | info = calc_info(&rec->total, &prev->total, t); | ||
424 | if (info > 0) { | ||
425 | info_str = "bulk-average"; | ||
426 | info = pps / info; /* calc average bulk size */ | ||
427 | } | ||
428 | printf(fmt2, "cpumap-enqueue", | ||
429 | "sum", to_cpu, pps, drop, info, info_str); | ||
430 | } | ||
431 | } | ||
432 | |||
433 | /* cpumap kthread stats */ | ||
434 | { | ||
435 | char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n"; | ||
436 | char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n"; | ||
437 | struct record *rec, *prev; | ||
438 | double drop, info; | ||
439 | char *i_str = ""; | ||
440 | |||
441 | rec = &stats_rec->xdp_cpumap_kthread; | ||
442 | prev = &stats_prev->xdp_cpumap_kthread; | ||
443 | t = calc_period(rec, prev); | ||
444 | for (i = 0; i < nr_cpus; i++) { | ||
445 | struct datarec *r = &rec->cpu[i]; | ||
446 | struct datarec *p = &prev->cpu[i]; | ||
447 | |||
448 | pps = calc_pps(r, p, t); | ||
449 | drop = calc_drop(r, p, t); | ||
450 | info = calc_info(r, p, t); | ||
451 | if (info > 0) | ||
452 | i_str = "sched"; | ||
453 | if (pps > 0 || drop > 0) | ||
454 | printf(fmt1, "cpumap-kthread", | ||
455 | i, pps, drop, info, i_str); | ||
456 | } | ||
457 | pps = calc_pps(&rec->total, &prev->total, t); | ||
458 | drop = calc_drop(&rec->total, &prev->total, t); | ||
459 | info = calc_info(&rec->total, &prev->total, t); | ||
460 | if (info > 0) | ||
461 | i_str = "sched-sum"; | ||
462 | printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str); | ||
463 | } | ||
464 | |||
465 | /* devmap ndo_xdp_xmit stats */ | ||
466 | { | ||
467 | char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n"; | ||
468 | char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n"; | ||
469 | struct record *rec, *prev; | ||
470 | double drop, info, err; | ||
471 | char *i_str = ""; | ||
472 | char *err_str = ""; | ||
473 | |||
474 | rec = &stats_rec->xdp_devmap_xmit; | ||
475 | prev = &stats_prev->xdp_devmap_xmit; | ||
476 | t = calc_period(rec, prev); | ||
477 | for (i = 0; i < nr_cpus; i++) { | ||
478 | struct datarec *r = &rec->cpu[i]; | ||
479 | struct datarec *p = &prev->cpu[i]; | ||
480 | |||
481 | pps = calc_pps(r, p, t); | ||
482 | drop = calc_drop(r, p, t); | ||
483 | info = calc_info(r, p, t); | ||
484 | err = calc_err(r, p, t); | ||
485 | if (info > 0) { | ||
486 | i_str = "bulk-average"; | ||
487 | info = (pps+drop) / info; /* calc avg bulk */ | ||
488 | } | ||
489 | if (err > 0) | ||
490 | err_str = "drv-err"; | ||
491 | if (pps > 0 || drop > 0) | ||
492 | printf(fmt1, "devmap-xmit", | ||
493 | i, pps, drop, info, i_str, err_str); | ||
494 | } | ||
495 | pps = calc_pps(&rec->total, &prev->total, t); | ||
496 | drop = calc_drop(&rec->total, &prev->total, t); | ||
497 | info = calc_info(&rec->total, &prev->total, t); | ||
498 | err = calc_err(&rec->total, &prev->total, t); | ||
499 | if (info > 0) { | ||
500 | i_str = "bulk-average"; | ||
501 | info = (pps+drop) / info; /* calc avg bulk */ | ||
502 | } | ||
503 | if (err > 0) | ||
504 | err_str = "drv-err"; | ||
505 | printf(fmt2, "devmap-xmit", "total", pps, drop, | ||
506 | info, i_str, err_str); | ||
507 | } | ||
508 | |||
509 | printf("\n"); | ||
510 | } | ||
511 | |||
512 | static bool stats_collect(struct stats_record *rec) | ||
513 | { | ||
514 | int fd; | ||
515 | int i; | ||
516 | |||
517 | /* TODO: Detect if someone unloaded the perf event_fd's, as | ||
518 | * this can happen when someone runs perf-record -e | ||
519 | */ | ||
520 | |||
521 | fd = bpf_map__fd(map_data[REDIRECT_ERR_CNT]); | ||
522 | for (i = 0; i < REDIR_RES_MAX; i++) | ||
523 | map_collect_record_u64(fd, i, &rec->xdp_redirect[i]); | ||
524 | |||
525 | fd = bpf_map__fd(map_data[EXCEPTION_CNT]); | ||
526 | for (i = 0; i < XDP_ACTION_MAX; i++) { | ||
527 | map_collect_record_u64(fd, i, &rec->xdp_exception[i]); | ||
528 | } | ||
529 | |||
530 | fd = bpf_map__fd(map_data[CPUMAP_ENQUEUE_CNT]); | ||
531 | for (i = 0; i < MAX_CPUS; i++) | ||
532 | map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]); | ||
533 | |||
534 | fd = bpf_map__fd(map_data[CPUMAP_KTHREAD_CNT]); | ||
535 | map_collect_record(fd, 0, &rec->xdp_cpumap_kthread); | ||
536 | |||
537 | fd = bpf_map__fd(map_data[DEVMAP_XMIT_CNT]); | ||
538 | map_collect_record(fd, 0, &rec->xdp_devmap_xmit); | ||
539 | |||
540 | return true; | ||
541 | } | ||
542 | |||
543 | static void *alloc_rec_per_cpu(int record_size) | ||
544 | { | ||
545 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
546 | void *array; | ||
547 | |||
548 | array = calloc(nr_cpus, record_size); | ||
549 | if (!array) { | ||
550 | fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); | ||
551 | exit(EXIT_FAIL_MEM); | ||
552 | } | ||
553 | return array; | ||
554 | } | ||
555 | |||
556 | static struct stats_record *alloc_stats_record(void) | ||
557 | { | ||
558 | struct stats_record *rec; | ||
559 | int rec_sz; | ||
560 | int i; | ||
561 | |||
562 | /* Alloc main stats_record structure */ | ||
563 | rec = calloc(1, sizeof(*rec)); | ||
564 | if (!rec) { | ||
565 | fprintf(stderr, "Mem alloc error\n"); | ||
566 | exit(EXIT_FAIL_MEM); | ||
567 | } | ||
568 | |||
569 | /* Alloc stats stored per CPU for each record */ | ||
570 | rec_sz = sizeof(struct u64rec); | ||
571 | for (i = 0; i < REDIR_RES_MAX; i++) | ||
572 | rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz); | ||
573 | |||
574 | for (i = 0; i < XDP_ACTION_MAX; i++) | ||
575 | rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz); | ||
576 | |||
577 | rec_sz = sizeof(struct datarec); | ||
578 | rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz); | ||
579 | rec->xdp_devmap_xmit.cpu = alloc_rec_per_cpu(rec_sz); | ||
580 | |||
581 | for (i = 0; i < MAX_CPUS; i++) | ||
582 | rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz); | ||
583 | |||
584 | return rec; | ||
585 | } | ||
586 | |||
587 | static void free_stats_record(struct stats_record *r) | ||
588 | { | ||
589 | int i; | ||
590 | |||
591 | for (i = 0; i < REDIR_RES_MAX; i++) | ||
592 | free(r->xdp_redirect[i].cpu); | ||
593 | |||
594 | for (i = 0; i < XDP_ACTION_MAX; i++) | ||
595 | free(r->xdp_exception[i].cpu); | ||
596 | |||
597 | free(r->xdp_cpumap_kthread.cpu); | ||
598 | free(r->xdp_devmap_xmit.cpu); | ||
599 | |||
600 | for (i = 0; i < MAX_CPUS; i++) | ||
601 | free(r->xdp_cpumap_enqueue[i].cpu); | ||
602 | |||
603 | free(r); | ||
604 | } | ||
605 | |||
606 | /* Pointer swap trick */ | ||
607 | static inline void swap(struct stats_record **a, struct stats_record **b) | ||
608 | { | ||
609 | struct stats_record *tmp; | ||
610 | |||
611 | tmp = *a; | ||
612 | *a = *b; | ||
613 | *b = tmp; | ||
614 | } | ||
615 | |||
616 | static void stats_poll(int interval, bool err_only) | ||
617 | { | ||
618 | struct stats_record *rec, *prev; | ||
619 | |||
620 | rec = alloc_stats_record(); | ||
621 | prev = alloc_stats_record(); | ||
622 | stats_collect(rec); | ||
623 | |||
624 | if (err_only) | ||
625 | printf("\n%s\n", __doc_err_only__); | ||
626 | |||
627 | /* Trick: to pretty-print with thousands separators, use printf's %' flag */ | ||
628 | setlocale(LC_NUMERIC, "en_US"); | ||
629 | |||
630 | /* Header */ | ||
631 | if (verbose) | ||
632 | printf("\n%s", __doc__); | ||
633 | |||
634 | /* TODO Need more advanced stats on error types */ | ||
635 | if (verbose) { | ||
636 | printf(" - Stats map0: %s\n", bpf_map__name(map_data[0])); | ||
637 | printf(" - Stats map1: %s\n", bpf_map__name(map_data[1])); | ||
638 | printf("\n"); | ||
639 | } | ||
640 | fflush(stdout); | ||
641 | |||
642 | while (1) { | ||
643 | swap(&prev, &rec); | ||
644 | stats_collect(rec); | ||
645 | stats_print(rec, prev, err_only); | ||
646 | fflush(stdout); | ||
647 | sleep(interval); | ||
648 | } | ||
649 | |||
650 | free_stats_record(rec); | ||
651 | free_stats_record(prev); | ||
652 | } | ||
653 | |||
654 | static void print_bpf_prog_info(void) | ||
655 | { | ||
656 | struct bpf_program *prog; | ||
657 | struct bpf_map *map; | ||
658 | int i = 0; | ||
659 | |||
660 | /* Prog info */ | ||
661 | printf("Loaded BPF object has %d bpf program(s)\n", tp_cnt); | ||
662 | bpf_object__for_each_program(prog, obj) { | ||
663 | printf(" - prog_fd[%d] = fd(%d)\n", i, bpf_program__fd(prog)); | ||
664 | i++; | ||
665 | } | ||
666 | |||
667 | i = 0; | ||
668 | /* Maps info */ | ||
669 | printf("Loaded BPF object has %d map(s)\n", map_cnt); | ||
670 | bpf_object__for_each_map(map, obj) { | ||
671 | const char *name = bpf_map__name(map); | ||
672 | int fd = bpf_map__fd(map); | ||
673 | |||
674 | printf(" - map_data[%d] = fd(%d) name:%s\n", i, fd, name); | ||
675 | i++; | ||
676 | } | ||
677 | |||
678 | /* Event info */ | ||
679 | printf("Searching for (max:%d) event file descriptor(s)\n", tp_cnt); | ||
680 | for (i = 0; i < tp_cnt; i++) { | ||
681 | int fd = bpf_link__fd(tp_links[i]); | ||
682 | |||
683 | if (fd != -1) | ||
684 | printf(" - event_fd[%d] = fd(%d)\n", i, fd); | ||
685 | } | ||
686 | } | ||
687 | |||
688 | int main(int argc, char **argv) | ||
689 | { | ||
690 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
691 | struct bpf_program *prog; | ||
692 | int longindex = 0, opt; | ||
693 | int ret = EXIT_FAILURE; | ||
694 | enum map_type type; | ||
695 | char filename[256]; | ||
696 | |||
697 | /* Default settings: */ | ||
698 | bool errors_only = true; | ||
699 | int interval = 2; | ||
700 | |||
701 | /* Parse command line args */ | ||
702 | while ((opt = getopt_long(argc, argv, "hDSs:", | ||
703 | long_options, &longindex)) != -1) { | ||
704 | switch (opt) { | ||
705 | case 'D': | ||
706 | debug = true; | ||
707 | break; | ||
708 | case 'S': | ||
709 | errors_only = false; | ||
710 | break; | ||
711 | case 's': | ||
712 | interval = atoi(optarg); | ||
713 | break; | ||
714 | case 'h': | ||
715 | default: | ||
716 | usage(argv); | ||
717 | return ret; | ||
718 | } | ||
719 | } | ||
720 | |||
721 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
722 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
723 | perror("setrlimit(RLIMIT_MEMLOCK)"); | ||
724 | return ret; | ||
725 | } | ||
726 | |||
727 | /* Remove tracepoint programs when this program is interrupted or killed */ | ||
728 | signal(SIGINT, int_exit); | ||
729 | signal(SIGTERM, int_exit); | ||
730 | |||
731 | obj = bpf_object__open_file(filename, NULL); | ||
732 | if (libbpf_get_error(obj)) { | ||
733 | printf("ERROR: opening BPF object file failed\n"); | ||
734 | obj = NULL; | ||
735 | goto cleanup; | ||
736 | } | ||
737 | |||
738 | /* load BPF program */ | ||
739 | if (bpf_object__load(obj)) { | ||
740 | printf("ERROR: loading BPF object file failed\n"); | ||
741 | goto cleanup; | ||
742 | } | ||
743 | |||
744 | for (type = 0; type < NUM_MAP; type++) { | ||
745 | map_data[type] = | ||
746 | bpf_object__find_map_by_name(obj, map_type_strings[type]); | ||
747 | |||
748 | if (libbpf_get_error(map_data[type])) { | ||
749 | printf("ERROR: finding a map in obj file failed\n"); | ||
750 | goto cleanup; | ||
751 | } | ||
752 | map_cnt++; | ||
753 | } | ||
754 | |||
755 | bpf_object__for_each_program(prog, obj) { | ||
756 | tp_links[tp_cnt] = bpf_program__attach(prog); | ||
757 | if (libbpf_get_error(tp_links[tp_cnt])) { | ||
758 | printf("ERROR: bpf_program__attach failed\n"); | ||
759 | tp_links[tp_cnt] = NULL; | ||
760 | goto cleanup; | ||
761 | } | ||
762 | tp_cnt++; | ||
763 | } | ||
764 | |||
765 | if (debug) { | ||
766 | print_bpf_prog_info(); | ||
767 | } | ||
768 | |||
769 | /* Unload/stop tracepoint events by closing the bpf_links */ | ||
770 | if (errors_only) { | ||
771 | /* The bpf_link[i] indexes depend on the order in which | ||
772 | * the functions are defined in _kern.c | ||
773 | */ | ||
774 | bpf_link__destroy(tp_links[2]); /* tracepoint/xdp/xdp_redirect */ | ||
775 | tp_links[2] = NULL; | ||
776 | |||
777 | bpf_link__destroy(tp_links[3]); /* tracepoint/xdp/xdp_redirect_map */ | ||
778 | tp_links[3] = NULL; | ||
779 | } | ||
780 | |||
781 | stats_poll(interval, errors_only); | ||
782 | |||
783 | ret = EXIT_SUCCESS; | ||
784 | |||
785 | cleanup: | ||
786 | /* Detach tracepoints */ | ||
787 | while (tp_cnt) | ||
788 | bpf_link__destroy(tp_links[--tp_cnt]); | ||
789 | |||
790 | bpf_object__close(obj); | ||
791 | return ret; | ||
792 | } | ||
diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu_kern.c new file mode 100644 index 000000000..8255025de --- /dev/null +++ b/samples/bpf/xdp_redirect_cpu_kern.c | |||
@@ -0,0 +1,730 @@ | |||
1 | /* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP) | ||
2 | * | ||
3 | * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. | ||
4 | */ | ||
5 | #include <uapi/linux/if_ether.h> | ||
6 | #include <uapi/linux/if_packet.h> | ||
7 | #include <uapi/linux/if_vlan.h> | ||
8 | #include <uapi/linux/ip.h> | ||
9 | #include <uapi/linux/ipv6.h> | ||
10 | #include <uapi/linux/in.h> | ||
11 | #include <uapi/linux/tcp.h> | ||
12 | #include <uapi/linux/udp.h> | ||
13 | |||
14 | #include <uapi/linux/bpf.h> | ||
15 | #include <bpf/bpf_helpers.h> | ||
16 | #include "hash_func01.h" | ||
17 | |||
18 | #define MAX_CPUS NR_CPUS | ||
19 | |||
20 | /* Special map type that can XDP_REDIRECT frames to another CPU */ | ||
21 | struct { | ||
22 | __uint(type, BPF_MAP_TYPE_CPUMAP); | ||
23 | __uint(key_size, sizeof(u32)); | ||
24 | __uint(value_size, sizeof(struct bpf_cpumap_val)); | ||
25 | __uint(max_entries, MAX_CPUS); | ||
26 | } cpu_map SEC(".maps"); | ||
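For orientation only: the user-space side (xdp_redirect_cpu_user.c, later in this commit) populates this CPUMAP with bpf_map_update_elem() and one struct bpf_cpumap_val per destination CPU. A minimal sketch of that step, assuming the map fd has already been obtained (e.g. via bpf_object__find_map_fd_by_name()) and assuming a bpf_prog.fd of 0 means "no second-level XDP program":

#include <stdio.h>
#include <bpf/bpf.h>

/* Sketch only: enable one CPU as a cpumap redirect target. */
static int add_cpumap_entry(int cpu_map_fd, __u32 cpu, __u32 qsize)
{
	struct bpf_cpumap_val val = {
		.qsize = qsize,		/* per-CPU queue size */
		.bpf_prog.fd = 0,	/* assumed: no second-level XDP prog */
	};

	if (bpf_map_update_elem(cpu_map_fd, &cpu, &val, 0)) {
		fprintf(stderr, "adding CPU %u to cpumap failed\n", cpu);
		return -1;
	}
	return 0;
}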
27 | |||
28 | /* Common stats data record to keep userspace more simple */ | ||
29 | struct datarec { | ||
30 | __u64 processed; | ||
31 | __u64 dropped; | ||
32 | __u64 issue; | ||
33 | __u64 xdp_pass; | ||
34 | __u64 xdp_drop; | ||
35 | __u64 xdp_redirect; | ||
36 | }; | ||
37 | |||
38 | /* Count RX packets, as XDP bpf_prog doesn't get direct TX-success | ||
39 | * feedback. Redirect TX errors can be caught via a tracepoint. | ||
40 | */ | ||
41 | struct { | ||
42 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
43 | __type(key, u32); | ||
44 | __type(value, struct datarec); | ||
45 | __uint(max_entries, 1); | ||
46 | } rx_cnt SEC(".maps"); | ||
47 | |||
48 | /* Used by trace point */ | ||
49 | struct { | ||
50 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
51 | __type(key, u32); | ||
52 | __type(value, struct datarec); | ||
53 | __uint(max_entries, 2); | ||
54 | /* TODO: have entries for all possible errno's */ | ||
55 | } redirect_err_cnt SEC(".maps"); | ||
56 | |||
57 | /* Used by trace point */ | ||
58 | struct { | ||
59 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
60 | __type(key, u32); | ||
61 | __type(value, struct datarec); | ||
62 | __uint(max_entries, MAX_CPUS); | ||
63 | } cpumap_enqueue_cnt SEC(".maps"); | ||
64 | |||
65 | /* Used by trace point */ | ||
66 | struct { | ||
67 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
68 | __type(key, u32); | ||
69 | __type(value, struct datarec); | ||
70 | __uint(max_entries, 1); | ||
71 | } cpumap_kthread_cnt SEC(".maps"); | ||
72 | |||
73 | /* Set of maps controlling which CPUs are available, and for iterating | ||
74 | * through the selectable redirect CPUs. | ||
75 | */ | ||
76 | struct { | ||
77 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
78 | __type(key, u32); | ||
79 | __type(value, u32); | ||
80 | __uint(max_entries, MAX_CPUS); | ||
81 | } cpus_available SEC(".maps"); | ||
82 | struct { | ||
83 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
84 | __type(key, u32); | ||
85 | __type(value, u32); | ||
86 | __uint(max_entries, 1); | ||
87 | } cpus_count SEC(".maps"); | ||
88 | struct { | ||
89 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
90 | __type(key, u32); | ||
91 | __type(value, u32); | ||
92 | __uint(max_entries, 1); | ||
93 | } cpus_iterator SEC(".maps"); | ||
94 | |||
95 | /* Used by trace point */ | ||
96 | struct { | ||
97 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
98 | __type(key, u32); | ||
99 | __type(value, struct datarec); | ||
100 | __uint(max_entries, 1); | ||
101 | } exception_cnt SEC(".maps"); | ||
102 | |||
103 | /* Helper parse functions */ | ||
104 | |||
105 | /* Parse Ethernet layer 2, extract network layer 3 offset and protocol. | ||
106 | * Returns false on error or an unsupported ether-type. On success, *eth_proto | ||
107 | * is in host byte order and *l3_offset is the offset of the L3 header. | ||
108 | */ | ||
109 | struct vlan_hdr { | ||
110 | __be16 h_vlan_TCI; | ||
111 | __be16 h_vlan_encapsulated_proto; | ||
112 | }; | ||
113 | |||
114 | static __always_inline | ||
115 | bool parse_eth(struct ethhdr *eth, void *data_end, | ||
116 | u16 *eth_proto, u64 *l3_offset) | ||
117 | { | ||
118 | u16 eth_type; | ||
119 | u64 offset; | ||
120 | |||
121 | offset = sizeof(*eth); | ||
122 | if ((void *)eth + offset > data_end) | ||
123 | return false; | ||
124 | |||
125 | eth_type = eth->h_proto; | ||
126 | |||
127 | /* Skip non 802.3 Ethertypes */ | ||
128 | if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN)) | ||
129 | return false; | ||
130 | |||
131 | /* Handle VLAN tagged packet */ | ||
132 | if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) { | ||
133 | struct vlan_hdr *vlan_hdr; | ||
134 | |||
135 | vlan_hdr = (void *)eth + offset; | ||
136 | offset += sizeof(*vlan_hdr); | ||
137 | if ((void *)eth + offset > data_end) | ||
138 | return false; | ||
139 | eth_type = vlan_hdr->h_vlan_encapsulated_proto; | ||
140 | } | ||
141 | /* Handle double VLAN tagged packet */ | ||
142 | if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) { | ||
143 | struct vlan_hdr *vlan_hdr; | ||
144 | |||
145 | vlan_hdr = (void *)eth + offset; | ||
146 | offset += sizeof(*vlan_hdr); | ||
147 | if ((void *)eth + offset > data_end) | ||
148 | return false; | ||
149 | eth_type = vlan_hdr->h_vlan_encapsulated_proto; | ||
150 | } | ||
151 | |||
152 | *eth_proto = ntohs(eth_type); | ||
153 | *l3_offset = offset; | ||
154 | return true; | ||
155 | } | ||
156 | |||
157 | static __always_inline | ||
158 | u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off) | ||
159 | { | ||
160 | void *data_end = (void *)(long)ctx->data_end; | ||
161 | void *data = (void *)(long)ctx->data; | ||
162 | struct iphdr *iph = data + nh_off; | ||
163 | struct udphdr *udph; | ||
164 | u16 dport; | ||
165 | |||
166 | if (iph + 1 > data_end) | ||
167 | return 0; | ||
168 | if (!(iph->protocol == IPPROTO_UDP)) | ||
169 | return 0; | ||
170 | |||
171 | udph = (void *)(iph + 1); | ||
172 | if (udph + 1 > data_end) | ||
173 | return 0; | ||
174 | |||
175 | dport = ntohs(udph->dest); | ||
176 | return dport; | ||
177 | } | ||
178 | |||
179 | static __always_inline | ||
180 | int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off) | ||
181 | { | ||
182 | void *data_end = (void *)(long)ctx->data_end; | ||
183 | void *data = (void *)(long)ctx->data; | ||
184 | struct iphdr *iph = data + nh_off; | ||
185 | |||
186 | if (iph + 1 > data_end) | ||
187 | return 0; | ||
188 | return iph->protocol; | ||
189 | } | ||
190 | |||
191 | static __always_inline | ||
192 | int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off) | ||
193 | { | ||
194 | void *data_end = (void *)(long)ctx->data_end; | ||
195 | void *data = (void *)(long)ctx->data; | ||
196 | struct ipv6hdr *ip6h = data + nh_off; | ||
197 | |||
198 | if (ip6h + 1 > data_end) | ||
199 | return 0; | ||
200 | return ip6h->nexthdr; | ||
201 | } | ||
202 | |||
203 | SEC("xdp_cpu_map0") | ||
204 | int xdp_prognum0_no_touch(struct xdp_md *ctx) | ||
205 | { | ||
206 | void *data_end = (void *)(long)ctx->data_end; | ||
207 | void *data = (void *)(long)ctx->data; | ||
208 | struct datarec *rec; | ||
209 | u32 *cpu_selected; | ||
210 | u32 cpu_dest; | ||
211 | u32 key = 0; | ||
212 | |||
213 | /* Only use first entry in cpus_available */ | ||
214 | cpu_selected = bpf_map_lookup_elem(&cpus_available, &key); | ||
215 | if (!cpu_selected) | ||
216 | return XDP_ABORTED; | ||
217 | cpu_dest = *cpu_selected; | ||
218 | |||
219 | /* Count RX packet in map */ | ||
220 | rec = bpf_map_lookup_elem(&rx_cnt, &key); | ||
221 | if (!rec) | ||
222 | return XDP_ABORTED; | ||
223 | rec->processed++; | ||
224 | |||
225 | if (cpu_dest >= MAX_CPUS) { | ||
226 | rec->issue++; | ||
227 | return XDP_ABORTED; | ||
228 | } | ||
229 | |||
230 | return bpf_redirect_map(&cpu_map, cpu_dest, 0); | ||
231 | } | ||
232 | |||
233 | SEC("xdp_cpu_map1_touch_data") | ||
234 | int xdp_prognum1_touch_data(struct xdp_md *ctx) | ||
235 | { | ||
236 | void *data_end = (void *)(long)ctx->data_end; | ||
237 | void *data = (void *)(long)ctx->data; | ||
238 | struct ethhdr *eth = data; | ||
239 | struct datarec *rec; | ||
240 | u32 *cpu_selected; | ||
241 | u32 cpu_dest; | ||
242 | u16 eth_type; | ||
243 | u32 key = 0; | ||
244 | |||
245 | /* Only use first entry in cpus_available */ | ||
246 | cpu_selected = bpf_map_lookup_elem(&cpus_available, &key); | ||
247 | if (!cpu_selected) | ||
248 | return XDP_ABORTED; | ||
249 | cpu_dest = *cpu_selected; | ||
250 | |||
251 | /* Validate that packet length covers at least the Eth header */ | ||
252 | if (eth + 1 > data_end) | ||
253 | return XDP_ABORTED; | ||
254 | |||
255 | /* Count RX packet in map */ | ||
256 | rec = bpf_map_lookup_elem(&rx_cnt, &key); | ||
257 | if (!rec) | ||
258 | return XDP_ABORTED; | ||
259 | rec->processed++; | ||
260 | |||
261 | /* Read packet data, and use it (drop non 802.3 Ethertypes) */ | ||
262 | eth_type = eth->h_proto; | ||
263 | if (ntohs(eth_type) < ETH_P_802_3_MIN) { | ||
264 | rec->dropped++; | ||
265 | return XDP_DROP; | ||
266 | } | ||
267 | |||
268 | if (cpu_dest >= MAX_CPUS) { | ||
269 | rec->issue++; | ||
270 | return XDP_ABORTED; | ||
271 | } | ||
272 | |||
273 | return bpf_redirect_map(&cpu_map, cpu_dest, 0); | ||
274 | } | ||
275 | |||
276 | SEC("xdp_cpu_map2_round_robin") | ||
277 | int xdp_prognum2_round_robin(struct xdp_md *ctx) | ||
278 | { | ||
279 | void *data_end = (void *)(long)ctx->data_end; | ||
280 | void *data = (void *)(long)ctx->data; | ||
281 | struct ethhdr *eth = data; | ||
282 | struct datarec *rec; | ||
283 | u32 cpu_dest; | ||
284 | u32 *cpu_lookup; | ||
285 | u32 key0 = 0; | ||
286 | |||
287 | u32 *cpu_selected; | ||
288 | u32 *cpu_iterator; | ||
289 | u32 *cpu_max; | ||
290 | u32 cpu_idx; | ||
291 | |||
292 | cpu_max = bpf_map_lookup_elem(&cpus_count, &key0); | ||
293 | if (!cpu_max) | ||
294 | return XDP_ABORTED; | ||
295 | |||
296 | cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0); | ||
297 | if (!cpu_iterator) | ||
298 | return XDP_ABORTED; | ||
299 | cpu_idx = *cpu_iterator; | ||
300 | |||
301 | *cpu_iterator += 1; | ||
302 | if (*cpu_iterator == *cpu_max) | ||
303 | *cpu_iterator = 0; | ||
304 | |||
305 | cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx); | ||
306 | if (!cpu_selected) | ||
307 | return XDP_ABORTED; | ||
308 | cpu_dest = *cpu_selected; | ||
309 | |||
310 | /* Count RX packet in map */ | ||
311 | rec = bpf_map_lookup_elem(&rx_cnt, &key0); | ||
312 | if (!rec) | ||
313 | return XDP_ABORTED; | ||
314 | rec->processed++; | ||
315 | |||
316 | if (cpu_dest >= MAX_CPUS) { | ||
317 | rec->issue++; | ||
318 | return XDP_ABORTED; | ||
319 | } | ||
320 | |||
321 | return bpf_redirect_map(&cpu_map, cpu_dest, 0); | ||
322 | } | ||
323 | |||
324 | SEC("xdp_cpu_map3_proto_separate") | ||
325 | int xdp_prognum3_proto_separate(struct xdp_md *ctx) | ||
326 | { | ||
327 | void *data_end = (void *)(long)ctx->data_end; | ||
328 | void *data = (void *)(long)ctx->data; | ||
329 | struct ethhdr *eth = data; | ||
330 | u8 ip_proto = IPPROTO_UDP; | ||
331 | struct datarec *rec; | ||
332 | u16 eth_proto = 0; | ||
333 | u64 l3_offset = 0; | ||
334 | u32 cpu_dest = 0; | ||
335 | u32 cpu_idx = 0; | ||
336 | u32 *cpu_lookup; | ||
337 | u32 key = 0; | ||
338 | |||
339 | /* Count RX packet in map */ | ||
340 | rec = bpf_map_lookup_elem(&rx_cnt, &key); | ||
341 | if (!rec) | ||
342 | return XDP_ABORTED; | ||
343 | rec->processed++; | ||
344 | |||
345 | if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) | ||
346 | return XDP_PASS; /* Just skip */ | ||
347 | |||
348 | /* Extract L4 protocol */ | ||
349 | switch (eth_proto) { | ||
350 | case ETH_P_IP: | ||
351 | ip_proto = get_proto_ipv4(ctx, l3_offset); | ||
352 | break; | ||
353 | case ETH_P_IPV6: | ||
354 | ip_proto = get_proto_ipv6(ctx, l3_offset); | ||
355 | break; | ||
356 | case ETH_P_ARP: | ||
357 | cpu_idx = 0; /* ARP packet handled on separate CPU */ | ||
358 | break; | ||
359 | default: | ||
360 | cpu_idx = 0; | ||
361 | } | ||
362 | |||
363 | /* Choose CPU based on L4 protocol */ | ||
364 | switch (ip_proto) { | ||
365 | case IPPROTO_ICMP: | ||
366 | case IPPROTO_ICMPV6: | ||
367 | cpu_idx = 2; | ||
368 | break; | ||
369 | case IPPROTO_TCP: | ||
370 | cpu_idx = 0; | ||
371 | break; | ||
372 | case IPPROTO_UDP: | ||
373 | cpu_idx = 1; | ||
374 | break; | ||
375 | default: | ||
376 | cpu_idx = 0; | ||
377 | } | ||
378 | |||
379 | cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); | ||
380 | if (!cpu_lookup) | ||
381 | return XDP_ABORTED; | ||
382 | cpu_dest = *cpu_lookup; | ||
383 | |||
384 | if (cpu_dest >= MAX_CPUS) { | ||
385 | rec->issue++; | ||
386 | return XDP_ABORTED; | ||
387 | } | ||
388 | |||
389 | return bpf_redirect_map(&cpu_map, cpu_dest, 0); | ||
390 | } | ||
391 | |||
392 | SEC("xdp_cpu_map4_ddos_filter_pktgen") | ||
393 | int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx) | ||
394 | { | ||
395 | void *data_end = (void *)(long)ctx->data_end; | ||
396 | void *data = (void *)(long)ctx->data; | ||
397 | struct ethhdr *eth = data; | ||
398 | u8 ip_proto = IPPROTO_UDP; | ||
399 | struct datarec *rec; | ||
400 | u16 eth_proto = 0; | ||
401 | u64 l3_offset = 0; | ||
402 | u32 cpu_dest = 0; | ||
403 | u32 cpu_idx = 0; | ||
404 | u16 dest_port; | ||
405 | u32 *cpu_lookup; | ||
406 | u32 key = 0; | ||
407 | |||
408 | /* Count RX packet in map */ | ||
409 | rec = bpf_map_lookup_elem(&rx_cnt, &key); | ||
410 | if (!rec) | ||
411 | return XDP_ABORTED; | ||
412 | rec->processed++; | ||
413 | |||
414 | if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) | ||
415 | return XDP_PASS; /* Just skip */ | ||
416 | |||
417 | /* Extract L4 protocol */ | ||
418 | switch (eth_proto) { | ||
419 | case ETH_P_IP: | ||
420 | ip_proto = get_proto_ipv4(ctx, l3_offset); | ||
421 | break; | ||
422 | case ETH_P_IPV6: | ||
423 | ip_proto = get_proto_ipv6(ctx, l3_offset); | ||
424 | break; | ||
425 | case ETH_P_ARP: | ||
426 | cpu_idx = 0; /* ARP packet handled on separate CPU */ | ||
427 | break; | ||
428 | default: | ||
429 | cpu_idx = 0; | ||
430 | } | ||
431 | |||
432 | /* Choose CPU based on L4 protocol */ | ||
433 | switch (ip_proto) { | ||
434 | case IPPROTO_ICMP: | ||
435 | case IPPROTO_ICMPV6: | ||
436 | cpu_idx = 2; | ||
437 | break; | ||
438 | case IPPROTO_TCP: | ||
439 | cpu_idx = 0; | ||
440 | break; | ||
441 | case IPPROTO_UDP: | ||
442 | cpu_idx = 1; | ||
443 | /* DDoS filter UDP port 9 (pktgen) */ | ||
444 | dest_port = get_dest_port_ipv4_udp(ctx, l3_offset); | ||
445 | if (dest_port == 9) { | ||
446 | if (rec) | ||
447 | rec->dropped++; | ||
448 | return XDP_DROP; | ||
449 | } | ||
450 | break; | ||
451 | default: | ||
452 | cpu_idx = 0; | ||
453 | } | ||
454 | |||
455 | cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); | ||
456 | if (!cpu_lookup) | ||
457 | return XDP_ABORTED; | ||
458 | cpu_dest = *cpu_lookup; | ||
459 | |||
460 | if (cpu_dest >= MAX_CPUS) { | ||
461 | rec->issue++; | ||
462 | return XDP_ABORTED; | ||
463 | } | ||
464 | |||
465 | return bpf_redirect_map(&cpu_map, cpu_dest, 0); | ||
466 | } | ||
467 | |||
468 | /* Hashing initval */ | ||
469 | #define INITVAL 15485863 | ||
470 | |||
471 | static __always_inline | ||
472 | u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) | ||
473 | { | ||
474 | void *data_end = (void *)(long)ctx->data_end; | ||
475 | void *data = (void *)(long)ctx->data; | ||
476 | struct iphdr *iph = data + nh_off; | ||
477 | u32 cpu_hash; | ||
478 | |||
479 | if (iph + 1 > data_end) | ||
480 | return 0; | ||
481 | |||
482 | cpu_hash = iph->saddr + iph->daddr; | ||
483 | cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol); | ||
484 | |||
485 | return cpu_hash; | ||
486 | } | ||
487 | |||
488 | static __always_inline | ||
489 | u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) | ||
490 | { | ||
491 | void *data_end = (void *)(long)ctx->data_end; | ||
492 | void *data = (void *)(long)ctx->data; | ||
493 | struct ipv6hdr *ip6h = data + nh_off; | ||
494 | u32 cpu_hash; | ||
495 | |||
496 | if (ip6h + 1 > data_end) | ||
497 | return 0; | ||
498 | |||
499 | cpu_hash = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0]; | ||
500 | cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1]; | ||
501 | cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2]; | ||
502 | cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3]; | ||
503 | cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr); | ||
504 | |||
505 | return cpu_hash; | ||
506 | } | ||
507 | |||
508 | /* Load-Balance traffic based on hashing IP-addrs + L4-proto. The | ||
509 | * hashing scheme is symmetric, meaning that swapping IP src/dest still | ||
510 | * hits the same CPU. | ||
511 | */ | ||
512 | SEC("xdp_cpu_map5_lb_hash_ip_pairs") | ||
513 | int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx) | ||
514 | { | ||
515 | void *data_end = (void *)(long)ctx->data_end; | ||
516 | void *data = (void *)(long)ctx->data; | ||
517 | struct ethhdr *eth = data; | ||
518 | u8 ip_proto = IPPROTO_UDP; | ||
519 | struct datarec *rec; | ||
520 | u16 eth_proto = 0; | ||
521 | u64 l3_offset = 0; | ||
522 | u32 cpu_dest = 0; | ||
523 | u32 cpu_idx = 0; | ||
524 | u32 *cpu_lookup; | ||
525 | u32 *cpu_max; | ||
526 | u32 cpu_hash; | ||
527 | u32 key = 0; | ||
528 | |||
529 | /* Count RX packet in map */ | ||
530 | rec = bpf_map_lookup_elem(&rx_cnt, &key); | ||
531 | if (!rec) | ||
532 | return XDP_ABORTED; | ||
533 | rec->processed++; | ||
534 | |||
535 | cpu_max = bpf_map_lookup_elem(&cpus_count, &key); | ||
536 | if (!cpu_max) | ||
537 | return XDP_ABORTED; | ||
538 | |||
539 | if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) | ||
540 | return XDP_PASS; /* Just skip */ | ||
541 | |||
542 | /* Hash for IPv4 and IPv6 */ | ||
543 | switch (eth_proto) { | ||
544 | case ETH_P_IP: | ||
545 | cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset); | ||
546 | break; | ||
547 | case ETH_P_IPV6: | ||
548 | cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset); | ||
549 | break; | ||
550 | case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */ | ||
551 | default: | ||
552 | cpu_hash = 0; | ||
553 | } | ||
554 | |||
555 | /* Choose CPU based on hash */ | ||
556 | cpu_idx = cpu_hash % *cpu_max; | ||
557 | |||
558 | cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); | ||
559 | if (!cpu_lookup) | ||
560 | return XDP_ABORTED; | ||
561 | cpu_dest = *cpu_lookup; | ||
562 | |||
563 | if (cpu_dest >= MAX_CPUS) { | ||
564 | rec->issue++; | ||
565 | return XDP_ABORTED; | ||
566 | } | ||
567 | |||
568 | return bpf_redirect_map(&cpu_map, cpu_dest, 0); | ||
569 | } | ||
570 | |||
571 | char _license[] SEC("license") = "GPL"; | ||
572 | |||
573 | /*** Trace point code ***/ | ||
574 | |||
575 | /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format | ||
576 | * Code in: kernel/include/trace/events/xdp.h | ||
577 | */ | ||
578 | struct xdp_redirect_ctx { | ||
579 | u64 __pad; // First 8 bytes are not accessible by bpf code | ||
580 | int prog_id; // offset:8; size:4; signed:1; | ||
581 | u32 act; // offset:12 size:4; signed:0; | ||
582 | int ifindex; // offset:16 size:4; signed:1; | ||
583 | int err; // offset:20 size:4; signed:1; | ||
584 | int to_ifindex; // offset:24 size:4; signed:1; | ||
585 | u32 map_id; // offset:28 size:4; signed:0; | ||
586 | int map_index; // offset:32 size:4; signed:1; | ||
587 | }; // offset:36 | ||
588 | |||
589 | enum { | ||
590 | XDP_REDIRECT_SUCCESS = 0, | ||
591 | XDP_REDIRECT_ERROR = 1 | ||
592 | }; | ||
593 | |||
594 | static __always_inline | ||
595 | int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) | ||
596 | { | ||
597 | u32 key = XDP_REDIRECT_ERROR; | ||
598 | struct datarec *rec; | ||
599 | int err = ctx->err; | ||
600 | |||
601 | if (!err) | ||
602 | key = XDP_REDIRECT_SUCCESS; | ||
603 | |||
604 | rec = bpf_map_lookup_elem(&redirect_err_cnt, &key); | ||
605 | if (!rec) | ||
606 | return 0; | ||
607 | rec->dropped += 1; | ||
608 | |||
609 | return 0; /* Indicate event was filtered (no further processing) */ | ||
610 | /* | ||
611 | * Returning 1 here would allow e.g. a perf-record tracepoint | ||
612 | * to see and record these events, but it doesn't work well | ||
613 | * in practice, as stopping perf-record also unloads this | ||
614 | * bpf_prog. Plus, there is additional overhead of doing so. | ||
615 | */ | ||
616 | } | ||
617 | |||
618 | SEC("tracepoint/xdp/xdp_redirect_err") | ||
619 | int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx) | ||
620 | { | ||
621 | return xdp_redirect_collect_stat(ctx); | ||
622 | } | ||
623 | |||
624 | SEC("tracepoint/xdp/xdp_redirect_map_err") | ||
625 | int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx) | ||
626 | { | ||
627 | return xdp_redirect_collect_stat(ctx); | ||
628 | } | ||
629 | |||
630 | /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format | ||
631 | * Code in: kernel/include/trace/events/xdp.h | ||
632 | */ | ||
633 | struct xdp_exception_ctx { | ||
634 | u64 __pad; // First 8 bytes are not accessible by bpf code | ||
635 | int prog_id; // offset:8; size:4; signed:1; | ||
636 | u32 act; // offset:12; size:4; signed:0; | ||
637 | int ifindex; // offset:16; size:4; signed:1; | ||
638 | }; | ||
639 | |||
640 | SEC("tracepoint/xdp/xdp_exception") | ||
641 | int trace_xdp_exception(struct xdp_exception_ctx *ctx) | ||
642 | { | ||
643 | struct datarec *rec; | ||
644 | u32 key = 0; | ||
645 | |||
646 | rec = bpf_map_lookup_elem(&exception_cnt, &key); | ||
647 | if (!rec) | ||
648 | return 1; | ||
649 | rec->dropped += 1; | ||
650 | |||
651 | return 0; | ||
652 | } | ||
653 | |||
654 | /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format | ||
655 | * Code in: kernel/include/trace/events/xdp.h | ||
656 | */ | ||
657 | struct cpumap_enqueue_ctx { | ||
658 | u64 __pad; // First 8 bytes are not accessible by bpf code | ||
659 | int map_id; // offset:8; size:4; signed:1; | ||
660 | u32 act; // offset:12; size:4; signed:0; | ||
661 | int cpu; // offset:16; size:4; signed:1; | ||
662 | unsigned int drops; // offset:20; size:4; signed:0; | ||
663 | unsigned int processed; // offset:24; size:4; signed:0; | ||
664 | int to_cpu; // offset:28; size:4; signed:1; | ||
665 | }; | ||
666 | |||
667 | SEC("tracepoint/xdp/xdp_cpumap_enqueue") | ||
668 | int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) | ||
669 | { | ||
670 | u32 to_cpu = ctx->to_cpu; | ||
671 | struct datarec *rec; | ||
672 | |||
673 | if (to_cpu >= MAX_CPUS) | ||
674 | return 1; | ||
675 | |||
676 | rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); | ||
677 | if (!rec) | ||
678 | return 0; | ||
679 | rec->processed += ctx->processed; | ||
680 | rec->dropped += ctx->drops; | ||
681 | |||
682 | /* Record bulk events, then userspace can calc average bulk size */ | ||
683 | if (ctx->processed > 0) | ||
684 | rec->issue += 1; | ||
685 | |||
686 | /* Inception: It's possible to detect overload situations via | ||
687 | * this tracepoint. This can be used for creating a feedback | ||
688 | * loop to XDP, which can take appropriate actions to mitigate | ||
689 | * this overload situation. | ||
690 | */ | ||
691 | return 0; | ||
692 | } | ||
693 | |||
694 | /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format | ||
695 | * Code in: kernel/include/trace/events/xdp.h | ||
696 | */ | ||
697 | struct cpumap_kthread_ctx { | ||
698 | u64 __pad; // First 8 bytes are not accessible | ||
699 | int map_id; // offset:8; size:4; signed:1; | ||
700 | u32 act; // offset:12; size:4; signed:0; | ||
701 | int cpu; // offset:16; size:4; signed:1; | ||
702 | unsigned int drops; // offset:20; size:4; signed:0; | ||
703 | unsigned int processed; // offset:24; size:4; signed:0; | ||
704 | int sched; // offset:28; size:4; signed:1; | ||
705 | unsigned int xdp_pass; // offset:32; size:4; signed:0; | ||
706 | unsigned int xdp_drop; // offset:36; size:4; signed:0; | ||
707 | unsigned int xdp_redirect; // offset:40; size:4; signed:0; | ||
708 | }; | ||
709 | |||
710 | SEC("tracepoint/xdp/xdp_cpumap_kthread") | ||
711 | int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) | ||
712 | { | ||
713 | struct datarec *rec; | ||
714 | u32 key = 0; | ||
715 | |||
716 | rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); | ||
717 | if (!rec) | ||
718 | return 0; | ||
719 | rec->processed += ctx->processed; | ||
720 | rec->dropped += ctx->drops; | ||
721 | rec->xdp_pass += ctx->xdp_pass; | ||
722 | rec->xdp_drop += ctx->xdp_drop; | ||
723 | rec->xdp_redirect += ctx->xdp_redirect; | ||
724 | |||
725 | /* Count times kthread yielded CPU via schedule call */ | ||
726 | if (ctx->sched) | ||
727 | rec->issue++; | ||
728 | |||
729 | return 0; | ||
730 | } | ||
diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c new file mode 100644 index 000000000..16eb839e7 --- /dev/null +++ b/samples/bpf/xdp_redirect_cpu_user.c | |||
@@ -0,0 +1,983 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. | ||
3 | */ | ||
4 | static const char *__doc__ = | ||
5 | " XDP redirect with a CPU-map type \"BPF_MAP_TYPE_CPUMAP\""; | ||
6 | |||
7 | #include <errno.h> | ||
8 | #include <signal.h> | ||
9 | #include <stdio.h> | ||
10 | #include <stdlib.h> | ||
11 | #include <stdbool.h> | ||
12 | #include <string.h> | ||
13 | #include <unistd.h> | ||
14 | #include <locale.h> | ||
15 | #include <sys/resource.h> | ||
16 | #include <sys/sysinfo.h> | ||
17 | #include <getopt.h> | ||
18 | #include <net/if.h> | ||
19 | #include <time.h> | ||
20 | #include <linux/limits.h> | ||
21 | |||
22 | #include <arpa/inet.h> | ||
23 | #include <linux/if_link.h> | ||
24 | |||
25 | /* How many xdp_progs are defined in _kern.c */ | ||
26 | #define MAX_PROG 6 | ||
27 | |||
28 | #include <bpf/bpf.h> | ||
29 | #include <bpf/libbpf.h> | ||
30 | |||
31 | #include "bpf_util.h" | ||
32 | |||
33 | static int ifindex = -1; | ||
34 | static char ifname_buf[IF_NAMESIZE]; | ||
35 | static char *ifname; | ||
36 | static __u32 prog_id; | ||
37 | |||
38 | static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
39 | static int n_cpus; | ||
40 | |||
41 | enum map_type { | ||
42 | CPU_MAP, | ||
43 | RX_CNT, | ||
44 | REDIRECT_ERR_CNT, | ||
45 | CPUMAP_ENQUEUE_CNT, | ||
46 | CPUMAP_KTHREAD_CNT, | ||
47 | CPUS_AVAILABLE, | ||
48 | CPUS_COUNT, | ||
49 | CPUS_ITERATOR, | ||
50 | EXCEPTION_CNT, | ||
51 | }; | ||
52 | |||
53 | static const char *const map_type_strings[] = { | ||
54 | [CPU_MAP] = "cpu_map", | ||
55 | [RX_CNT] = "rx_cnt", | ||
56 | [REDIRECT_ERR_CNT] = "redirect_err_cnt", | ||
57 | [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt", | ||
58 | [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt", | ||
59 | [CPUS_AVAILABLE] = "cpus_available", | ||
60 | [CPUS_COUNT] = "cpus_count", | ||
61 | [CPUS_ITERATOR] = "cpus_iterator", | ||
62 | [EXCEPTION_CNT] = "exception_cnt", | ||
63 | }; | ||
64 | |||
65 | #define NUM_TP 5 | ||
66 | #define NUM_MAP 9 | ||
67 | struct bpf_link *tp_links[NUM_TP] = {}; | ||
68 | static int map_fds[NUM_MAP]; | ||
69 | static int tp_cnt = 0; | ||
70 | |||
71 | /* Exit return codes */ | ||
72 | #define EXIT_OK 0 | ||
73 | #define EXIT_FAIL 1 | ||
74 | #define EXIT_FAIL_OPTION 2 | ||
75 | #define EXIT_FAIL_XDP 3 | ||
76 | #define EXIT_FAIL_BPF 4 | ||
77 | #define EXIT_FAIL_MEM 5 | ||
78 | |||
79 | static const struct option long_options[] = { | ||
80 | {"help", no_argument, NULL, 'h' }, | ||
81 | {"dev", required_argument, NULL, 'd' }, | ||
82 | {"skb-mode", no_argument, NULL, 'S' }, | ||
83 | {"sec", required_argument, NULL, 's' }, | ||
84 | {"progname", required_argument, NULL, 'p' }, | ||
85 | {"qsize", required_argument, NULL, 'q' }, | ||
86 | {"cpu", required_argument, NULL, 'c' }, | ||
87 | {"stress-mode", no_argument, NULL, 'x' }, | ||
88 | {"no-separators", no_argument, NULL, 'z' }, | ||
89 | {"force", no_argument, NULL, 'F' }, | ||
90 | {"mprog-disable", no_argument, NULL, 'n' }, | ||
91 | {"mprog-name", required_argument, NULL, 'e' }, | ||
92 | {"mprog-filename", required_argument, NULL, 'f' }, | ||
93 | {"redirect-device", required_argument, NULL, 'r' }, | ||
94 | {"redirect-map", required_argument, NULL, 'm' }, | ||
95 | {0, 0, NULL, 0 } | ||
96 | }; | ||
97 | |||
98 | static void int_exit(int sig) | ||
99 | { | ||
100 | __u32 curr_prog_id = 0; | ||
101 | |||
102 | if (ifindex > -1) { | ||
103 | if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) { | ||
104 | printf("bpf_get_link_xdp_id failed\n"); | ||
105 | exit(EXIT_FAIL); | ||
106 | } | ||
107 | if (prog_id == curr_prog_id) { | ||
108 | fprintf(stderr, | ||
109 | "Interrupted: Removing XDP program on ifindex:%d device:%s\n", | ||
110 | ifindex, ifname); | ||
111 | bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); | ||
112 | } else if (!curr_prog_id) { | ||
113 | printf("couldn't find a prog id on a given iface\n"); | ||
114 | } else { | ||
115 | printf("program on interface changed, not removing\n"); | ||
116 | } | ||
117 | } | ||
118 | /* Detach tracepoints */ | ||
119 | while (tp_cnt) | ||
120 | bpf_link__destroy(tp_links[--tp_cnt]); | ||
121 | |||
122 | exit(EXIT_OK); | ||
123 | } | ||
124 | |||
125 | static void print_avail_progs(struct bpf_object *obj) | ||
126 | { | ||
127 | struct bpf_program *pos; | ||
128 | |||
129 | bpf_object__for_each_program(pos, obj) { | ||
130 | if (bpf_program__is_xdp(pos)) | ||
131 | printf(" %s\n", bpf_program__section_name(pos)); | ||
132 | } | ||
133 | } | ||
134 | |||
135 | static void usage(char *argv[], struct bpf_object *obj) | ||
136 | { | ||
137 | int i; | ||
138 | |||
139 | printf("\nDOCUMENTATION:\n%s\n", __doc__); | ||
140 | printf("\n"); | ||
141 | printf(" Usage: %s (options-see-below)\n", argv[0]); | ||
142 | printf(" Listing options:\n"); | ||
143 | for (i = 0; long_options[i].name != 0; i++) { | ||
144 | printf(" --%-12s", long_options[i].name); | ||
145 | if (long_options[i].flag != NULL) | ||
146 | printf(" flag (internal value:%d)", | ||
147 | *long_options[i].flag); | ||
148 | else | ||
149 | printf(" short-option: -%c", | ||
150 | long_options[i].val); | ||
151 | printf("\n"); | ||
152 | } | ||
153 | printf("\n Programs to be used for --progname:\n"); | ||
154 | print_avail_progs(obj); | ||
155 | printf("\n"); | ||
156 | } | ||
157 | |||
158 | /* gettime returns the current monotonic time in nanoseconds. | ||
159 | * Cost: clock_gettime (ns) => 26ns (CLOCK_MONOTONIC) | ||
160 | * clock_gettime (ns) => 9ns (CLOCK_MONOTONIC_COARSE) | ||
161 | */ | ||
162 | #define NANOSEC_PER_SEC 1000000000 /* 10^9 */ | ||
163 | static __u64 gettime(void) | ||
164 | { | ||
165 | struct timespec t; | ||
166 | int res; | ||
167 | |||
168 | res = clock_gettime(CLOCK_MONOTONIC, &t); | ||
169 | if (res < 0) { | ||
170 | fprintf(stderr, "Error with clock_gettime! (%i)\n", res); | ||
171 | exit(EXIT_FAIL); | ||
172 | } | ||
173 | return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; | ||
174 | } | ||
175 | |||
176 | /* Common stats data record shared with _kern.c */ | ||
177 | struct datarec { | ||
178 | __u64 processed; | ||
179 | __u64 dropped; | ||
180 | __u64 issue; | ||
181 | __u64 xdp_pass; | ||
182 | __u64 xdp_drop; | ||
183 | __u64 xdp_redirect; | ||
184 | }; | ||
185 | struct record { | ||
186 | __u64 timestamp; | ||
187 | struct datarec total; | ||
188 | struct datarec *cpu; | ||
189 | }; | ||
190 | struct stats_record { | ||
191 | struct record rx_cnt; | ||
192 | struct record redir_err; | ||
193 | struct record kthread; | ||
194 | struct record exception; | ||
195 | struct record enq[]; | ||
196 | }; | ||
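/* Note: enq[] is a C99 flexible array member; alloc_stats_record() below
 * sizes it at runtime as sizeof(*rec) + n_cpus * sizeof(struct record),
 * i.e. one enqueue record per configured CPU (n_cpus).
 */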
197 | |||
198 | static bool map_collect_percpu(int fd, __u32 key, struct record *rec) | ||
199 | { | ||
200 | /* For percpu maps, userspace gets a value per possible CPU */ | ||
201 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
202 | struct datarec values[nr_cpus]; | ||
203 | __u64 sum_xdp_redirect = 0; | ||
204 | __u64 sum_xdp_pass = 0; | ||
205 | __u64 sum_xdp_drop = 0; | ||
206 | __u64 sum_processed = 0; | ||
207 | __u64 sum_dropped = 0; | ||
208 | __u64 sum_issue = 0; | ||
209 | int i; | ||
210 | |||
211 | if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { | ||
212 | fprintf(stderr, | ||
213 | "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); | ||
214 | return false; | ||
215 | } | ||
216 | /* Get time as close as possible to reading map contents */ | ||
217 | rec->timestamp = gettime(); | ||
218 | |||
219 | /* Record and sum values from each CPU */ | ||
220 | for (i = 0; i < nr_cpus; i++) { | ||
221 | rec->cpu[i].processed = values[i].processed; | ||
222 | sum_processed += values[i].processed; | ||
223 | rec->cpu[i].dropped = values[i].dropped; | ||
224 | sum_dropped += values[i].dropped; | ||
225 | rec->cpu[i].issue = values[i].issue; | ||
226 | sum_issue += values[i].issue; | ||
227 | rec->cpu[i].xdp_pass = values[i].xdp_pass; | ||
228 | sum_xdp_pass += values[i].xdp_pass; | ||
229 | rec->cpu[i].xdp_drop = values[i].xdp_drop; | ||
230 | sum_xdp_drop += values[i].xdp_drop; | ||
231 | rec->cpu[i].xdp_redirect = values[i].xdp_redirect; | ||
232 | sum_xdp_redirect += values[i].xdp_redirect; | ||
233 | } | ||
234 | rec->total.processed = sum_processed; | ||
235 | rec->total.dropped = sum_dropped; | ||
236 | rec->total.issue = sum_issue; | ||
237 | rec->total.xdp_pass = sum_xdp_pass; | ||
238 | rec->total.xdp_drop = sum_xdp_drop; | ||
239 | rec->total.xdp_redirect = sum_xdp_redirect; | ||
240 | return true; | ||
241 | } | ||
242 | |||
243 | static struct datarec *alloc_record_per_cpu(void) | ||
244 | { | ||
245 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
246 | struct datarec *array; | ||
247 | |||
248 | array = calloc(nr_cpus, sizeof(struct datarec)); | ||
249 | if (!array) { | ||
250 | fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); | ||
251 | exit(EXIT_FAIL_MEM); | ||
252 | } | ||
253 | return array; | ||
254 | } | ||
255 | |||
256 | static struct stats_record *alloc_stats_record(void) | ||
257 | { | ||
258 | struct stats_record *rec; | ||
259 | int i, size; | ||
260 | |||
261 | size = sizeof(*rec) + n_cpus * sizeof(struct record); | ||
262 | rec = malloc(size); | ||
263 | if (!rec) { | ||
264 | fprintf(stderr, "Mem alloc error\n"); | ||
265 | exit(EXIT_FAIL_MEM); | ||
266 | } | ||
267 | memset(rec, 0, size); | ||
268 | rec->rx_cnt.cpu = alloc_record_per_cpu(); | ||
269 | rec->redir_err.cpu = alloc_record_per_cpu(); | ||
270 | rec->kthread.cpu = alloc_record_per_cpu(); | ||
271 | rec->exception.cpu = alloc_record_per_cpu(); | ||
272 | for (i = 0; i < n_cpus; i++) | ||
273 | rec->enq[i].cpu = alloc_record_per_cpu(); | ||
274 | |||
275 | return rec; | ||
276 | } | ||
277 | |||
278 | static void free_stats_record(struct stats_record *r) | ||
279 | { | ||
280 | int i; | ||
281 | |||
282 | for (i = 0; i < n_cpus; i++) | ||
283 | free(r->enq[i].cpu); | ||
284 | free(r->exception.cpu); | ||
285 | free(r->kthread.cpu); | ||
286 | free(r->redir_err.cpu); | ||
287 | free(r->rx_cnt.cpu); | ||
288 | free(r); | ||
289 | } | ||
290 | |||
291 | static double calc_period(struct record *r, struct record *p) | ||
292 | { | ||
293 | double period_ = 0; | ||
294 | __u64 period = 0; | ||
295 | |||
296 | period = r->timestamp - p->timestamp; | ||
297 | if (period > 0) | ||
298 | period_ = ((double) period / NANOSEC_PER_SEC); | ||
299 | |||
300 | return period_; | ||
301 | } | ||
302 | |||
303 | static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_) | ||
304 | { | ||
305 | __u64 packets = 0; | ||
306 | __u64 pps = 0; | ||
307 | |||
308 | if (period_ > 0) { | ||
309 | packets = r->processed - p->processed; | ||
310 | pps = packets / period_; | ||
311 | } | ||
312 | return pps; | ||
313 | } | ||
314 | |||
315 | static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_) | ||
316 | { | ||
317 | __u64 packets = 0; | ||
318 | __u64 pps = 0; | ||
319 | |||
320 | if (period_ > 0) { | ||
321 | packets = r->dropped - p->dropped; | ||
322 | pps = packets / period_; | ||
323 | } | ||
324 | return pps; | ||
325 | } | ||
326 | |||
327 | static __u64 calc_errs_pps(struct datarec *r, | ||
328 | struct datarec *p, double period_) | ||
329 | { | ||
330 | __u64 packets = 0; | ||
331 | __u64 pps = 0; | ||
332 | |||
333 | if (period_ > 0) { | ||
334 | packets = r->issue - p->issue; | ||
335 | pps = packets / period_; | ||
336 | } | ||
337 | return pps; | ||
338 | } | ||
339 | |||
340 | static void calc_xdp_pps(struct datarec *r, struct datarec *p, | ||
341 | double *xdp_pass, double *xdp_drop, | ||
342 | double *xdp_redirect, double period_) | ||
343 | { | ||
344 | *xdp_pass = 0, *xdp_drop = 0, *xdp_redirect = 0; | ||
345 | if (period_ > 0) { | ||
346 | *xdp_redirect = (r->xdp_redirect - p->xdp_redirect) / period_; | ||
347 | *xdp_pass = (r->xdp_pass - p->xdp_pass) / period_; | ||
348 | *xdp_drop = (r->xdp_drop - p->xdp_drop) / period_; | ||
349 | } | ||
350 | } | ||
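The calc_* helpers above all perform the same delta-over-interval arithmetic. A tiny standalone check with made-up numbers (not part of the sample), assuming a 2-second sampling interval:

#include <stdio.h>

int main(void)
{
	unsigned long long prev = 1000000, curr = 3000000; /* 'processed' counters */
	double period = 2.0;                               /* seconds between samples */
	unsigned long long pps = (curr - prev) / period;   /* => 1000000 pps */

	printf("pps = %llu\n", pps);
	return 0;
}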
351 | |||
352 | static void stats_print(struct stats_record *stats_rec, | ||
353 | struct stats_record *stats_prev, | ||
354 | char *prog_name, char *mprog_name, int mprog_fd) | ||
355 | { | ||
356 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
357 | double pps = 0, drop = 0, err = 0; | ||
358 | bool mprog_enabled = false; | ||
359 | struct record *rec, *prev; | ||
360 | int to_cpu; | ||
361 | double t; | ||
362 | int i; | ||
363 | |||
364 | if (mprog_fd > 0) | ||
365 | mprog_enabled = true; | ||
366 | |||
367 | /* Header */ | ||
368 | printf("Running XDP/eBPF prog_name:%s\n", prog_name); | ||
369 | printf("%-15s %-7s %-14s %-11s %-9s\n", | ||
370 | "XDP-cpumap", "CPU:to", "pps", "drop-pps", "extra-info"); | ||
371 | |||
372 | /* XDP rx_cnt */ | ||
373 | { | ||
374 | char *fmt_rx = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n"; | ||
375 | char *fm2_rx = "%-15s %-7s %'-14.0f %'-11.0f\n"; | ||
376 | char *errstr = ""; | ||
377 | |||
378 | rec = &stats_rec->rx_cnt; | ||
379 | prev = &stats_prev->rx_cnt; | ||
380 | t = calc_period(rec, prev); | ||
381 | for (i = 0; i < nr_cpus; i++) { | ||
382 | struct datarec *r = &rec->cpu[i]; | ||
383 | struct datarec *p = &prev->cpu[i]; | ||
384 | |||
385 | pps = calc_pps(r, p, t); | ||
386 | drop = calc_drop_pps(r, p, t); | ||
387 | err = calc_errs_pps(r, p, t); | ||
388 | if (err > 0) | ||
389 | errstr = "cpu-dest/err"; | ||
390 | if (pps > 0) | ||
391 | printf(fmt_rx, "XDP-RX", | ||
392 | i, pps, drop, err, errstr); | ||
393 | } | ||
394 | pps = calc_pps(&rec->total, &prev->total, t); | ||
395 | drop = calc_drop_pps(&rec->total, &prev->total, t); | ||
396 | err = calc_errs_pps(&rec->total, &prev->total, t); | ||
397 | printf(fm2_rx, "XDP-RX", "total", pps, drop); | ||
398 | } | ||
399 | |||
400 | /* cpumap enqueue stats */ | ||
401 | for (to_cpu = 0; to_cpu < n_cpus; to_cpu++) { | ||
402 | char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; | ||
403 | char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; | ||
404 | char *errstr = ""; | ||
405 | |||
406 | rec = &stats_rec->enq[to_cpu]; | ||
407 | prev = &stats_prev->enq[to_cpu]; | ||
408 | t = calc_period(rec, prev); | ||
409 | for (i = 0; i < nr_cpus; i++) { | ||
410 | struct datarec *r = &rec->cpu[i]; | ||
411 | struct datarec *p = &prev->cpu[i]; | ||
412 | |||
413 | pps = calc_pps(r, p, t); | ||
414 | drop = calc_drop_pps(r, p, t); | ||
415 | err = calc_errs_pps(r, p, t); | ||
416 | if (err > 0) { | ||
417 | errstr = "bulk-average"; | ||
418 | err = pps / err; /* calc average bulk size */ | ||
419 | } | ||
420 | if (pps > 0) | ||
421 | printf(fmt, "cpumap-enqueue", | ||
422 | i, to_cpu, pps, drop, err, errstr); | ||
423 | } | ||
424 | pps = calc_pps(&rec->total, &prev->total, t); | ||
425 | if (pps > 0) { | ||
426 | drop = calc_drop_pps(&rec->total, &prev->total, t); | ||
427 | err = calc_errs_pps(&rec->total, &prev->total, t); | ||
428 | if (err > 0) { | ||
429 | errstr = "bulk-average"; | ||
430 | err = pps / err; /* calc average bulk size */ | ||
431 | } | ||
432 | printf(fm2, "cpumap-enqueue", | ||
433 | "sum", to_cpu, pps, drop, err, errstr); | ||
434 | } | ||
435 | } | ||
436 | |||
437 | /* cpumap kthread stats */ | ||
438 | { | ||
439 | char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n"; | ||
440 | char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f %s\n"; | ||
441 | char *e_str = ""; | ||
442 | |||
443 | rec = &stats_rec->kthread; | ||
444 | prev = &stats_prev->kthread; | ||
445 | t = calc_period(rec, prev); | ||
446 | for (i = 0; i < nr_cpus; i++) { | ||
447 | struct datarec *r = &rec->cpu[i]; | ||
448 | struct datarec *p = &prev->cpu[i]; | ||
449 | |||
450 | pps = calc_pps(r, p, t); | ||
451 | drop = calc_drop_pps(r, p, t); | ||
452 | err = calc_errs_pps(r, p, t); | ||
453 | if (err > 0) | ||
454 | e_str = "sched"; | ||
455 | if (pps > 0) | ||
456 | printf(fmt_k, "cpumap_kthread", | ||
457 | i, pps, drop, err, e_str); | ||
458 | } | ||
459 | pps = calc_pps(&rec->total, &prev->total, t); | ||
460 | drop = calc_drop_pps(&rec->total, &prev->total, t); | ||
461 | err = calc_errs_pps(&rec->total, &prev->total, t); | ||
462 | if (err > 0) | ||
463 | e_str = "sched-sum"; | ||
464 | printf(fm2_k, "cpumap_kthread", "total", pps, drop, err, e_str); | ||
465 | } | ||
466 | |||
467 | /* XDP redirect err tracepoints (very unlikely) */ | ||
468 | { | ||
469 | char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n"; | ||
470 | char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n"; | ||
471 | |||
472 | rec = &stats_rec->redir_err; | ||
473 | prev = &stats_prev->redir_err; | ||
474 | t = calc_period(rec, prev); | ||
475 | for (i = 0; i < nr_cpus; i++) { | ||
476 | struct datarec *r = &rec->cpu[i]; | ||
477 | struct datarec *p = &prev->cpu[i]; | ||
478 | |||
479 | pps = calc_pps(r, p, t); | ||
480 | drop = calc_drop_pps(r, p, t); | ||
481 | if (pps > 0) | ||
482 | printf(fmt_err, "redirect_err", i, pps, drop); | ||
483 | } | ||
484 | pps = calc_pps(&rec->total, &prev->total, t); | ||
485 | drop = calc_drop_pps(&rec->total, &prev->total, t); | ||
486 | printf(fm2_err, "redirect_err", "total", pps, drop); | ||
487 | } | ||
488 | |||
489 | /* XDP general exception tracepoints */ | ||
490 | { | ||
491 | char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n"; | ||
492 | char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n"; | ||
493 | |||
494 | rec = &stats_rec->exception; | ||
495 | prev = &stats_prev->exception; | ||
496 | t = calc_period(rec, prev); | ||
497 | for (i = 0; i < nr_cpus; i++) { | ||
498 | struct datarec *r = &rec->cpu[i]; | ||
499 | struct datarec *p = &prev->cpu[i]; | ||
500 | |||
501 | pps = calc_pps(r, p, t); | ||
502 | drop = calc_drop_pps(r, p, t); | ||
503 | if (pps > 0) | ||
504 | printf(fmt_err, "xdp_exception", i, pps, drop); | ||
505 | } | ||
506 | pps = calc_pps(&rec->total, &prev->total, t); | ||
507 | drop = calc_drop_pps(&rec->total, &prev->total, t); | ||
508 | printf(fm2_err, "xdp_exception", "total", pps, drop); | ||
509 | } | ||
510 | |||
511 | /* CPUMAP attached XDP program that runs on remote/destination CPU */ | ||
512 | if (mprog_enabled) { | ||
513 | char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f\n"; | ||
514 | char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f\n"; | ||
515 | double xdp_pass, xdp_drop, xdp_redirect; | ||
516 | |||
517 | printf("\n2nd remote XDP/eBPF prog_name: %s\n", mprog_name); | ||
518 | printf("%-15s %-7s %-14s %-11s %-9s\n", | ||
519 | "XDP-cpumap", "CPU:to", "xdp-pass", "xdp-drop", "xdp-redir"); | ||
520 | |||
521 | rec = &stats_rec->kthread; | ||
522 | prev = &stats_prev->kthread; | ||
523 | t = calc_period(rec, prev); | ||
524 | for (i = 0; i < nr_cpus; i++) { | ||
525 | struct datarec *r = &rec->cpu[i]; | ||
526 | struct datarec *p = &prev->cpu[i]; | ||
527 | |||
528 | calc_xdp_pps(r, p, &xdp_pass, &xdp_drop, | ||
529 | &xdp_redirect, t); | ||
530 | if (xdp_pass > 0 || xdp_drop > 0 || xdp_redirect > 0) | ||
531 | printf(fmt_k, "xdp-in-kthread", i, xdp_pass, xdp_drop, | ||
532 | xdp_redirect); | ||
533 | } | ||
534 | calc_xdp_pps(&rec->total, &prev->total, &xdp_pass, &xdp_drop, | ||
535 | &xdp_redirect, t); | ||
536 | printf(fm2_k, "xdp-in-kthread", "total", xdp_pass, xdp_drop, xdp_redirect); | ||
537 | } | ||
538 | |||
539 | printf("\n"); | ||
540 | fflush(stdout); | ||
541 | } | ||
542 | |||
543 | static void stats_collect(struct stats_record *rec) | ||
544 | { | ||
545 | int fd, i; | ||
546 | |||
547 | fd = map_fds[RX_CNT]; | ||
548 | map_collect_percpu(fd, 0, &rec->rx_cnt); | ||
549 | |||
550 | fd = map_fds[REDIRECT_ERR_CNT]; | ||
551 | map_collect_percpu(fd, 1, &rec->redir_err); | ||
552 | |||
553 | fd = map_fds[CPUMAP_ENQUEUE_CNT]; | ||
554 | for (i = 0; i < n_cpus; i++) | ||
555 | map_collect_percpu(fd, i, &rec->enq[i]); | ||
556 | |||
557 | fd = map_fds[CPUMAP_KTHREAD_CNT]; | ||
558 | map_collect_percpu(fd, 0, &rec->kthread); | ||
559 | |||
560 | fd = map_fds[EXCEPTION_CNT]; | ||
561 | map_collect_percpu(fd, 0, &rec->exception); | ||
562 | } | ||
563 | |||
564 | |||
565 | /* Pointer swap trick */ | ||
566 | static inline void swap(struct stats_record **a, struct stats_record **b) | ||
567 | { | ||
568 | struct stats_record *tmp; | ||
569 | |||
570 | tmp = *a; | ||
571 | *a = *b; | ||
572 | *b = tmp; | ||
573 | } | ||
574 | |||
575 | static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value, | ||
576 | __u32 avail_idx, bool new) | ||
577 | { | ||
578 | __u32 curr_cpus_count = 0; | ||
579 | __u32 key = 0; | ||
580 | int ret; | ||
581 | |||
582 | /* Add a CPU entry to cpumap, as this allocates a cpu entry in | ||
583 | * the kernel for that CPU. | ||
584 | */ | ||
585 | ret = bpf_map_update_elem(map_fds[CPU_MAP], &cpu, value, 0); | ||
586 | if (ret) { | ||
587 | fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret); | ||
588 | exit(EXIT_FAIL_BPF); | ||
589 | } | ||
590 | |||
591 | /* Inform the bpf_progs that a new CPU is available to select | ||
592 | * from, via the control maps. | ||
593 | */ | ||
594 | ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &avail_idx, &cpu, 0); | ||
595 | if (ret) { | ||
596 | fprintf(stderr, "Add to avail CPUs failed\n"); | ||
597 | exit(EXIT_FAIL_BPF); | ||
598 | } | ||
599 | |||
600 | /* Read the current count; when adding a new entry (not replacing), bump it */ | ||
601 | ret = bpf_map_lookup_elem(map_fds[CPUS_COUNT], &key, &curr_cpus_count); | ||
602 | if (ret) { | ||
603 | fprintf(stderr, "Failed reading curr cpus_count\n"); | ||
604 | exit(EXIT_FAIL_BPF); | ||
605 | } | ||
606 | if (new) { | ||
607 | curr_cpus_count++; | ||
608 | ret = bpf_map_update_elem(map_fds[CPUS_COUNT], &key, | ||
609 | &curr_cpus_count, 0); | ||
610 | if (ret) { | ||
611 | fprintf(stderr, "Failed write curr cpus_count\n"); | ||
612 | exit(EXIT_FAIL_BPF); | ||
613 | } | ||
614 | } | ||
615 | /* map_fd[7] = cpus_iterator */ | ||
616 | printf("%s CPU:%u as idx:%u qsize:%d prog_fd: %d (cpus_count:%u)\n", | ||
617 | new ? "Add-new":"Replace", cpu, avail_idx, | ||
618 | value->qsize, value->bpf_prog.fd, curr_cpus_count); | ||
619 | |||
620 | return 0; | ||
621 | } | ||
622 | |||
623 | /* CPUs are zero-indexed. Thus, add a special sentinel default value | ||
624 | * in map cpus_available to mark CPU indexes that are not configured | ||
625 | */ | ||
626 | static void mark_cpus_unavailable(void) | ||
627 | { | ||
628 | __u32 invalid_cpu = n_cpus; | ||
629 | int ret, i; | ||
630 | |||
631 | for (i = 0; i < n_cpus; i++) { | ||
632 | ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &i, | ||
633 | &invalid_cpu, 0); | ||
634 | if (ret) { | ||
635 | fprintf(stderr, "Failed marking CPU unavailable\n"); | ||
636 | exit(EXIT_FAIL_BPF); | ||
637 | } | ||
638 | } | ||
639 | } | ||
640 | |||
641 | /* Stress cpumap management code by concurrently changing underlying cpumap */ | ||
642 | static void stress_cpumap(struct bpf_cpumap_val *value) | ||
643 | { | ||
644 | /* Changing qsize will cause kernel to free and alloc a new | ||
645 | * bpf_cpu_map_entry, with an associated/complicated tear-down | ||
646 | * procedure. | ||
647 | */ | ||
648 | value->qsize = 1024; | ||
649 | create_cpu_entry(1, value, 0, false); | ||
650 | value->qsize = 8; | ||
651 | create_cpu_entry(1, value, 0, false); | ||
652 | value->qsize = 16000; | ||
653 | create_cpu_entry(1, value, 0, false); | ||
654 | } | ||
655 | |||
656 | static void stats_poll(int interval, bool use_separators, char *prog_name, | ||
657 | char *mprog_name, struct bpf_cpumap_val *value, | ||
658 | bool stress_mode) | ||
659 | { | ||
660 | struct stats_record *record, *prev; | ||
661 | int mprog_fd; | ||
662 | |||
663 | record = alloc_stats_record(); | ||
664 | prev = alloc_stats_record(); | ||
665 | stats_collect(record); | ||
666 | |||
667 | /* Trick: to pretty-print with thousands separators, use printf's %' flag */ | ||
668 | if (use_separators) | ||
669 | setlocale(LC_NUMERIC, "en_US"); | ||
670 | |||
671 | while (1) { | ||
672 | swap(&prev, &record); | ||
673 | mprog_fd = value->bpf_prog.fd; | ||
674 | stats_collect(record); | ||
675 | stats_print(record, prev, prog_name, mprog_name, mprog_fd); | ||
676 | sleep(interval); | ||
677 | if (stress_mode) | ||
678 | stress_cpumap(value); | ||
679 | } | ||
680 | |||
681 | free_stats_record(record); | ||
682 | free_stats_record(prev); | ||
683 | } | ||
684 | |||
685 | static int init_tracepoints(struct bpf_object *obj) | ||
686 | { | ||
687 | struct bpf_program *prog; | ||
688 | |||
689 | bpf_object__for_each_program(prog, obj) { | ||
690 | if (bpf_program__is_tracepoint(prog) != true) | ||
691 | continue; | ||
692 | |||
693 | tp_links[tp_cnt] = bpf_program__attach(prog); | ||
694 | if (libbpf_get_error(tp_links[tp_cnt])) { | ||
695 | tp_links[tp_cnt] = NULL; | ||
696 | return -EINVAL; | ||
697 | } | ||
698 | tp_cnt++; | ||
699 | } | ||
700 | |||
701 | return 0; | ||
702 | } | ||
703 | |||
704 | static int init_map_fds(struct bpf_object *obj) | ||
705 | { | ||
706 | enum map_type type; | ||
707 | |||
708 | for (type = 0; type < NUM_MAP; type++) { | ||
709 | map_fds[type] = | ||
710 | bpf_object__find_map_fd_by_name(obj, | ||
711 | map_type_strings[type]); | ||
712 | |||
713 | if (map_fds[type] < 0) | ||
714 | return -ENOENT; | ||
715 | } | ||
716 | |||
717 | return 0; | ||
718 | } | ||
719 | |||
720 | static int load_cpumap_prog(char *file_name, char *prog_name, | ||
721 | char *redir_interface, char *redir_map) | ||
722 | { | ||
723 | struct bpf_prog_load_attr prog_load_attr = { | ||
724 | .prog_type = BPF_PROG_TYPE_XDP, | ||
725 | .expected_attach_type = BPF_XDP_CPUMAP, | ||
726 | .file = file_name, | ||
727 | }; | ||
728 | struct bpf_program *prog; | ||
729 | struct bpf_object *obj; | ||
730 | int fd; | ||
731 | |||
732 | if (bpf_prog_load_xattr(&prog_load_attr, &obj, &fd)) | ||
733 | return -1; | ||
734 | |||
735 | if (fd < 0) { | ||
736 | fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n", | ||
737 | strerror(errno)); | ||
738 | return fd; | ||
739 | } | ||
740 | |||
741 | if (redir_interface && redir_map) { | ||
742 | int err, map_fd, ifindex_out, key = 0; | ||
743 | |||
744 | map_fd = bpf_object__find_map_fd_by_name(obj, redir_map); | ||
745 | if (map_fd < 0) | ||
746 | return map_fd; | ||
747 | |||
748 | ifindex_out = if_nametoindex(redir_interface); | ||
749 | if (!ifindex_out) | ||
750 | return -1; | ||
751 | |||
752 | err = bpf_map_update_elem(map_fd, &key, &ifindex_out, 0); | ||
753 | if (err < 0) | ||
754 | return err; | ||
755 | } | ||
756 | |||
757 | prog = bpf_object__find_program_by_title(obj, prog_name); | ||
758 | if (!prog) { | ||
759 | fprintf(stderr, "bpf_object__find_program_by_title failed\n"); | ||
760 | return EXIT_FAIL; | ||
761 | } | ||
762 | |||
763 | return bpf_program__fd(prog); | ||
764 | } | ||
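For context, the fd returned here is what gets plugged into the CPUMAP value, so the second-level program runs on the remote/destination CPU. A rough sketch of that wiring (not necessarily the exact flow in main() below), reusing load_cpumap_prog() and create_cpu_entry() from this file; the file name, program name, CPU number and qsize are illustrative and mirror the defaults used in main():

/* Sketch only: attach a second-level XDP prog to one cpumap slot. */
static int setup_one_cpu_with_mprog(__u32 cpu, __u32 avail_idx)
{
	struct bpf_cpumap_val value = { .qsize = 128 + 64 };
	int mprog_fd;

	mprog_fd = load_cpumap_prog("xdp_redirect_kern.o", "xdp_redirect_dummy",
				    NULL, NULL);
	if (mprog_fd < 0)
		return mprog_fd;

	value.bpf_prog.fd = mprog_fd;
	return create_cpu_entry(cpu, &value, avail_idx, true /* new */);
}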
765 | |||
766 | int main(int argc, char **argv) | ||
767 | { | ||
768 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
769 | char *prog_name = "xdp_cpu_map5_lb_hash_ip_pairs"; | ||
770 | char *mprog_filename = "xdp_redirect_kern.o"; | ||
771 | char *redir_interface = NULL, *redir_map = NULL; | ||
772 | char *mprog_name = "xdp_redirect_dummy"; | ||
773 | bool mprog_disable = false; | ||
774 | struct bpf_prog_load_attr prog_load_attr = { | ||
775 | .prog_type = BPF_PROG_TYPE_UNSPEC, | ||
776 | }; | ||
777 | struct bpf_prog_info info = {}; | ||
778 | __u32 info_len = sizeof(info); | ||
779 | struct bpf_cpumap_val value; | ||
780 | bool use_separators = true; | ||
781 | bool stress_mode = false; | ||
782 | struct bpf_program *prog; | ||
783 | struct bpf_object *obj; | ||
784 | int err = EXIT_FAIL; | ||
785 | char filename[256]; | ||
786 | int added_cpus = 0; | ||
787 | int longindex = 0; | ||
788 | int interval = 2; | ||
789 | int add_cpu = -1; | ||
790 | int opt, prog_fd; | ||
791 | int *cpu, i; | ||
792 | __u32 qsize; | ||
793 | |||
794 | n_cpus = get_nprocs_conf(); | ||
795 | |||
796 | /* Notice: choosing the queue size is very important with the | ||
797 | * ixgbe driver, because its driver page recycling trick is | ||
798 | * dependent on pages being returned quickly. The number of | ||
799 | * outstanding packets in the system must be less than 2x | ||
800 | * RX-ring size. | ||
801 | */ | ||
802 | qsize = 128+64; | ||
803 | |||
804 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
805 | prog_load_attr.file = filename; | ||
806 | |||
807 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
808 | perror("setrlimit(RLIMIT_MEMLOCK)"); | ||
809 | return 1; | ||
810 | } | ||
811 | |||
812 | if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) | ||
813 | return err; | ||
814 | |||
815 | if (prog_fd < 0) { | ||
816 | fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n", | ||
817 | strerror(errno)); | ||
818 | return err; | ||
819 | } | ||
820 | |||
821 | if (init_tracepoints(obj) < 0) { | ||
822 | fprintf(stderr, "ERR: bpf_program__attach failed\n"); | ||
823 | return err; | ||
824 | } | ||
825 | |||
826 | if (init_map_fds(obj) < 0) { | ||
827 | fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n"); | ||
828 | return err; | ||
829 | } | ||
830 | mark_cpus_unavailable(); | ||
831 | |||
832 | cpu = malloc(n_cpus * sizeof(int)); | ||
833 | if (!cpu) { | ||
834 | fprintf(stderr, "failed to allocate cpu array\n"); | ||
835 | return err; | ||
836 | } | ||
837 | memset(cpu, 0, n_cpus * sizeof(int)); | ||
838 | |||
839 | /* Parse command line args */ | ||
840 | while ((opt = getopt_long(argc, argv, "hSd:s:p:q:c:xzFf:e:r:m:n", | ||
841 | long_options, &longindex)) != -1) { | ||
842 | switch (opt) { | ||
843 | case 'd': | ||
844 | if (strlen(optarg) >= IF_NAMESIZE) { | ||
845 | fprintf(stderr, "ERR: --dev name too long\n"); | ||
846 | goto error; | ||
847 | } | ||
848 | ifname = (char *)&ifname_buf; | ||
849 | strncpy(ifname, optarg, IF_NAMESIZE); | ||
850 | ifindex = if_nametoindex(ifname); | ||
851 | if (ifindex == 0) { | ||
852 | fprintf(stderr, | ||
853 | "ERR: --dev name unknown err(%d):%s\n", | ||
854 | errno, strerror(errno)); | ||
855 | goto error; | ||
856 | } | ||
857 | break; | ||
858 | case 's': | ||
859 | interval = atoi(optarg); | ||
860 | break; | ||
861 | case 'S': | ||
862 | xdp_flags |= XDP_FLAGS_SKB_MODE; | ||
863 | break; | ||
864 | case 'x': | ||
865 | stress_mode = true; | ||
866 | break; | ||
867 | case 'z': | ||
868 | use_separators = false; | ||
869 | break; | ||
870 | case 'p': | ||
871 | /* Selecting eBPF prog to load */ | ||
872 | prog_name = optarg; | ||
873 | break; | ||
874 | case 'n': | ||
875 | mprog_disable = true; | ||
876 | break; | ||
877 | case 'f': | ||
878 | mprog_filename = optarg; | ||
879 | break; | ||
880 | case 'e': | ||
881 | mprog_name = optarg; | ||
882 | break; | ||
883 | case 'r': | ||
884 | redir_interface = optarg; | ||
885 | break; | ||
886 | case 'm': | ||
887 | redir_map = optarg; | ||
888 | break; | ||
889 | case 'c': | ||
890 | /* Add multiple CPUs */ | ||
891 | add_cpu = strtoul(optarg, NULL, 0); | ||
892 | if (add_cpu >= n_cpus) { | ||
893 | fprintf(stderr, | ||
894 | "--cpu nr too large for cpumap err(%d):%s\n", | ||
895 | errno, strerror(errno)); | ||
896 | goto error; | ||
897 | } | ||
898 | cpu[added_cpus++] = add_cpu; | ||
899 | break; | ||
900 | case 'q': | ||
901 | qsize = atoi(optarg); | ||
902 | break; | ||
903 | case 'F': | ||
904 | xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
905 | break; | ||
906 | case 'h': | ||
907 | error: | ||
908 | default: | ||
909 | free(cpu); | ||
910 | usage(argv, obj); | ||
911 | return EXIT_FAIL_OPTION; | ||
912 | } | ||
913 | } | ||
914 | |||
915 | if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) | ||
916 | xdp_flags |= XDP_FLAGS_DRV_MODE; | ||
917 | |||
918 | /* Required option */ | ||
919 | if (ifindex == -1) { | ||
920 | fprintf(stderr, "ERR: required option --dev missing\n"); | ||
921 | usage(argv, obj); | ||
922 | err = EXIT_FAIL_OPTION; | ||
923 | goto out; | ||
924 | } | ||
925 | /* Required option */ | ||
926 | if (add_cpu == -1) { | ||
927 | fprintf(stderr, "ERR: required option --cpu missing\n"); | ||
928 | fprintf(stderr, " Specify multiple --cpu options to add more\n"); | ||
929 | usage(argv, obj); | ||
930 | err = EXIT_FAIL_OPTION; | ||
931 | goto out; | ||
932 | } | ||
933 | |||
934 | value.bpf_prog.fd = 0; | ||
935 | if (!mprog_disable) | ||
936 | value.bpf_prog.fd = load_cpumap_prog(mprog_filename, mprog_name, | ||
937 | redir_interface, redir_map); | ||
938 | if (value.bpf_prog.fd < 0) { | ||
939 | err = value.bpf_prog.fd; | ||
940 | goto out; | ||
941 | } | ||
942 | value.qsize = qsize; | ||
943 | |||
944 | for (i = 0; i < added_cpus; i++) | ||
945 | create_cpu_entry(cpu[i], &value, i, true); | ||
946 | |||
947 | /* Remove XDP program when program is interrupted or killed */ | ||
948 | signal(SIGINT, int_exit); | ||
949 | signal(SIGTERM, int_exit); | ||
950 | |||
951 | prog = bpf_object__find_program_by_title(obj, prog_name); | ||
952 | if (!prog) { | ||
953 | fprintf(stderr, "bpf_object__find_program_by_title failed\n"); | ||
954 | goto out; | ||
955 | } | ||
956 | |||
957 | prog_fd = bpf_program__fd(prog); | ||
958 | if (prog_fd < 0) { | ||
959 | fprintf(stderr, "bpf_program__fd failed\n"); | ||
960 | goto out; | ||
961 | } | ||
962 | |||
963 | if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { | ||
964 | fprintf(stderr, "link set xdp fd failed\n"); | ||
965 | err = EXIT_FAIL_XDP; | ||
966 | goto out; | ||
967 | } | ||
968 | |||
969 | err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); | ||
970 | if (err) { | ||
971 | printf("can't get prog info - %s\n", strerror(errno)); | ||
972 | goto out; | ||
973 | } | ||
974 | prog_id = info.id; | ||
975 | |||
976 | stats_poll(interval, use_separators, prog_name, mprog_name, | ||
977 | &value, stress_mode); | ||
978 | |||
979 | err = EXIT_OK; | ||
980 | out: | ||
981 | free(cpu); | ||
982 | return err; | ||
983 | } | ||
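The create_cpu_entry() helper used above installs a struct bpf_cpumap_val into the CPUMAP, and the qsize note about ixgbe concerns that value. At its core the operation is a single bpf_map_update_elem() call; the sketch below shows only that call, assuming cpumap_fd is the fd of the cpu_map and mprog_fd is either 0 or the fd of a BPF_XDP_CPUMAP program (the function and parameter names are illustrative, not identifiers from the sample).

#include <linux/bpf.h>   /* struct bpf_cpumap_val */
#include <bpf/bpf.h>     /* bpf_map_update_elem() */

/* Enable one remote CPU in a BPF_MAP_TYPE_CPUMAP; the key is the CPU number. */
static int add_cpu_to_cpumap(int cpumap_fd, __u32 cpu, __u32 qsize, int mprog_fd)
{
	struct bpf_cpumap_val val = {
		.qsize = qsize,          /* per-CPU queue size, cf. the ixgbe note above */
		.bpf_prog.fd = mprog_fd, /* optional prog run on the remote CPU, 0 = none */
	};

	return bpf_map_update_elem(cpumap_fd, &cpu, &val, 0);
}

Only the map update itself is sketched here; the sample's helper wraps it with error handling.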
diff --git a/samples/bpf/xdp_redirect_kern.c b/samples/bpf/xdp_redirect_kern.c new file mode 100644 index 000000000..d26ec3aa2 --- /dev/null +++ b/samples/bpf/xdp_redirect_kern.c | |||
@@ -0,0 +1,90 @@ | |||
1 | /* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com> | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | * | ||
7 | * This program is distributed in the hope that it will be useful, but | ||
8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
10 | * General Public License for more details. | ||
11 | */ | ||
12 | #define KBUILD_MODNAME "foo" | ||
13 | #include <uapi/linux/bpf.h> | ||
14 | #include <linux/in.h> | ||
15 | #include <linux/if_ether.h> | ||
16 | #include <linux/if_packet.h> | ||
17 | #include <linux/if_vlan.h> | ||
18 | #include <linux/ip.h> | ||
19 | #include <linux/ipv6.h> | ||
20 | #include <bpf/bpf_helpers.h> | ||
21 | |||
22 | struct { | ||
23 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
24 | __type(key, int); | ||
25 | __type(value, int); | ||
26 | __uint(max_entries, 1); | ||
27 | } tx_port SEC(".maps"); | ||
28 | |||
29 | /* Count RX packets, as XDP bpf_prog doesn't get direct TX-success | ||
30 | * feedback. Redirect TX errors can be caught via a tracepoint. | ||
31 | */ | ||
32 | struct { | ||
33 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
34 | __type(key, u32); | ||
35 | __type(value, long); | ||
36 | __uint(max_entries, 1); | ||
37 | } rxcnt SEC(".maps"); | ||
38 | |||
39 | static void swap_src_dst_mac(void *data) | ||
40 | { | ||
41 | unsigned short *p = data; | ||
42 | unsigned short dst[3]; | ||
43 | |||
44 | dst[0] = p[0]; | ||
45 | dst[1] = p[1]; | ||
46 | dst[2] = p[2]; | ||
47 | p[0] = p[3]; | ||
48 | p[1] = p[4]; | ||
49 | p[2] = p[5]; | ||
50 | p[3] = dst[0]; | ||
51 | p[4] = dst[1]; | ||
52 | p[5] = dst[2]; | ||
53 | } | ||
54 | |||
55 | SEC("xdp_redirect") | ||
56 | int xdp_redirect_prog(struct xdp_md *ctx) | ||
57 | { | ||
58 | void *data_end = (void *)(long)ctx->data_end; | ||
59 | void *data = (void *)(long)ctx->data; | ||
60 | struct ethhdr *eth = data; | ||
61 | int rc = XDP_DROP; | ||
62 | int *ifindex, port = 0; | ||
63 | long *value; | ||
64 | u32 key = 0; | ||
65 | u64 nh_off; | ||
66 | |||
67 | nh_off = sizeof(*eth); | ||
68 | if (data + nh_off > data_end) | ||
69 | return rc; | ||
70 | |||
71 | ifindex = bpf_map_lookup_elem(&tx_port, &port); | ||
72 | if (!ifindex) | ||
73 | return rc; | ||
74 | |||
75 | value = bpf_map_lookup_elem(&rxcnt, &key); | ||
76 | if (value) | ||
77 | *value += 1; | ||
78 | |||
79 | swap_src_dst_mac(data); | ||
80 | return bpf_redirect(*ifindex, 0); | ||
81 | } | ||
82 | |||
83 | /* Redirect requires an XDP bpf_prog loaded on the TX device */ | ||
84 | SEC("xdp_redirect_dummy") | ||
85 | int xdp_redirect_dummy_prog(struct xdp_md *ctx) | ||
86 | { | ||
87 | return XDP_PASS; | ||
88 | } | ||
89 | |||
90 | char _license[] SEC("license") = "GPL"; | ||
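The 16-bit-word shuffle in swap_src_dst_mac() above is simply a swap of the destination and source MAC addresses at the start of the Ethernet header. For clarity, here is the same operation written against struct ethhdr, as an illustrative sketch only (the word-wise version is what the sample ships):

#include <linux/if_ether.h>	/* struct ethhdr, ETH_ALEN */

/* Swap the destination and source MAC addresses in place. */
static void swap_eth_addrs(struct ethhdr *eth)
{
	unsigned char tmp[ETH_ALEN];

	__builtin_memcpy(tmp, eth->h_dest, ETH_ALEN);           /* save current destination */
	__builtin_memcpy(eth->h_dest, eth->h_source, ETH_ALEN); /* dest <- source */
	__builtin_memcpy(eth->h_source, tmp, ETH_ALEN);         /* source <- old dest */
}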
diff --git a/samples/bpf/xdp_redirect_map_kern.c b/samples/bpf/xdp_redirect_map_kern.c new file mode 100644 index 000000000..6489352ab --- /dev/null +++ b/samples/bpf/xdp_redirect_map_kern.c | |||
@@ -0,0 +1,92 @@ | |||
1 | /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | * | ||
7 | * This program is distributed in the hope that it will be useful, but | ||
8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
10 | * General Public License for more details. | ||
11 | */ | ||
12 | #define KBUILD_MODNAME "foo" | ||
13 | #include <uapi/linux/bpf.h> | ||
14 | #include <linux/in.h> | ||
15 | #include <linux/if_ether.h> | ||
16 | #include <linux/if_packet.h> | ||
17 | #include <linux/if_vlan.h> | ||
18 | #include <linux/ip.h> | ||
19 | #include <linux/ipv6.h> | ||
20 | #include <bpf/bpf_helpers.h> | ||
21 | |||
22 | struct { | ||
23 | __uint(type, BPF_MAP_TYPE_DEVMAP); | ||
24 | __uint(key_size, sizeof(int)); | ||
25 | __uint(value_size, sizeof(int)); | ||
26 | __uint(max_entries, 100); | ||
27 | } tx_port SEC(".maps"); | ||
28 | |||
29 | /* Count RX packets, as XDP bpf_prog doesn't get direct TX-success | ||
30 | * feedback. Redirect TX errors can be caught via a tracepoint. | ||
31 | */ | ||
32 | struct { | ||
33 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
34 | __type(key, u32); | ||
35 | __type(value, long); | ||
36 | __uint(max_entries, 1); | ||
37 | } rxcnt SEC(".maps"); | ||
38 | |||
39 | static void swap_src_dst_mac(void *data) | ||
40 | { | ||
41 | unsigned short *p = data; | ||
42 | unsigned short dst[3]; | ||
43 | |||
44 | dst[0] = p[0]; | ||
45 | dst[1] = p[1]; | ||
46 | dst[2] = p[2]; | ||
47 | p[0] = p[3]; | ||
48 | p[1] = p[4]; | ||
49 | p[2] = p[5]; | ||
50 | p[3] = dst[0]; | ||
51 | p[4] = dst[1]; | ||
52 | p[5] = dst[2]; | ||
53 | } | ||
54 | |||
55 | SEC("xdp_redirect_map") | ||
56 | int xdp_redirect_map_prog(struct xdp_md *ctx) | ||
57 | { | ||
58 | void *data_end = (void *)(long)ctx->data_end; | ||
59 | void *data = (void *)(long)ctx->data; | ||
60 | struct ethhdr *eth = data; | ||
61 | int rc = XDP_DROP; | ||
62 | int vport, port = 0, m = 0; | ||
63 | long *value; | ||
64 | u32 key = 0; | ||
65 | u64 nh_off; | ||
66 | |||
67 | nh_off = sizeof(*eth); | ||
68 | if (data + nh_off > data_end) | ||
69 | return rc; | ||
70 | |||
71 | /* constant virtual port */ | ||
72 | vport = 0; | ||
73 | |||
74 | /* count packet in global counter */ | ||
75 | value = bpf_map_lookup_elem(&rxcnt, &key); | ||
76 | if (value) | ||
77 | *value += 1; | ||
78 | |||
79 | swap_src_dst_mac(data); | ||
80 | |||
81 | /* send packet out physical port */ | ||
82 | return bpf_redirect_map(&tx_port, vport, 0); | ||
83 | } | ||
84 | |||
85 | /* Redirect requires an XDP bpf_prog loaded on the TX device */ | ||
86 | SEC("xdp_redirect_dummy") | ||
87 | int xdp_redirect_dummy_prog(struct xdp_md *ctx) | ||
88 | { | ||
89 | return XDP_PASS; | ||
90 | } | ||
91 | |||
92 | char _license[] SEC("license") = "GPL"; | ||
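This variant redirects through a DEVMAP: bpf_redirect_map(&tx_port, vport, 0) forwards to whatever ifindex user space has stored at virtual port 0, so the BPF program never needs to know the egress device itself. A minimal sketch of that user-space side (map_fd and out_ifindex are illustrative names; the loader in xdp_redirect_map_user.c below performs the equivalent update):

#include <bpf/bpf.h>	/* bpf_map_update_elem() */

/* Wire virtual port 0 of the tx_port DEVMAP to a real egress ifindex. */
static int set_tx_port(int map_fd, int out_ifindex)
{
	int vport = 0;	/* must match the constant vport used in the XDP program */

	return bpf_map_update_elem(map_fd, &vport, &out_ifindex, 0);
}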
diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c new file mode 100644 index 000000000..35e16dee6 --- /dev/null +++ b/samples/bpf/xdp_redirect_map_user.c | |||
@@ -0,0 +1,222 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io | ||
3 | */ | ||
4 | #include <linux/bpf.h> | ||
5 | #include <linux/if_link.h> | ||
6 | #include <assert.h> | ||
7 | #include <errno.h> | ||
8 | #include <signal.h> | ||
9 | #include <stdio.h> | ||
10 | #include <stdlib.h> | ||
11 | #include <stdbool.h> | ||
12 | #include <string.h> | ||
13 | #include <net/if.h> | ||
14 | #include <unistd.h> | ||
15 | #include <libgen.h> | ||
16 | #include <sys/resource.h> | ||
17 | |||
18 | #include "bpf_util.h" | ||
19 | #include <bpf/bpf.h> | ||
20 | #include <bpf/libbpf.h> | ||
21 | |||
22 | static int ifindex_in; | ||
23 | static int ifindex_out; | ||
24 | static bool ifindex_out_xdp_dummy_attached = true; | ||
25 | static __u32 prog_id; | ||
26 | static __u32 dummy_prog_id; | ||
27 | |||
28 | static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
29 | static int rxcnt_map_fd; | ||
30 | |||
31 | static void int_exit(int sig) | ||
32 | { | ||
33 | __u32 curr_prog_id = 0; | ||
34 | |||
35 | if (bpf_get_link_xdp_id(ifindex_in, &curr_prog_id, xdp_flags)) { | ||
36 | printf("bpf_get_link_xdp_id failed\n"); | ||
37 | exit(1); | ||
38 | } | ||
39 | if (prog_id == curr_prog_id) | ||
40 | bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags); | ||
41 | else if (!curr_prog_id) | ||
42 | printf("couldn't find a prog id on iface IN\n"); | ||
43 | else | ||
44 | printf("program on iface IN changed, not removing\n"); | ||
45 | |||
46 | if (ifindex_out_xdp_dummy_attached) { | ||
47 | curr_prog_id = 0; | ||
48 | if (bpf_get_link_xdp_id(ifindex_out, &curr_prog_id, | ||
49 | xdp_flags)) { | ||
50 | printf("bpf_get_link_xdp_id failed\n"); | ||
51 | exit(1); | ||
52 | } | ||
53 | if (dummy_prog_id == curr_prog_id) | ||
54 | bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); | ||
55 | else if (!curr_prog_id) | ||
56 | printf("couldn't find a prog id on iface OUT\n"); | ||
57 | else | ||
58 | printf("program on iface OUT changed, not removing\n"); | ||
59 | } | ||
60 | exit(0); | ||
61 | } | ||
62 | |||
63 | static void poll_stats(int interval, int ifindex) | ||
64 | { | ||
65 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
66 | __u64 values[nr_cpus], prev[nr_cpus]; | ||
67 | |||
68 | memset(prev, 0, sizeof(prev)); | ||
69 | |||
70 | while (1) { | ||
71 | __u64 sum = 0; | ||
72 | __u32 key = 0; | ||
73 | int i; | ||
74 | |||
75 | sleep(interval); | ||
76 | assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0); | ||
77 | for (i = 0; i < nr_cpus; i++) | ||
78 | sum += (values[i] - prev[i]); | ||
79 | if (sum) | ||
80 | printf("ifindex %i: %10llu pkt/s\n", | ||
81 | ifindex, sum / interval); | ||
82 | memcpy(prev, values, sizeof(values)); | ||
83 | } | ||
84 | } | ||
85 | |||
86 | static void usage(const char *prog) | ||
87 | { | ||
88 | fprintf(stderr, | ||
89 | "usage: %s [OPTS] <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n\n" | ||
90 | "OPTS:\n" | ||
91 | " -S use skb-mode\n" | ||
92 | " -N enforce native mode\n" | ||
93 | " -F force loading prog\n", | ||
94 | prog); | ||
95 | } | ||
96 | |||
97 | int main(int argc, char **argv) | ||
98 | { | ||
99 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
100 | struct bpf_prog_load_attr prog_load_attr = { | ||
101 | .prog_type = BPF_PROG_TYPE_XDP, | ||
102 | }; | ||
103 | struct bpf_program *prog, *dummy_prog; | ||
104 | struct bpf_prog_info info = {}; | ||
105 | __u32 info_len = sizeof(info); | ||
106 | int prog_fd, dummy_prog_fd; | ||
107 | const char *optstr = "FSN"; | ||
108 | struct bpf_object *obj; | ||
109 | int ret, opt, key = 0; | ||
110 | char filename[256]; | ||
111 | int tx_port_map_fd; | ||
112 | |||
113 | while ((opt = getopt(argc, argv, optstr)) != -1) { | ||
114 | switch (opt) { | ||
115 | case 'S': | ||
116 | xdp_flags |= XDP_FLAGS_SKB_MODE; | ||
117 | break; | ||
118 | case 'N': | ||
119 | /* default, set below */ | ||
120 | break; | ||
121 | case 'F': | ||
122 | xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
123 | break; | ||
124 | default: | ||
125 | usage(basename(argv[0])); | ||
126 | return 1; | ||
127 | } | ||
128 | } | ||
129 | |||
130 | if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) | ||
131 | xdp_flags |= XDP_FLAGS_DRV_MODE; | ||
132 | |||
133 | if (optind == argc) { | ||
134 | printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]); | ||
135 | return 1; | ||
136 | } | ||
137 | |||
138 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
139 | perror("setrlimit(RLIMIT_MEMLOCK)"); | ||
140 | return 1; | ||
141 | } | ||
142 | |||
143 | ifindex_in = if_nametoindex(argv[optind]); | ||
144 | if (!ifindex_in) | ||
145 | ifindex_in = strtoul(argv[optind], NULL, 0); | ||
146 | |||
147 | ifindex_out = if_nametoindex(argv[optind + 1]); | ||
148 | if (!ifindex_out) | ||
149 | ifindex_out = strtoul(argv[optind + 1], NULL, 0); | ||
150 | |||
151 | printf("input: %d output: %d\n", ifindex_in, ifindex_out); | ||
152 | |||
153 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
154 | prog_load_attr.file = filename; | ||
155 | |||
156 | if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) | ||
157 | return 1; | ||
158 | |||
159 | prog = bpf_program__next(NULL, obj); | ||
160 | dummy_prog = bpf_program__next(prog, obj); | ||
161 | if (!prog || !dummy_prog) { | ||
162 | printf("finding a prog in obj file failed\n"); | ||
163 | return 1; | ||
164 | } | ||
165 | /* bpf_prog_load_xattr gives us the fd of the first prog, | ||
166 | * so we're missing only the fd for the dummy prog | ||
167 | */ | ||
168 | dummy_prog_fd = bpf_program__fd(dummy_prog); | ||
169 | if (prog_fd < 0 || dummy_prog_fd < 0) { | ||
170 | printf("bpf_prog_load_xattr: %s\n", strerror(errno)); | ||
171 | return 1; | ||
172 | } | ||
173 | |||
174 | tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port"); | ||
175 | rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); | ||
176 | if (tx_port_map_fd < 0 || rxcnt_map_fd < 0) { | ||
177 | printf("bpf_object__find_map_fd_by_name failed\n"); | ||
178 | return 1; | ||
179 | } | ||
180 | |||
181 | if (bpf_set_link_xdp_fd(ifindex_in, prog_fd, xdp_flags) < 0) { | ||
182 | printf("ERROR: link set xdp fd failed on %d\n", ifindex_in); | ||
183 | return 1; | ||
184 | } | ||
185 | |||
186 | ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); | ||
187 | if (ret) { | ||
188 | printf("can't get prog info - %s\n", strerror(errno)); | ||
189 | return ret; | ||
190 | } | ||
191 | prog_id = info.id; | ||
192 | |||
193 | /* Loading dummy XDP prog on out-device */ | ||
194 | if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd, | ||
195 | (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) { | ||
196 | printf("WARN: link set xdp fd failed on %d\n", ifindex_out); | ||
197 | ifindex_out_xdp_dummy_attached = false; | ||
198 | } | ||
199 | |||
200 | memset(&info, 0, sizeof(info)); | ||
201 | ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len); | ||
202 | if (ret) { | ||
203 | printf("can't get prog info - %s\n", strerror(errno)); | ||
204 | return ret; | ||
205 | } | ||
206 | dummy_prog_id = info.id; | ||
207 | |||
208 | signal(SIGINT, int_exit); | ||
209 | signal(SIGTERM, int_exit); | ||
210 | |||
211 | /* populate virtual to physical port map */ | ||
212 | ret = bpf_map_update_elem(tx_port_map_fd, &key, &ifindex_out, 0); | ||
213 | if (ret) { | ||
214 | perror("bpf_update_elem"); | ||
215 | goto out; | ||
216 | } | ||
217 | |||
218 | poll_stats(2, ifindex_out); | ||
219 | |||
220 | out: | ||
221 | return ret; | ||
222 | } | ||
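poll_stats() above relies on the semantics of BPF_MAP_TYPE_PERCPU_ARRAY: a single bpf_map_lookup_elem() fills one value slot per possible CPU, which the caller then sums. The same pattern as a standalone helper, sketched with libbpf's libbpf_num_possible_cpus() in place of the bpf_num_possible_cpus() helper the sample uses (illustrative only, and assuming the CPU count comes back positive):

#include <bpf/bpf.h>
#include <bpf/libbpf.h>		/* libbpf_num_possible_cpus() */

/* Sum a single-slot per-CPU counter; returns 0 if the lookup fails. */
static unsigned long long sum_percpu_counter(int map_fd)
{
	int i, nr_cpus = libbpf_num_possible_cpus();	/* assumed > 0 here */
	unsigned long long values[nr_cpus], sum = 0;
	__u32 key = 0;

	if (bpf_map_lookup_elem(map_fd, &key, values))
		return 0;
	for (i = 0; i < nr_cpus; i++)
		sum += values[i];
	return sum;
}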
diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c new file mode 100644 index 000000000..3c92adc2a --- /dev/null +++ b/samples/bpf/xdp_redirect_user.c | |||
@@ -0,0 +1,223 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2016 John Fastabend <john.r.fastabend@intel.com> | ||
3 | */ | ||
4 | #include <linux/bpf.h> | ||
5 | #include <linux/if_link.h> | ||
6 | #include <assert.h> | ||
7 | #include <errno.h> | ||
8 | #include <signal.h> | ||
9 | #include <stdio.h> | ||
10 | #include <stdlib.h> | ||
11 | #include <stdbool.h> | ||
12 | #include <string.h> | ||
13 | #include <net/if.h> | ||
14 | #include <unistd.h> | ||
15 | #include <libgen.h> | ||
16 | #include <sys/resource.h> | ||
17 | |||
18 | #include "bpf_util.h" | ||
19 | #include <bpf/bpf.h> | ||
20 | #include <bpf/libbpf.h> | ||
21 | |||
22 | static int ifindex_in; | ||
23 | static int ifindex_out; | ||
24 | static bool ifindex_out_xdp_dummy_attached = true; | ||
25 | static __u32 prog_id; | ||
26 | static __u32 dummy_prog_id; | ||
27 | |||
28 | static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
29 | static int rxcnt_map_fd; | ||
30 | |||
31 | static void int_exit(int sig) | ||
32 | { | ||
33 | __u32 curr_prog_id = 0; | ||
34 | |||
35 | if (bpf_get_link_xdp_id(ifindex_in, &curr_prog_id, xdp_flags)) { | ||
36 | printf("bpf_get_link_xdp_id failed\n"); | ||
37 | exit(1); | ||
38 | } | ||
39 | if (prog_id == curr_prog_id) | ||
40 | bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags); | ||
41 | else if (!curr_prog_id) | ||
42 | printf("couldn't find a prog id on iface IN\n"); | ||
43 | else | ||
44 | printf("program on iface IN changed, not removing\n"); | ||
45 | |||
46 | if (ifindex_out_xdp_dummy_attached) { | ||
47 | curr_prog_id = 0; | ||
48 | if (bpf_get_link_xdp_id(ifindex_out, &curr_prog_id, | ||
49 | xdp_flags)) { | ||
50 | printf("bpf_get_link_xdp_id failed\n"); | ||
51 | exit(1); | ||
52 | } | ||
53 | if (dummy_prog_id == curr_prog_id) | ||
54 | bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); | ||
55 | else if (!curr_prog_id) | ||
56 | printf("couldn't find a prog id on iface OUT\n"); | ||
57 | else | ||
58 | printf("program on iface OUT changed, not removing\n"); | ||
59 | } | ||
60 | exit(0); | ||
61 | } | ||
62 | |||
63 | static void poll_stats(int interval, int ifindex) | ||
64 | { | ||
65 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
66 | __u64 values[nr_cpus], prev[nr_cpus]; | ||
67 | |||
68 | memset(prev, 0, sizeof(prev)); | ||
69 | |||
70 | while (1) { | ||
71 | __u64 sum = 0; | ||
72 | __u32 key = 0; | ||
73 | int i; | ||
74 | |||
75 | sleep(interval); | ||
76 | assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0); | ||
77 | for (i = 0; i < nr_cpus; i++) | ||
78 | sum += (values[i] - prev[i]); | ||
79 | if (sum) | ||
80 | printf("ifindex %i: %10llu pkt/s\n", | ||
81 | ifindex, sum / interval); | ||
82 | memcpy(prev, values, sizeof(values)); | ||
83 | } | ||
84 | } | ||
85 | |||
86 | static void usage(const char *prog) | ||
87 | { | ||
88 | fprintf(stderr, | ||
89 | "usage: %s [OPTS] <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n\n" | ||
90 | "OPTS:\n" | ||
91 | " -S use skb-mode\n" | ||
92 | " -N enforce native mode\n" | ||
93 | " -F force loading prog\n", | ||
94 | prog); | ||
95 | } | ||
96 | |||
97 | |||
98 | int main(int argc, char **argv) | ||
99 | { | ||
100 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
101 | struct bpf_prog_load_attr prog_load_attr = { | ||
102 | .prog_type = BPF_PROG_TYPE_XDP, | ||
103 | }; | ||
104 | struct bpf_program *prog, *dummy_prog; | ||
105 | int prog_fd, tx_port_map_fd, opt; | ||
106 | struct bpf_prog_info info = {}; | ||
107 | __u32 info_len = sizeof(info); | ||
108 | const char *optstr = "FSN"; | ||
109 | struct bpf_object *obj; | ||
110 | char filename[256]; | ||
111 | int dummy_prog_fd; | ||
112 | int ret, key = 0; | ||
113 | |||
114 | while ((opt = getopt(argc, argv, optstr)) != -1) { | ||
115 | switch (opt) { | ||
116 | case 'S': | ||
117 | xdp_flags |= XDP_FLAGS_SKB_MODE; | ||
118 | break; | ||
119 | case 'N': | ||
120 | /* default, set below */ | ||
121 | break; | ||
122 | case 'F': | ||
123 | xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
124 | break; | ||
125 | default: | ||
126 | usage(basename(argv[0])); | ||
127 | return 1; | ||
128 | } | ||
129 | } | ||
130 | |||
131 | if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) | ||
132 | xdp_flags |= XDP_FLAGS_DRV_MODE; | ||
133 | |||
134 | if (optind + 2 != argc) { | ||
135 | printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]); | ||
136 | return 1; | ||
137 | } | ||
138 | |||
139 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
140 | perror("setrlimit(RLIMIT_MEMLOCK)"); | ||
141 | return 1; | ||
142 | } | ||
143 | |||
144 | ifindex_in = if_nametoindex(argv[optind]); | ||
145 | if (!ifindex_in) | ||
146 | ifindex_in = strtoul(argv[optind], NULL, 0); | ||
147 | |||
148 | ifindex_out = if_nametoindex(argv[optind + 1]); | ||
149 | if (!ifindex_out) | ||
150 | ifindex_out = strtoul(argv[optind + 1], NULL, 0); | ||
151 | |||
152 | printf("input: %d output: %d\n", ifindex_in, ifindex_out); | ||
153 | |||
154 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
155 | prog_load_attr.file = filename; | ||
156 | |||
157 | if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) | ||
158 | return 1; | ||
159 | |||
160 | prog = bpf_program__next(NULL, obj); | ||
161 | dummy_prog = bpf_program__next(prog, obj); | ||
162 | if (!prog || !dummy_prog) { | ||
163 | printf("finding a prog in obj file failed\n"); | ||
164 | return 1; | ||
165 | } | ||
166 | /* bpf_prog_load_xattr gives us the fd of the first prog, | ||
167 | * so we're missing only the fd for the dummy prog | ||
168 | */ | ||
169 | dummy_prog_fd = bpf_program__fd(dummy_prog); | ||
170 | if (prog_fd < 0 || dummy_prog_fd < 0) { | ||
171 | printf("bpf_prog_load_xattr: %s\n", strerror(errno)); | ||
172 | return 1; | ||
173 | } | ||
174 | |||
175 | tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port"); | ||
176 | rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); | ||
177 | if (tx_port_map_fd < 0 || rxcnt_map_fd < 0) { | ||
178 | printf("bpf_object__find_map_fd_by_name failed\n"); | ||
179 | return 1; | ||
180 | } | ||
181 | |||
182 | if (bpf_set_link_xdp_fd(ifindex_in, prog_fd, xdp_flags) < 0) { | ||
183 | printf("ERROR: link set xdp fd failed on %d\n", ifindex_in); | ||
184 | return 1; | ||
185 | } | ||
186 | |||
187 | ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); | ||
188 | if (ret) { | ||
189 | printf("can't get prog info - %s\n", strerror(errno)); | ||
190 | return ret; | ||
191 | } | ||
192 | prog_id = info.id; | ||
193 | |||
194 | /* Loading dummy XDP prog on out-device */ | ||
195 | if (bpf_set_link_xdp_fd(ifindex_out, dummy_prog_fd, | ||
196 | (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) { | ||
197 | printf("WARN: link set xdp fd failed on %d\n", ifindex_out); | ||
198 | ifindex_out_xdp_dummy_attached = false; | ||
199 | } | ||
200 | |||
201 | memset(&info, 0, sizeof(info)); | ||
202 | ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len); | ||
203 | if (ret) { | ||
204 | printf("can't get prog info - %s\n", strerror(errno)); | ||
205 | return ret; | ||
206 | } | ||
207 | dummy_prog_id = info.id; | ||
208 | |||
209 | signal(SIGINT, int_exit); | ||
210 | signal(SIGTERM, int_exit); | ||
211 | |||
212 | /* store the egress ifindex used by the bpf_redirect() call */ | ||
213 | ret = bpf_map_update_elem(tx_port_map_fd, &key, &ifindex_out, 0); | ||
214 | if (ret) { | ||
215 | perror("bpf_update_elem"); | ||
216 | goto out; | ||
217 | } | ||
218 | |||
219 | poll_stats(2, ifindex_out); | ||
220 | |||
221 | out: | ||
222 | return ret; | ||
223 | } | ||
diff --git a/samples/bpf/xdp_router_ipv4_kern.c b/samples/bpf/xdp_router_ipv4_kern.c new file mode 100644 index 000000000..b37ca2b13 --- /dev/null +++ b/samples/bpf/xdp_router_ipv4_kern.c | |||
@@ -0,0 +1,186 @@ | |||
1 | /* Copyright (C) 2017 Cavium, Inc. | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or modify it | ||
4 | * under the terms of version 2 of the GNU General Public License | ||
5 | * as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #define KBUILD_MODNAME "foo" | ||
8 | #include <uapi/linux/bpf.h> | ||
9 | #include <linux/in.h> | ||
10 | #include <linux/if_ether.h> | ||
11 | #include <linux/if_packet.h> | ||
12 | #include <linux/if_vlan.h> | ||
13 | #include <linux/ip.h> | ||
14 | #include <linux/ipv6.h> | ||
15 | #include <bpf/bpf_helpers.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include <net/ip_fib.h> | ||
18 | |||
19 | struct trie_value { | ||
20 | __u8 prefix[4]; | ||
21 | __be64 value; | ||
22 | int ifindex; | ||
23 | int metric; | ||
24 | __be32 gw; | ||
25 | }; | ||
26 | |||
27 | /* Key for lpm_trie*/ | ||
28 | union key_4 { | ||
29 | u32 b32[2]; | ||
30 | u8 b8[8]; | ||
31 | }; | ||
32 | |||
33 | struct arp_entry { | ||
34 | __be64 mac; | ||
35 | __be32 dst; | ||
36 | }; | ||
37 | |||
38 | struct direct_map { | ||
39 | struct arp_entry arp; | ||
40 | int ifindex; | ||
41 | __be64 mac; | ||
42 | }; | ||
43 | |||
44 | /* Map for trie implementation*/ | ||
45 | struct { | ||
46 | __uint(type, BPF_MAP_TYPE_LPM_TRIE); | ||
47 | __uint(key_size, 8); | ||
48 | __uint(value_size, sizeof(struct trie_value)); | ||
49 | __uint(max_entries, 50); | ||
50 | __uint(map_flags, BPF_F_NO_PREALLOC); | ||
51 | } lpm_map SEC(".maps"); | ||
52 | |||
53 | /* Map for counter*/ | ||
54 | struct { | ||
55 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
56 | __type(key, u32); | ||
57 | __type(value, u64); | ||
58 | __uint(max_entries, 256); | ||
59 | } rxcnt SEC(".maps"); | ||
60 | |||
61 | /* Map for ARP table*/ | ||
62 | struct { | ||
63 | __uint(type, BPF_MAP_TYPE_HASH); | ||
64 | __type(key, __be32); | ||
65 | __type(value, __be64); | ||
66 | __uint(max_entries, 50); | ||
67 | } arp_table SEC(".maps"); | ||
68 | |||
69 | /* Map to keep the exact match entries in the route table*/ | ||
70 | struct { | ||
71 | __uint(type, BPF_MAP_TYPE_HASH); | ||
72 | __type(key, __be32); | ||
73 | __type(value, struct direct_map); | ||
74 | __uint(max_entries, 50); | ||
75 | } exact_match SEC(".maps"); | ||
76 | |||
77 | struct { | ||
78 | __uint(type, BPF_MAP_TYPE_DEVMAP); | ||
79 | __uint(key_size, sizeof(int)); | ||
80 | __uint(value_size, sizeof(int)); | ||
81 | __uint(max_entries, 100); | ||
82 | } tx_port SEC(".maps"); | ||
83 | |||
84 | /* Function to set source and destination mac of the packet */ | ||
85 | static inline void set_src_dst_mac(void *data, void *src, void *dst) | ||
86 | { | ||
87 | unsigned short *source = src; | ||
88 | unsigned short *dest = dst; | ||
89 | unsigned short *p = data; | ||
90 | |||
91 | __builtin_memcpy(p, dest, 6); | ||
92 | __builtin_memcpy(p + 3, source, 6); | ||
93 | } | ||
94 | |||
95 | /* Parse IPV4 packet to get SRC, DST IP and protocol */ | ||
96 | static inline int parse_ipv4(void *data, u64 nh_off, void *data_end, | ||
97 | __be32 *src, __be32 *dest) | ||
98 | { | ||
99 | struct iphdr *iph = data + nh_off; | ||
100 | |||
101 | if (iph + 1 > data_end) | ||
102 | return 0; | ||
103 | *src = iph->saddr; | ||
104 | *dest = iph->daddr; | ||
105 | return iph->protocol; | ||
106 | } | ||
107 | |||
108 | SEC("xdp_router_ipv4") | ||
109 | int xdp_router_ipv4_prog(struct xdp_md *ctx) | ||
110 | { | ||
111 | void *data_end = (void *)(long)ctx->data_end; | ||
112 | __be64 *dest_mac = NULL, *src_mac = NULL; | ||
113 | void *data = (void *)(long)ctx->data; | ||
114 | struct trie_value *prefix_value; | ||
115 | int rc = XDP_DROP, forward_to; | ||
116 | struct ethhdr *eth = data; | ||
117 | union key_4 key4; | ||
118 | long *value; | ||
119 | u16 h_proto; | ||
120 | u32 ipproto; | ||
121 | u64 nh_off; | ||
122 | |||
123 | nh_off = sizeof(*eth); | ||
124 | if (data + nh_off > data_end) | ||
125 | return rc; | ||
126 | |||
127 | h_proto = eth->h_proto; | ||
128 | |||
129 | if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { | ||
130 | struct vlan_hdr *vhdr; | ||
131 | |||
132 | vhdr = data + nh_off; | ||
133 | nh_off += sizeof(struct vlan_hdr); | ||
134 | if (data + nh_off > data_end) | ||
135 | return rc; | ||
136 | h_proto = vhdr->h_vlan_encapsulated_proto; | ||
137 | } | ||
138 | if (h_proto == htons(ETH_P_ARP)) { | ||
139 | return XDP_PASS; | ||
140 | } else if (h_proto == htons(ETH_P_IP)) { | ||
141 | struct direct_map *direct_entry; | ||
142 | __be32 src_ip = 0, dest_ip = 0; | ||
143 | |||
144 | ipproto = parse_ipv4(data, nh_off, data_end, &src_ip, &dest_ip); | ||
145 | direct_entry = bpf_map_lookup_elem(&exact_match, &dest_ip); | ||
146 | /* Check for an exact match first; this gives a faster lookup */ | ||
147 | if (direct_entry && direct_entry->mac && direct_entry->arp.mac) { | ||
148 | src_mac = &direct_entry->mac; | ||
149 | dest_mac = &direct_entry->arp.mac; | ||
150 | forward_to = direct_entry->ifindex; | ||
151 | } else { | ||
152 | /* Otherwise do an LPM lookup in the trie */ | ||
153 | key4.b32[0] = 32; | ||
154 | key4.b8[4] = dest_ip & 0xff; | ||
155 | key4.b8[5] = (dest_ip >> 8) & 0xff; | ||
156 | key4.b8[6] = (dest_ip >> 16) & 0xff; | ||
157 | key4.b8[7] = (dest_ip >> 24) & 0xff; | ||
158 | prefix_value = bpf_map_lookup_elem(&lpm_map, &key4); | ||
159 | if (!prefix_value) | ||
160 | return XDP_DROP; | ||
161 | src_mac = &prefix_value->value; | ||
162 | if (!src_mac) | ||
163 | return XDP_DROP; | ||
164 | dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip); | ||
165 | if (!dest_mac) { | ||
166 | if (!prefix_value->gw) | ||
167 | return XDP_DROP; | ||
168 | dest_ip = prefix_value->gw; | ||
169 | dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip); | ||
170 | } | ||
171 | forward_to = prefix_value->ifindex; | ||
172 | } | ||
173 | } else { | ||
174 | ipproto = 0; | ||
175 | } | ||
176 | if (src_mac && dest_mac) { | ||
177 | set_src_dst_mac(data, src_mac, dest_mac); | ||
178 | value = bpf_map_lookup_elem(&rxcnt, &ipproto); | ||
179 | if (value) | ||
180 | *value += 1; | ||
181 | return bpf_redirect_map(&tx_port, forward_to, 0); | ||
182 | } | ||
183 | return rc; | ||
184 | } | ||
185 | |||
186 | char _license[] SEC("license") = "GPL"; | ||
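The key4 built above follows the generic LPM-trie key layout: a 32-bit prefix length followed by the address bytes, here a full /32 of the destination IP. A user-space sketch of the same key using the UAPI struct bpf_lpm_trie_key, mirroring the byte-wise extraction the programs use (names are illustrative; note the struct ends in a flexible array, so the caller must allocate sizeof(*key) + 4 bytes):

#include <linux/bpf.h>	/* struct bpf_lpm_trie_key */

/* Fill a /32 IPv4 key for an LPM trie lookup or update. */
static void fill_lpm_key_v4(struct bpf_lpm_trie_key *key, __u32 daddr)
{
	int i;

	key->prefixlen = 32;				/* match all 32 address bits */
	for (i = 0; i < 4; i++)				/* same extraction as the programs */
		key->data[i] = (daddr >> (i * 8)) & 0xff;
}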
diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c new file mode 100644 index 000000000..c2da1b51f --- /dev/null +++ b/samples/bpf/xdp_router_ipv4_user.c | |||
@@ -0,0 +1,741 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (C) 2017 Cavium, Inc. | ||
3 | */ | ||
4 | #include <linux/bpf.h> | ||
5 | #include <linux/netlink.h> | ||
6 | #include <linux/rtnetlink.h> | ||
7 | #include <assert.h> | ||
8 | #include <errno.h> | ||
9 | #include <signal.h> | ||
10 | #include <stdio.h> | ||
11 | #include <stdlib.h> | ||
12 | #include <string.h> | ||
13 | #include <sys/socket.h> | ||
14 | #include <unistd.h> | ||
15 | #include <bpf/bpf.h> | ||
16 | #include <arpa/inet.h> | ||
17 | #include <fcntl.h> | ||
18 | #include <poll.h> | ||
19 | #include <net/if.h> | ||
20 | #include <netdb.h> | ||
21 | #include <sys/ioctl.h> | ||
22 | #include <sys/syscall.h> | ||
23 | #include "bpf_util.h" | ||
24 | #include <bpf/libbpf.h> | ||
25 | #include <sys/resource.h> | ||
26 | #include <libgen.h> | ||
27 | |||
28 | int sock, sock_arp, flags = XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
29 | static int total_ifindex; | ||
30 | static int *ifindex_list; | ||
31 | static __u32 *prog_id_list; | ||
32 | char buf[8192]; | ||
33 | static int lpm_map_fd; | ||
34 | static int rxcnt_map_fd; | ||
35 | static int arp_table_map_fd; | ||
36 | static int exact_match_map_fd; | ||
37 | static int tx_port_map_fd; | ||
38 | |||
39 | static int get_route_table(int rtm_family); | ||
40 | static void int_exit(int sig) | ||
41 | { | ||
42 | __u32 prog_id = 0; | ||
43 | int i = 0; | ||
44 | |||
45 | for (i = 0; i < total_ifindex; i++) { | ||
46 | if (bpf_get_link_xdp_id(ifindex_list[i], &prog_id, flags)) { | ||
47 | printf("bpf_get_link_xdp_id on iface %d failed\n", | ||
48 | ifindex_list[i]); | ||
49 | exit(1); | ||
50 | } | ||
51 | if (prog_id_list[i] == prog_id) | ||
52 | bpf_set_link_xdp_fd(ifindex_list[i], -1, flags); | ||
53 | else if (!prog_id) | ||
54 | printf("couldn't find a prog id on iface %d\n", | ||
55 | ifindex_list[i]); | ||
56 | else | ||
57 | printf("program on iface %d changed, not removing\n", | ||
58 | ifindex_list[i]); | ||
59 | prog_id = 0; | ||
60 | } | ||
61 | exit(0); | ||
62 | } | ||
63 | |||
64 | static void close_and_exit(int sig) | ||
65 | { | ||
66 | close(sock); | ||
67 | close(sock_arp); | ||
68 | |||
69 | int_exit(0); | ||
70 | } | ||
71 | |||
72 | /* Get the MAC address of an interface, given its name */ | ||
73 | static __be64 getmac(char *iface) | ||
74 | { | ||
75 | struct ifreq ifr; | ||
76 | __be64 mac = 0; | ||
77 | int fd, i; | ||
78 | |||
79 | fd = socket(AF_INET, SOCK_DGRAM, 0); | ||
80 | ifr.ifr_addr.sa_family = AF_INET; | ||
81 | strncpy(ifr.ifr_name, iface, IFNAMSIZ - 1); | ||
82 | if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) { | ||
83 | printf("ioctl failed leaving....\n"); | ||
84 | return -1; | ||
85 | } | ||
86 | for (i = 0; i < 6 ; i++) | ||
87 | *((__u8 *)&mac + i) = (__u8)ifr.ifr_hwaddr.sa_data[i]; | ||
88 | close(fd); | ||
89 | return mac; | ||
90 | } | ||
91 | |||
92 | static int recv_msg(struct sockaddr_nl sock_addr, int sock) | ||
93 | { | ||
94 | struct nlmsghdr *nh; | ||
95 | int len, nll = 0; | ||
96 | char *buf_ptr; | ||
97 | |||
98 | buf_ptr = buf; | ||
99 | while (1) { | ||
100 | len = recv(sock, buf_ptr, sizeof(buf) - nll, 0); | ||
101 | if (len < 0) | ||
102 | return len; | ||
103 | |||
104 | nh = (struct nlmsghdr *)buf_ptr; | ||
105 | |||
106 | if (nh->nlmsg_type == NLMSG_DONE) | ||
107 | break; | ||
108 | buf_ptr += len; | ||
109 | nll += len; | ||
110 | if ((sock_addr.nl_groups & RTMGRP_NEIGH) == RTMGRP_NEIGH) | ||
111 | break; | ||
112 | |||
113 | if ((sock_addr.nl_groups & RTMGRP_IPV4_ROUTE) == RTMGRP_IPV4_ROUTE) | ||
114 | break; | ||
115 | } | ||
116 | return nll; | ||
117 | } | ||
118 | |||
119 | /* Function to parse the route entry returned by netlink | ||
120 | * Updates the route entry related map entries | ||
121 | */ | ||
122 | static void read_route(struct nlmsghdr *nh, int nll) | ||
123 | { | ||
124 | char dsts[24], gws[24], ifs[16], dsts_len[24], metrics[24]; | ||
125 | struct bpf_lpm_trie_key *prefix_key; | ||
126 | struct rtattr *rt_attr; | ||
127 | struct rtmsg *rt_msg; | ||
128 | int rtm_family; | ||
129 | int rtl; | ||
130 | int i; | ||
131 | struct route_table { | ||
132 | int dst_len, iface, metric; | ||
133 | char *iface_name; | ||
134 | __be32 dst, gw; | ||
135 | __be64 mac; | ||
136 | } route; | ||
137 | struct arp_table { | ||
138 | __be64 mac; | ||
139 | __be32 dst; | ||
140 | }; | ||
141 | |||
142 | struct direct_map { | ||
143 | struct arp_table arp; | ||
144 | int ifindex; | ||
145 | __be64 mac; | ||
146 | } direct_entry; | ||
147 | |||
148 | if (nh->nlmsg_type == RTM_DELROUTE) | ||
149 | printf("DELETING Route entry\n"); | ||
150 | else if (nh->nlmsg_type == RTM_GETROUTE) | ||
151 | printf("READING Route entry\n"); | ||
152 | else if (nh->nlmsg_type == RTM_NEWROUTE) | ||
153 | printf("NEW Route entry\n"); | ||
154 | else | ||
155 | printf("%d\n", nh->nlmsg_type); | ||
156 | |||
157 | memset(&route, 0, sizeof(route)); | ||
158 | printf("Destination\t\tGateway\t\tGenmask\t\tMetric\t\tIface\n"); | ||
159 | for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { | ||
160 | rt_msg = (struct rtmsg *)NLMSG_DATA(nh); | ||
161 | rtm_family = rt_msg->rtm_family; | ||
162 | if (rtm_family == AF_INET) | ||
163 | if (rt_msg->rtm_table != RT_TABLE_MAIN) | ||
164 | continue; | ||
165 | rt_attr = (struct rtattr *)RTM_RTA(rt_msg); | ||
166 | rtl = RTM_PAYLOAD(nh); | ||
167 | |||
168 | for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) { | ||
169 | switch (rt_attr->rta_type) { | ||
170 | case NDA_DST: | ||
171 | sprintf(dsts, "%u", | ||
172 | (*((__be32 *)RTA_DATA(rt_attr)))); | ||
173 | break; | ||
174 | case RTA_GATEWAY: | ||
175 | sprintf(gws, "%u", | ||
176 | *((__be32 *)RTA_DATA(rt_attr))); | ||
177 | break; | ||
178 | case RTA_OIF: | ||
179 | sprintf(ifs, "%u", | ||
180 | *((int *)RTA_DATA(rt_attr))); | ||
181 | break; | ||
182 | case RTA_METRICS: | ||
183 | sprintf(metrics, "%u", | ||
184 | *((int *)RTA_DATA(rt_attr))); | ||
185 | default: | ||
186 | break; | ||
187 | } | ||
188 | } | ||
189 | sprintf(dsts_len, "%d", rt_msg->rtm_dst_len); | ||
190 | route.dst = atoi(dsts); | ||
191 | route.dst_len = atoi(dsts_len); | ||
192 | route.gw = atoi(gws); | ||
193 | route.iface = atoi(ifs); | ||
194 | route.metric = atoi(metrics); | ||
195 | route.iface_name = alloca(sizeof(char *) * IFNAMSIZ); | ||
196 | route.iface_name = if_indextoname(route.iface, route.iface_name); | ||
197 | route.mac = getmac(route.iface_name); | ||
198 | if (route.mac == -1) | ||
199 | int_exit(0); | ||
200 | assert(bpf_map_update_elem(tx_port_map_fd, | ||
201 | &route.iface, &route.iface, 0) == 0); | ||
202 | if (rtm_family == AF_INET) { | ||
203 | struct trie_value { | ||
204 | __u8 prefix[4]; | ||
205 | __be64 value; | ||
206 | int ifindex; | ||
207 | int metric; | ||
208 | __be32 gw; | ||
209 | } *prefix_value; | ||
210 | |||
211 | prefix_key = alloca(sizeof(*prefix_key) + 3); | ||
212 | prefix_value = alloca(sizeof(*prefix_value)); | ||
213 | |||
214 | prefix_key->prefixlen = 32; | ||
215 | prefix_key->prefixlen = route.dst_len; | ||
216 | direct_entry.mac = route.mac & 0xffffffffffff; | ||
217 | direct_entry.ifindex = route.iface; | ||
218 | direct_entry.arp.mac = 0; | ||
219 | direct_entry.arp.dst = 0; | ||
220 | if (route.dst_len == 32) { | ||
221 | if (nh->nlmsg_type == RTM_DELROUTE) { | ||
222 | assert(bpf_map_delete_elem(exact_match_map_fd, | ||
223 | &route.dst) == 0); | ||
224 | } else { | ||
225 | if (bpf_map_lookup_elem(arp_table_map_fd, | ||
226 | &route.dst, | ||
227 | &direct_entry.arp.mac) == 0) | ||
228 | direct_entry.arp.dst = route.dst; | ||
229 | assert(bpf_map_update_elem(exact_match_map_fd, | ||
230 | &route.dst, | ||
231 | &direct_entry, 0) == 0); | ||
232 | } | ||
233 | } | ||
234 | for (i = 0; i < 4; i++) | ||
235 | prefix_key->data[i] = (route.dst >> i * 8) & 0xff; | ||
236 | |||
237 | printf("%3d.%d.%d.%d\t\t%3x\t\t%d\t\t%d\t\t%s\n", | ||
238 | (int)prefix_key->data[0], | ||
239 | (int)prefix_key->data[1], | ||
240 | (int)prefix_key->data[2], | ||
241 | (int)prefix_key->data[3], | ||
242 | route.gw, route.dst_len, | ||
243 | route.metric, | ||
244 | route.iface_name); | ||
245 | if (bpf_map_lookup_elem(lpm_map_fd, prefix_key, | ||
246 | prefix_value) < 0) { | ||
247 | for (i = 0; i < 4; i++) | ||
248 | prefix_value->prefix[i] = prefix_key->data[i]; | ||
249 | prefix_value->value = route.mac & 0xffffffffffff; | ||
250 | prefix_value->ifindex = route.iface; | ||
251 | prefix_value->gw = route.gw; | ||
252 | prefix_value->metric = route.metric; | ||
253 | |||
254 | assert(bpf_map_update_elem(lpm_map_fd, | ||
255 | prefix_key, | ||
256 | prefix_value, 0 | ||
257 | ) == 0); | ||
258 | } else { | ||
259 | if (nh->nlmsg_type == RTM_DELROUTE) { | ||
260 | printf("deleting entry\n"); | ||
261 | printf("prefix key=%d.%d.%d.%d/%d", | ||
262 | prefix_key->data[0], | ||
263 | prefix_key->data[1], | ||
264 | prefix_key->data[2], | ||
265 | prefix_key->data[3], | ||
266 | prefix_key->prefixlen); | ||
267 | assert(bpf_map_delete_elem(lpm_map_fd, | ||
268 | prefix_key | ||
269 | ) == 0); | ||
270 | /* Rereading the route table to check if | ||
271 | * there is an entry with the same | ||
272 | * prefix but a different metric than the | ||
273 | * deleted entry. | ||
274 | */ | ||
275 | get_route_table(AF_INET); | ||
276 | } else if (prefix_key->data[0] == | ||
277 | prefix_value->prefix[0] && | ||
278 | prefix_key->data[1] == | ||
279 | prefix_value->prefix[1] && | ||
280 | prefix_key->data[2] == | ||
281 | prefix_value->prefix[2] && | ||
282 | prefix_key->data[3] == | ||
283 | prefix_value->prefix[3] && | ||
284 | route.metric >= prefix_value->metric) { | ||
285 | continue; | ||
286 | } else { | ||
287 | for (i = 0; i < 4; i++) | ||
288 | prefix_value->prefix[i] = | ||
289 | prefix_key->data[i]; | ||
290 | prefix_value->value = | ||
291 | route.mac & 0xffffffffffff; | ||
292 | prefix_value->ifindex = route.iface; | ||
293 | prefix_value->gw = route.gw; | ||
294 | prefix_value->metric = route.metric; | ||
295 | assert(bpf_map_update_elem(lpm_map_fd, | ||
296 | prefix_key, | ||
297 | prefix_value, | ||
298 | 0) == 0); | ||
299 | } | ||
300 | } | ||
301 | } | ||
302 | memset(&route, 0, sizeof(route)); | ||
303 | memset(dsts, 0, sizeof(dsts)); | ||
304 | memset(dsts_len, 0, sizeof(dsts_len)); | ||
305 | memset(gws, 0, sizeof(gws)); | ||
306 | memset(ifs, 0, sizeof(ifs)); | ||
307 | memset(&route, 0, sizeof(route)); | ||
308 | } | ||
309 | } | ||
310 | |||
311 | /* Function to read the existing route table when the process is launched*/ | ||
312 | static int get_route_table(int rtm_family) | ||
313 | { | ||
314 | struct sockaddr_nl sa; | ||
315 | struct nlmsghdr *nh; | ||
316 | int sock, seq = 0; | ||
317 | struct msghdr msg; | ||
318 | struct iovec iov; | ||
319 | int ret = 0; | ||
320 | int nll; | ||
321 | |||
322 | struct { | ||
323 | struct nlmsghdr nl; | ||
324 | struct rtmsg rt; | ||
325 | char buf[8192]; | ||
326 | } req; | ||
327 | |||
328 | sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); | ||
329 | if (sock < 0) { | ||
330 | printf("open netlink socket: %s\n", strerror(errno)); | ||
331 | return -1; | ||
332 | } | ||
333 | memset(&sa, 0, sizeof(sa)); | ||
334 | sa.nl_family = AF_NETLINK; | ||
335 | if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { | ||
336 | printf("bind to netlink: %s\n", strerror(errno)); | ||
337 | ret = -1; | ||
338 | goto cleanup; | ||
339 | } | ||
340 | memset(&req, 0, sizeof(req)); | ||
341 | req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); | ||
342 | req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; | ||
343 | req.nl.nlmsg_type = RTM_GETROUTE; | ||
344 | |||
345 | req.rt.rtm_family = rtm_family; | ||
346 | req.rt.rtm_table = RT_TABLE_MAIN; | ||
347 | req.nl.nlmsg_pid = 0; | ||
348 | req.nl.nlmsg_seq = ++seq; | ||
349 | memset(&msg, 0, sizeof(msg)); | ||
350 | iov.iov_base = (void *)&req.nl; | ||
351 | iov.iov_len = req.nl.nlmsg_len; | ||
352 | msg.msg_iov = &iov; | ||
353 | msg.msg_iovlen = 1; | ||
354 | ret = sendmsg(sock, &msg, 0); | ||
355 | if (ret < 0) { | ||
356 | printf("send to netlink: %s\n", strerror(errno)); | ||
357 | ret = -1; | ||
358 | goto cleanup; | ||
359 | } | ||
360 | memset(buf, 0, sizeof(buf)); | ||
361 | nll = recv_msg(sa, sock); | ||
362 | if (nll < 0) { | ||
363 | printf("recv from netlink: %s\n", strerror(nll)); | ||
364 | ret = -1; | ||
365 | goto cleanup; | ||
366 | } | ||
367 | nh = (struct nlmsghdr *)buf; | ||
368 | read_route(nh, nll); | ||
369 | cleanup: | ||
370 | close(sock); | ||
371 | return ret; | ||
372 | } | ||
373 | |||
374 | /* Function to parse the arp entry returned by netlink | ||
375 | * Updates the arp entry related map entries | ||
376 | */ | ||
377 | static void read_arp(struct nlmsghdr *nh, int nll) | ||
378 | { | ||
379 | struct rtattr *rt_attr; | ||
380 | char dsts[24], mac[24]; | ||
381 | struct ndmsg *rt_msg; | ||
382 | int rtl, ndm_family; | ||
383 | |||
384 | struct arp_table { | ||
385 | __be64 mac; | ||
386 | __be32 dst; | ||
387 | } arp_entry; | ||
388 | struct direct_map { | ||
389 | struct arp_table arp; | ||
390 | int ifindex; | ||
391 | __be64 mac; | ||
392 | } direct_entry; | ||
393 | |||
394 | if (nh->nlmsg_type == RTM_GETNEIGH) | ||
395 | printf("READING arp entry\n"); | ||
396 | printf("Address\tHwAddress\n"); | ||
397 | for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { | ||
398 | rt_msg = (struct ndmsg *)NLMSG_DATA(nh); | ||
399 | rt_attr = (struct rtattr *)RTM_RTA(rt_msg); | ||
400 | ndm_family = rt_msg->ndm_family; | ||
401 | rtl = RTM_PAYLOAD(nh); | ||
402 | for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) { | ||
403 | switch (rt_attr->rta_type) { | ||
404 | case NDA_DST: | ||
405 | sprintf(dsts, "%u", | ||
406 | *((__be32 *)RTA_DATA(rt_attr))); | ||
407 | break; | ||
408 | case NDA_LLADDR: | ||
409 | sprintf(mac, "%lld", | ||
410 | *((__be64 *)RTA_DATA(rt_attr))); | ||
411 | break; | ||
412 | default: | ||
413 | break; | ||
414 | } | ||
415 | } | ||
416 | arp_entry.dst = atoi(dsts); | ||
417 | arp_entry.mac = atol(mac); | ||
418 | printf("%x\t\t%llx\n", arp_entry.dst, arp_entry.mac); | ||
419 | if (ndm_family == AF_INET) { | ||
420 | if (bpf_map_lookup_elem(exact_match_map_fd, | ||
421 | &arp_entry.dst, | ||
422 | &direct_entry) == 0) { | ||
423 | if (nh->nlmsg_type == RTM_DELNEIGH) { | ||
424 | direct_entry.arp.dst = 0; | ||
425 | direct_entry.arp.mac = 0; | ||
426 | } else if (nh->nlmsg_type == RTM_NEWNEIGH) { | ||
427 | direct_entry.arp.dst = arp_entry.dst; | ||
428 | direct_entry.arp.mac = arp_entry.mac; | ||
429 | } | ||
430 | assert(bpf_map_update_elem(exact_match_map_fd, | ||
431 | &arp_entry.dst, | ||
432 | &direct_entry, 0 | ||
433 | ) == 0); | ||
434 | memset(&direct_entry, 0, sizeof(direct_entry)); | ||
435 | } | ||
436 | if (nh->nlmsg_type == RTM_DELNEIGH) { | ||
437 | assert(bpf_map_delete_elem(arp_table_map_fd, | ||
438 | &arp_entry.dst) == 0); | ||
439 | } else if (nh->nlmsg_type == RTM_NEWNEIGH) { | ||
440 | assert(bpf_map_update_elem(arp_table_map_fd, | ||
441 | &arp_entry.dst, | ||
442 | &arp_entry.mac, 0 | ||
443 | ) == 0); | ||
444 | } | ||
445 | } | ||
446 | memset(&arp_entry, 0, sizeof(arp_entry)); | ||
447 | memset(dsts, 0, sizeof(dsts)); | ||
448 | } | ||
449 | } | ||
450 | |||
451 | /* Function to read the existing arp table when the process is launched*/ | ||
452 | static int get_arp_table(int rtm_family) | ||
453 | { | ||
454 | struct sockaddr_nl sa; | ||
455 | struct nlmsghdr *nh; | ||
456 | int sock, seq = 0; | ||
457 | struct msghdr msg; | ||
458 | struct iovec iov; | ||
459 | int ret = 0; | ||
460 | int nll; | ||
461 | struct { | ||
462 | struct nlmsghdr nl; | ||
463 | struct ndmsg rt; | ||
464 | char buf[8192]; | ||
465 | } req; | ||
466 | |||
467 | sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); | ||
468 | if (sock < 0) { | ||
469 | printf("open netlink socket: %s\n", strerror(errno)); | ||
470 | return -1; | ||
471 | } | ||
472 | memset(&sa, 0, sizeof(sa)); | ||
473 | sa.nl_family = AF_NETLINK; | ||
474 | if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { | ||
475 | printf("bind to netlink: %s\n", strerror(errno)); | ||
476 | ret = -1; | ||
477 | goto cleanup; | ||
478 | } | ||
479 | memset(&req, 0, sizeof(req)); | ||
480 | req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); | ||
481 | req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; | ||
482 | req.nl.nlmsg_type = RTM_GETNEIGH; | ||
483 | req.rt.ndm_state = NUD_REACHABLE; | ||
484 | req.rt.ndm_family = rtm_family; | ||
485 | req.nl.nlmsg_pid = 0; | ||
486 | req.nl.nlmsg_seq = ++seq; | ||
487 | memset(&msg, 0, sizeof(msg)); | ||
488 | iov.iov_base = (void *)&req.nl; | ||
489 | iov.iov_len = req.nl.nlmsg_len; | ||
490 | msg.msg_iov = &iov; | ||
491 | msg.msg_iovlen = 1; | ||
492 | ret = sendmsg(sock, &msg, 0); | ||
493 | if (ret < 0) { | ||
494 | printf("send to netlink: %s\n", strerror(errno)); | ||
495 | ret = -1; | ||
496 | goto cleanup; | ||
497 | } | ||
498 | memset(buf, 0, sizeof(buf)); | ||
499 | nll = recv_msg(sa, sock); | ||
500 | if (nll < 0) { | ||
501 | printf("recv from netlink: %s\n", strerror(nll)); | ||
502 | ret = -1; | ||
503 | goto cleanup; | ||
504 | } | ||
505 | nh = (struct nlmsghdr *)buf; | ||
506 | read_arp(nh, nll); | ||
507 | cleanup: | ||
508 | close(sock); | ||
509 | return ret; | ||
510 | } | ||
511 | |||
512 | /* Keep track of changes in the route and arp tables, update the maps, | ||
513 | * and print regular statistics of packets forwarded | ||
514 | */ | ||
515 | static int monitor_route(void) | ||
516 | { | ||
517 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
518 | const unsigned int nr_keys = 256; | ||
519 | struct pollfd fds_route, fds_arp; | ||
520 | __u64 prev[nr_keys][nr_cpus]; | ||
521 | struct sockaddr_nl la, lr; | ||
522 | __u64 values[nr_cpus]; | ||
523 | struct nlmsghdr *nh; | ||
524 | int nll, ret = 0; | ||
525 | int interval = 5; | ||
526 | __u32 key; | ||
527 | int i; | ||
528 | |||
529 | sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); | ||
530 | if (sock < 0) { | ||
531 | printf("open netlink socket: %s\n", strerror(errno)); | ||
532 | return -1; | ||
533 | } | ||
534 | |||
535 | fcntl(sock, F_SETFL, O_NONBLOCK); | ||
536 | memset(&lr, 0, sizeof(lr)); | ||
537 | lr.nl_family = AF_NETLINK; | ||
538 | lr.nl_groups = RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_ROUTE | RTMGRP_NOTIFY; | ||
539 | if (bind(sock, (struct sockaddr *)&lr, sizeof(lr)) < 0) { | ||
540 | printf("bind to netlink: %s\n", strerror(errno)); | ||
541 | ret = -1; | ||
542 | goto cleanup; | ||
543 | } | ||
544 | fds_route.fd = sock; | ||
545 | fds_route.events = POLL_IN; | ||
546 | |||
547 | sock_arp = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); | ||
548 | if (sock_arp < 0) { | ||
549 | printf("open netlink socket: %s\n", strerror(errno)); | ||
550 | return -1; | ||
551 | } | ||
552 | |||
553 | fcntl(sock_arp, F_SETFL, O_NONBLOCK); | ||
554 | memset(&la, 0, sizeof(la)); | ||
555 | la.nl_family = AF_NETLINK; | ||
556 | la.nl_groups = RTMGRP_NEIGH | RTMGRP_NOTIFY; | ||
557 | if (bind(sock_arp, (struct sockaddr *)&la, sizeof(la)) < 0) { | ||
558 | printf("bind to netlink: %s\n", strerror(errno)); | ||
559 | ret = -1; | ||
560 | goto cleanup; | ||
561 | } | ||
562 | fds_arp.fd = sock_arp; | ||
563 | fds_arp.events = POLL_IN; | ||
564 | |||
565 | memset(prev, 0, sizeof(prev)); | ||
566 | do { | ||
567 | signal(SIGINT, close_and_exit); | ||
568 | signal(SIGTERM, close_and_exit); | ||
569 | |||
570 | sleep(interval); | ||
571 | for (key = 0; key < nr_keys; key++) { | ||
572 | __u64 sum = 0; | ||
573 | |||
574 | assert(bpf_map_lookup_elem(rxcnt_map_fd, | ||
575 | &key, values) == 0); | ||
576 | for (i = 0; i < nr_cpus; i++) | ||
577 | sum += (values[i] - prev[key][i]); | ||
578 | if (sum) | ||
579 | printf("proto %u: %10llu pkt/s\n", | ||
580 | key, sum / interval); | ||
581 | memcpy(prev[key], values, sizeof(values)); | ||
582 | } | ||
583 | |||
584 | memset(buf, 0, sizeof(buf)); | ||
585 | if (poll(&fds_route, 1, 3) == POLL_IN) { | ||
586 | nll = recv_msg(lr, sock); | ||
587 | if (nll < 0) { | ||
588 | printf("recv from netlink: %s\n", strerror(nll)); | ||
589 | ret = -1; | ||
590 | goto cleanup; | ||
591 | } | ||
592 | |||
593 | nh = (struct nlmsghdr *)buf; | ||
594 | printf("Routing table updated.\n"); | ||
595 | read_route(nh, nll); | ||
596 | } | ||
597 | memset(buf, 0, sizeof(buf)); | ||
598 | if (poll(&fds_arp, 1, 3) == POLL_IN) { | ||
599 | nll = recv_msg(la, sock_arp); | ||
600 | if (nll < 0) { | ||
601 | printf("recv from netlink: %s\n", strerror(nll)); | ||
602 | ret = -1; | ||
603 | goto cleanup; | ||
604 | } | ||
605 | |||
606 | nh = (struct nlmsghdr *)buf; | ||
607 | read_arp(nh, nll); | ||
608 | } | ||
609 | |||
610 | } while (1); | ||
611 | cleanup: | ||
612 | close(sock); | ||
613 | return ret; | ||
614 | } | ||
615 | |||
616 | static void usage(const char *prog) | ||
617 | { | ||
618 | fprintf(stderr, | ||
619 | "%s: %s [OPTS] interface name list\n\n" | ||
620 | "OPTS:\n" | ||
621 | " -S use skb-mode\n" | ||
622 | " -F force loading prog\n", | ||
623 | __func__, prog); | ||
624 | } | ||
625 | |||
626 | int main(int ac, char **argv) | ||
627 | { | ||
628 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
629 | struct bpf_prog_load_attr prog_load_attr = { | ||
630 | .prog_type = BPF_PROG_TYPE_XDP, | ||
631 | }; | ||
632 | struct bpf_prog_info info = {}; | ||
633 | __u32 info_len = sizeof(info); | ||
634 | const char *optstr = "SF"; | ||
635 | struct bpf_object *obj; | ||
636 | char filename[256]; | ||
637 | char **ifname_list; | ||
638 | int prog_fd, opt; | ||
639 | int err, i = 1; | ||
640 | |||
641 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
642 | prog_load_attr.file = filename; | ||
643 | |||
644 | total_ifindex = ac - 1; | ||
645 | ifname_list = (argv + 1); | ||
646 | |||
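/* Options are expected before the interface names on the command line;
 * each recognized option below shifts ifname_list forward by one argv
 * entry and shrinks total_ifindex accordingly.
 */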
647 | while ((opt = getopt(ac, argv, optstr)) != -1) { | ||
648 | switch (opt) { | ||
649 | case 'S': | ||
650 | flags |= XDP_FLAGS_SKB_MODE; | ||
651 | total_ifindex--; | ||
652 | ifname_list++; | ||
653 | break; | ||
654 | case 'F': | ||
655 | flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
656 | total_ifindex--; | ||
657 | ifname_list++; | ||
658 | break; | ||
659 | default: | ||
660 | usage(basename(argv[0])); | ||
661 | return 1; | ||
662 | } | ||
663 | } | ||
664 | |||
665 | if (!(flags & XDP_FLAGS_SKB_MODE)) | ||
666 | flags |= XDP_FLAGS_DRV_MODE; | ||
667 | |||
668 | if (optind == ac) { | ||
669 | usage(basename(argv[0])); | ||
670 | return 1; | ||
671 | } | ||
672 | |||
673 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
674 | perror("setrlimit(RLIMIT_MEMLOCK)"); | ||
675 | return 1; | ||
676 | } | ||
677 | |||
678 | if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) | ||
679 | return 1; | ||
680 | |||
681 | printf("\n**************loading bpf file*********************\n\n\n"); | ||
682 | if (!prog_fd) { | ||
683 | printf("bpf_prog_load_xattr: %s\n", strerror(errno)); | ||
684 | return 1; | ||
685 | } | ||
686 | |||
687 | lpm_map_fd = bpf_object__find_map_fd_by_name(obj, "lpm_map"); | ||
688 | rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); | ||
689 | arp_table_map_fd = bpf_object__find_map_fd_by_name(obj, "arp_table"); | ||
690 | exact_match_map_fd = bpf_object__find_map_fd_by_name(obj, | ||
691 | "exact_match"); | ||
692 | tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port"); | ||
693 | if (lpm_map_fd < 0 || rxcnt_map_fd < 0 || arp_table_map_fd < 0 || | ||
694 | exact_match_map_fd < 0 || tx_port_map_fd < 0) { | ||
695 | printf("bpf_object__find_map_fd_by_name failed\n"); | ||
696 | return 1; | ||
697 | } | ||
698 | |||
699 | ifindex_list = (int *)calloc(total_ifindex, sizeof(int)); | ||
700 | for (i = 0; i < total_ifindex; i++) { | ||
701 | ifindex_list[i] = if_nametoindex(ifname_list[i]); | ||
702 | if (!ifindex_list[i]) { | ||
703 | printf("Couldn't translate interface name: %s\n", | ||
704 | strerror(errno)); | ||
705 | return 1; | ||
706 | } | ||
707 | } | ||
708 | prog_id_list = (__u32 *)calloc(total_ifindex, sizeof(__u32)); | ||
709 | for (i = 0; i < total_ifindex; i++) { | ||
710 | if (bpf_set_link_xdp_fd(ifindex_list[i], prog_fd, flags) < 0) { | ||
711 | printf("link set xdp fd failed\n"); | ||
712 | int recovery_index = i; | ||
713 | |||
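/* Roll back: detach the program from every interface that was
 * already attached before the failure.
 */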
714 | for (i = 0; i < recovery_index; i++) | ||
715 | bpf_set_link_xdp_fd(ifindex_list[i], -1, flags); | ||
716 | |||
717 | return 1; | ||
718 | } | ||
719 | err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); | ||
720 | if (err) { | ||
721 | printf("can't get prog info - %s\n", strerror(errno)); | ||
722 | return err; | ||
723 | } | ||
724 | prog_id_list[i] = info.id; | ||
725 | memset(&info, 0, sizeof(info)); | ||
726 | printf("Attached to %d\n", ifindex_list[i]); | ||
727 | } | ||
728 | signal(SIGINT, int_exit); | ||
729 | signal(SIGTERM, int_exit); | ||
730 | |||
731 | printf("*******************ROUTE TABLE*************************\n\n\n"); | ||
732 | get_route_table(AF_INET); | ||
733 | printf("*******************ARP TABLE***************************\n\n\n"); | ||
734 | get_arp_table(AF_INET); | ||
735 | if (monitor_route() < 0) { | ||
736 | printf("Error in receiving route update\n"); | ||
737 | return 1; | ||
738 | } | ||
739 | |||
740 | return 0; | ||
741 | } | ||
diff --git a/samples/bpf/xdp_rxq_info_kern.c b/samples/bpf/xdp_rxq_info_kern.c new file mode 100644 index 000000000..5e7459f9b --- /dev/null +++ b/samples/bpf/xdp_rxq_info_kern.c | |||
@@ -0,0 +1,140 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 | ||
2 | * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. | ||
3 | * | ||
4 | * Example of how to extract XDP RX-queue info | ||
5 | */ | ||
6 | #include <uapi/linux/bpf.h> | ||
7 | #include <uapi/linux/if_ether.h> | ||
8 | #include <uapi/linux/in.h> | ||
9 | #include <bpf/bpf_helpers.h> | ||
10 | |||
11 | /* Config setup from userspace | ||
12 | * | ||
13 | * Userspace stores the ifindex in config_map so this program can | ||
14 | * verify that ctx->ingress_ifindex matches the configured ifindex. | ||
15 | */ | ||
16 | struct config { | ||
17 | __u32 action; | ||
18 | int ifindex; | ||
19 | __u32 options; | ||
20 | }; | ||
21 | enum cfg_options_flags { | ||
22 | NO_TOUCH = 0x0U, | ||
23 | READ_MEM = 0x1U, | ||
24 | SWAP_MAC = 0x2U, | ||
25 | }; | ||
26 | |||
27 | struct { | ||
28 | __uint(type, BPF_MAP_TYPE_ARRAY); | ||
29 | __type(key, int); | ||
30 | __type(value, struct config); | ||
31 | __uint(max_entries, 1); | ||
32 | } config_map SEC(".maps"); | ||
33 | |||
34 | /* Common stats data record (shared with userspace) */ | ||
35 | struct datarec { | ||
36 | __u64 processed; | ||
37 | __u64 issue; | ||
38 | }; | ||
39 | |||
40 | struct { | ||
41 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
42 | __type(key, u32); | ||
43 | __type(value, struct datarec); | ||
44 | __uint(max_entries, 1); | ||
45 | } stats_global_map SEC(".maps"); | ||
46 | |||
47 | #define MAX_RXQs 64 | ||
48 | |||
49 | /* Stats per rx_queue_index (per CPU) */ | ||
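/* The extra slot (index MAX_RXQs) accumulates packets whose
 * rx_queue_index is out of range; see the overflow handling below.
 */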
50 | struct { | ||
51 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
52 | __type(key, u32); | ||
53 | __type(value, struct datarec); | ||
54 | __uint(max_entries, MAX_RXQs + 1); | ||
55 | } rx_queue_index_map SEC(".maps"); | ||
56 | |||
57 | static __always_inline | ||
58 | void swap_src_dst_mac(void *data) | ||
59 | { | ||
60 | unsigned short *p = data; | ||
61 | unsigned short dst[3]; | ||
62 | |||
63 | dst[0] = p[0]; | ||
64 | dst[1] = p[1]; | ||
65 | dst[2] = p[2]; | ||
66 | p[0] = p[3]; | ||
67 | p[1] = p[4]; | ||
68 | p[2] = p[5]; | ||
69 | p[3] = dst[0]; | ||
70 | p[4] = dst[1]; | ||
71 | p[5] = dst[2]; | ||
72 | } | ||
73 | |||
74 | SEC("xdp_prog0") | ||
75 | int xdp_prognum0(struct xdp_md *ctx) | ||
76 | { | ||
77 | void *data_end = (void *)(long)ctx->data_end; | ||
78 | void *data = (void *)(long)ctx->data; | ||
79 | struct datarec *rec, *rxq_rec; | ||
80 | int ingress_ifindex; | ||
81 | struct config *config; | ||
82 | u32 key = 0; | ||
83 | |||
84 | /* Global stats record */ | ||
85 | rec = bpf_map_lookup_elem(&stats_global_map, &key); | ||
86 | if (!rec) | ||
87 | return XDP_ABORTED; | ||
88 | rec->processed++; | ||
89 | |||
90 | /* Accessing ctx->ingress_ifindex causes the kernel to rewrite the | ||
91 | * BPF instructions so that they read xdp_rxq->dev->ifindex | ||
92 | */ | ||
93 | ingress_ifindex = ctx->ingress_ifindex; | ||
94 | |||
95 | config = bpf_map_lookup_elem(&config_map, &key); | ||
96 | if (!config) | ||
97 | return XDP_ABORTED; | ||
98 | |||
99 | /* Simple test: check ctx provided ifindex is as expected */ | ||
100 | if (ingress_ifindex != config->ifindex) { | ||
101 | /* count this error case */ | ||
102 | rec->issue++; | ||
103 | return XDP_ABORTED; | ||
104 | } | ||
105 | |||
106 | /* Update stats per rx_queue_index. Handle the case where | ||
107 | * rx_queue_index is larger than the stats map can hold. | ||
108 | */ | ||
109 | key = ctx->rx_queue_index; | ||
110 | if (key >= MAX_RXQs) | ||
111 | key = MAX_RXQs; | ||
112 | rxq_rec = bpf_map_lookup_elem(&rx_queue_index_map, &key); | ||
113 | if (!rxq_rec) | ||
114 | return XDP_ABORTED; | ||
115 | rxq_rec->processed++; | ||
116 | if (key == MAX_RXQs) | ||
117 | rxq_rec->issue++; | ||
118 | |||
119 | /* Default: Don't touch packet data, only count packets */ | ||
120 | if (unlikely(config->options & (READ_MEM|SWAP_MAC))) { | ||
121 | struct ethhdr *eth = data; | ||
122 | |||
123 | if (eth + 1 > data_end) | ||
124 | return XDP_ABORTED; | ||
125 | |||
126 | /* Avoid compiler removing this: Drop non 802.3 Ethertypes */ | ||
127 | if (ntohs(eth->h_proto) < ETH_P_802_3_MIN) | ||
128 | return XDP_ABORTED; | ||
129 | |||
130 | /* XDP_TX requires changing MAC-addrs, else HW may drop. | ||
131 | * Can also be enabled with --swapmac (for test purposes) | ||
132 | */ | ||
133 | if (unlikely(config->options & SWAP_MAC)) | ||
134 | swap_src_dst_mac(data); | ||
135 | } | ||
136 | |||
137 | return config->action; | ||
138 | } | ||
139 | |||
140 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c new file mode 100644 index 000000000..93fa1bc54 --- /dev/null +++ b/samples/bpf/xdp_rxq_info_user.c | |||
@@ -0,0 +1,605 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 | ||
2 | * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. | ||
3 | */ | ||
4 | static const char *__doc__ = " XDP RX-queue info extract example\n\n" | ||
5 | "Monitor how many packets per sec (pps) are received\n" | ||
6 | "per NIC RX queue index and which CPU processed the packet\n" | ||
7 | ; | ||
8 | |||
9 | #include <errno.h> | ||
10 | #include <signal.h> | ||
11 | #include <stdio.h> | ||
12 | #include <stdlib.h> | ||
13 | #include <stdbool.h> | ||
14 | #include <string.h> | ||
15 | #include <unistd.h> | ||
16 | #include <locale.h> | ||
17 | #include <sys/resource.h> | ||
18 | #include <getopt.h> | ||
19 | #include <net/if.h> | ||
20 | #include <time.h> | ||
21 | |||
22 | #include <arpa/inet.h> | ||
23 | #include <linux/if_link.h> | ||
24 | |||
25 | #include <bpf/bpf.h> | ||
26 | #include <bpf/libbpf.h> | ||
27 | #include "bpf_util.h" | ||
28 | |||
29 | static int ifindex = -1; | ||
30 | static char ifname_buf[IF_NAMESIZE]; | ||
31 | static char *ifname; | ||
32 | static __u32 prog_id; | ||
33 | |||
34 | static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
35 | |||
36 | static struct bpf_map *stats_global_map; | ||
37 | static struct bpf_map *rx_queue_index_map; | ||
38 | |||
39 | /* Exit return codes */ | ||
40 | #define EXIT_OK 0 | ||
41 | #define EXIT_FAIL 1 | ||
42 | #define EXIT_FAIL_OPTION 2 | ||
43 | #define EXIT_FAIL_XDP 3 | ||
44 | #define EXIT_FAIL_BPF 4 | ||
45 | #define EXIT_FAIL_MEM 5 | ||
46 | |||
47 | static const struct option long_options[] = { | ||
48 | {"help", no_argument, NULL, 'h' }, | ||
49 | {"dev", required_argument, NULL, 'd' }, | ||
50 | {"skb-mode", no_argument, NULL, 'S' }, | ||
51 | {"sec", required_argument, NULL, 's' }, | ||
52 | {"no-separators", no_argument, NULL, 'z' }, | ||
53 | {"action", required_argument, NULL, 'a' }, | ||
54 | {"readmem", no_argument, NULL, 'r' }, | ||
55 | {"swapmac", no_argument, NULL, 'm' }, | ||
56 | {"force", no_argument, NULL, 'F' }, | ||
57 | {0, 0, NULL, 0 } | ||
58 | }; | ||
59 | |||
60 | static void int_exit(int sig) | ||
61 | { | ||
62 | __u32 curr_prog_id = 0; | ||
63 | |||
64 | if (ifindex > -1) { | ||
65 | if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) { | ||
66 | printf("bpf_get_link_xdp_id failed\n"); | ||
67 | exit(EXIT_FAIL); | ||
68 | } | ||
69 | if (prog_id == curr_prog_id) { | ||
70 | fprintf(stderr, | ||
71 | "Interrupted: Removing XDP program on ifindex:%d device:%s\n", | ||
72 | ifindex, ifname); | ||
73 | bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); | ||
74 | } else if (!curr_prog_id) { | ||
75 | printf("couldn't find a prog id on the given interface\n"); | ||
76 | } else { | ||
77 | printf("program on interface changed, not removing\n"); | ||
78 | } | ||
79 | } | ||
80 | exit(EXIT_OK); | ||
81 | } | ||
82 | |||
83 | struct config { | ||
84 | __u32 action; | ||
85 | int ifindex; | ||
86 | __u32 options; | ||
87 | }; | ||
88 | enum cfg_options_flags { | ||
89 | NO_TOUCH = 0x0U, | ||
90 | READ_MEM = 0x1U, | ||
91 | SWAP_MAC = 0x2U, | ||
92 | }; | ||
93 | #define XDP_ACTION_MAX (XDP_TX + 1) | ||
94 | #define XDP_ACTION_MAX_STRLEN 11 | ||
95 | static const char *xdp_action_names[XDP_ACTION_MAX] = { | ||
96 | [XDP_ABORTED] = "XDP_ABORTED", | ||
97 | [XDP_DROP] = "XDP_DROP", | ||
98 | [XDP_PASS] = "XDP_PASS", | ||
99 | [XDP_TX] = "XDP_TX", | ||
100 | }; | ||
101 | |||
102 | static const char *action2str(int action) | ||
103 | { | ||
104 | if (action < XDP_ACTION_MAX) | ||
105 | return xdp_action_names[action]; | ||
106 | return NULL; | ||
107 | } | ||
108 | |||
109 | static int parse_xdp_action(char *action_str) | ||
110 | { | ||
111 | size_t maxlen; | ||
112 | __u64 action = -1; | ||
113 | int i; | ||
114 | |||
115 | for (i = 0; i < XDP_ACTION_MAX; i++) { | ||
116 | maxlen = XDP_ACTION_MAX_STRLEN; | ||
117 | if (strncmp(xdp_action_names[i], action_str, maxlen) == 0) { | ||
118 | action = i; | ||
119 | break; | ||
120 | } | ||
121 | } | ||
122 | return action; | ||
123 | } | ||
124 | |||
125 | static void list_xdp_actions(void) | ||
126 | { | ||
127 | int i; | ||
128 | |||
129 | printf("Available XDP --action <options>\n"); | ||
130 | for (i = 0; i < XDP_ACTION_MAX; i++) | ||
131 | printf("\t%s\n", xdp_action_names[i]); | ||
132 | printf("\n"); | ||
133 | } | ||
134 | |||
135 | static char *options2str(enum cfg_options_flags flag) | ||
136 | { | ||
137 | if (flag == NO_TOUCH) | ||
138 | return "no_touch"; | ||
139 | if (flag & SWAP_MAC) | ||
140 | return "swapmac"; | ||
141 | if (flag & READ_MEM) | ||
142 | return "read"; | ||
143 | fprintf(stderr, "ERR: Unknown config option flags\n"); | ||
144 | exit(EXIT_FAIL); | ||
145 | } | ||
146 | |||
147 | static void usage(char *argv[]) | ||
148 | { | ||
149 | int i; | ||
150 | |||
151 | printf("\nDOCUMENTATION:\n%s\n", __doc__); | ||
152 | printf(" Usage: %s (options-see-below)\n", argv[0]); | ||
153 | printf(" Listing options:\n"); | ||
154 | for (i = 0; long_options[i].name != 0; i++) { | ||
155 | printf(" --%-12s", long_options[i].name); | ||
156 | if (long_options[i].flag != NULL) | ||
157 | printf(" flag (internal value:%d)", | ||
158 | *long_options[i].flag); | ||
159 | else | ||
160 | printf(" short-option: -%c", | ||
161 | long_options[i].val); | ||
162 | printf("\n"); | ||
163 | } | ||
164 | printf("\n"); | ||
165 | list_xdp_actions(); | ||
166 | } | ||
167 | |||
168 | #define NANOSEC_PER_SEC 1000000000 /* 10^9 */ | ||
169 | static __u64 gettime(void) | ||
170 | { | ||
171 | struct timespec t; | ||
172 | int res; | ||
173 | |||
174 | res = clock_gettime(CLOCK_MONOTONIC, &t); | ||
175 | if (res < 0) { | ||
176 | fprintf(stderr, "Error with clock_gettime! (%i)\n", res); | ||
177 | exit(EXIT_FAIL); | ||
178 | } | ||
179 | return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; | ||
180 | } | ||
181 | |||
182 | /* Common stats data record shared with _kern.c */ | ||
183 | struct datarec { | ||
184 | __u64 processed; | ||
185 | __u64 issue; | ||
186 | }; | ||
187 | struct record { | ||
188 | __u64 timestamp; | ||
189 | struct datarec total; | ||
190 | struct datarec *cpu; | ||
191 | }; | ||
192 | struct stats_record { | ||
193 | struct record stats; | ||
194 | struct record *rxq; | ||
195 | }; | ||
196 | |||
197 | static struct datarec *alloc_record_per_cpu(void) | ||
198 | { | ||
199 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
200 | struct datarec *array; | ||
201 | |||
202 | array = calloc(nr_cpus, sizeof(struct datarec)); | ||
203 | if (!array) { | ||
204 | fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); | ||
205 | exit(EXIT_FAIL_MEM); | ||
206 | } | ||
207 | return array; | ||
208 | } | ||
209 | |||
210 | static struct record *alloc_record_per_rxq(void) | ||
211 | { | ||
212 | unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; | ||
213 | struct record *array; | ||
214 | |||
215 | array = calloc(nr_rxqs, sizeof(struct record)); | ||
216 | if (!array) { | ||
217 | fprintf(stderr, "Mem alloc error (nr_rxqs:%u)\n", nr_rxqs); | ||
218 | exit(EXIT_FAIL_MEM); | ||
219 | } | ||
220 | return array; | ||
221 | } | ||
222 | |||
223 | static struct stats_record *alloc_stats_record(void) | ||
224 | { | ||
225 | unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; | ||
226 | struct stats_record *rec; | ||
227 | int i; | ||
228 | |||
229 | rec = calloc(1, sizeof(struct stats_record)); | ||
230 | if (!rec) { | ||
231 | fprintf(stderr, "Mem alloc error\n"); | ||
232 | exit(EXIT_FAIL_MEM); | ||
233 | } | ||
234 | rec->rxq = alloc_record_per_rxq(); | ||
235 | for (i = 0; i < nr_rxqs; i++) | ||
236 | rec->rxq[i].cpu = alloc_record_per_cpu(); | ||
237 | |||
238 | rec->stats.cpu = alloc_record_per_cpu(); | ||
239 | return rec; | ||
240 | } | ||
241 | |||
242 | static void free_stats_record(struct stats_record *r) | ||
243 | { | ||
244 | unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; | ||
245 | int i; | ||
246 | |||
247 | for (i = 0; i < nr_rxqs; i++) | ||
248 | free(r->rxq[i].cpu); | ||
249 | |||
250 | free(r->rxq); | ||
251 | free(r->stats.cpu); | ||
252 | free(r); | ||
253 | } | ||
254 | |||
255 | static bool map_collect_percpu(int fd, __u32 key, struct record *rec) | ||
256 | { | ||
257 | /* For percpu maps, userspace gets a value per possible CPU */ | ||
258 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
259 | struct datarec values[nr_cpus]; | ||
260 | __u64 sum_processed = 0; | ||
261 | __u64 sum_issue = 0; | ||
262 | int i; | ||
263 | |||
264 | if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { | ||
265 | fprintf(stderr, | ||
266 | "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); | ||
267 | return false; | ||
268 | } | ||
269 | /* Get time as close as possible to reading map contents */ | ||
270 | rec->timestamp = gettime(); | ||
271 | |||
272 | /* Record and sum values from each CPU */ | ||
273 | for (i = 0; i < nr_cpus; i++) { | ||
274 | rec->cpu[i].processed = values[i].processed; | ||
275 | sum_processed += values[i].processed; | ||
276 | rec->cpu[i].issue = values[i].issue; | ||
277 | sum_issue += values[i].issue; | ||
278 | } | ||
279 | rec->total.processed = sum_processed; | ||
280 | rec->total.issue = sum_issue; | ||
281 | return true; | ||
282 | } | ||
283 | |||
284 | static void stats_collect(struct stats_record *rec) | ||
285 | { | ||
286 | int fd, i, max_rxqs; | ||
287 | |||
288 | fd = bpf_map__fd(stats_global_map); | ||
289 | map_collect_percpu(fd, 0, &rec->stats); | ||
290 | |||
291 | fd = bpf_map__fd(rx_queue_index_map); | ||
292 | max_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; | ||
293 | for (i = 0; i < max_rxqs; i++) | ||
294 | map_collect_percpu(fd, i, &rec->rxq[i]); | ||
295 | } | ||
296 | |||
297 | static double calc_period(struct record *r, struct record *p) | ||
298 | { | ||
299 | double period_ = 0; | ||
300 | __u64 period = 0; | ||
301 | |||
302 | period = r->timestamp - p->timestamp; | ||
303 | if (period > 0) | ||
304 | period_ = ((double) period / NANOSEC_PER_SEC); | ||
305 | |||
306 | return period_; | ||
307 | } | ||
308 | |||
309 | static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_) | ||
310 | { | ||
311 | __u64 packets = 0; | ||
312 | __u64 pps = 0; | ||
313 | |||
314 | if (period_ > 0) { | ||
315 | packets = r->processed - p->processed; | ||
316 | pps = packets / period_; | ||
317 | } | ||
318 | return pps; | ||
319 | } | ||
320 | |||
321 | static __u64 calc_errs_pps(struct datarec *r, | ||
322 | struct datarec *p, double period_) | ||
323 | { | ||
324 | __u64 packets = 0; | ||
325 | __u64 pps = 0; | ||
326 | |||
327 | if (period_ > 0) { | ||
328 | packets = r->issue - p->issue; | ||
329 | pps = packets / period_; | ||
330 | } | ||
331 | return pps; | ||
332 | } | ||
333 | |||
334 | static void stats_print(struct stats_record *stats_rec, | ||
335 | struct stats_record *stats_prev, | ||
336 | int action, __u32 cfg_opt) | ||
337 | { | ||
338 | unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries; | ||
339 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
340 | double pps = 0, err = 0; | ||
341 | struct record *rec, *prev; | ||
342 | double t; | ||
343 | int rxq; | ||
344 | int i; | ||
345 | |||
346 | /* Header */ | ||
347 | printf("\nRunning XDP on dev:%s (ifindex:%d) action:%s options:%s\n", | ||
348 | ifname, ifindex, action2str(action), options2str(cfg_opt)); | ||
349 | |||
350 | /* stats_global_map */ | ||
351 | { | ||
352 | char *fmt_rx = "%-15s %-7d %'-11.0f %'-10.0f %s\n"; | ||
353 | char *fm2_rx = "%-15s %-7s %'-11.0f\n"; | ||
354 | char *errstr = ""; | ||
355 | |||
356 | printf("%-15s %-7s %-11s %-11s\n", | ||
357 | "XDP stats", "CPU", "pps", "issue-pps"); | ||
358 | |||
359 | rec = &stats_rec->stats; | ||
360 | prev = &stats_prev->stats; | ||
361 | t = calc_period(rec, prev); | ||
362 | for (i = 0; i < nr_cpus; i++) { | ||
363 | struct datarec *r = &rec->cpu[i]; | ||
364 | struct datarec *p = &prev->cpu[i]; | ||
365 | |||
366 | pps = calc_pps (r, p, t); | ||
367 | err = calc_errs_pps(r, p, t); | ||
368 | if (err > 0) | ||
369 | errstr = "invalid-ifindex"; | ||
370 | if (pps > 0) | ||
371 | printf(fmt_rx, "XDP-RX CPU", | ||
372 | i, pps, err, errstr); | ||
373 | } | ||
374 | pps = calc_pps (&rec->total, &prev->total, t); | ||
375 | err = calc_errs_pps(&rec->total, &prev->total, t); | ||
376 | printf(fm2_rx, "XDP-RX CPU", "total", pps, err); | ||
377 | } | ||
378 | |||
379 | /* rx_queue_index_map */ | ||
380 | printf("\n%-15s %-7s %-11s %-11s\n", | ||
381 | "RXQ stats", "RXQ:CPU", "pps", "issue-pps"); | ||
382 | |||
383 | for (rxq = 0; rxq < nr_rxqs; rxq++) { | ||
384 | char *fmt_rx = "%-15s %3d:%-3d %'-11.0f %'-10.0f %s\n"; | ||
385 | char *fm2_rx = "%-15s %3d:%-3s %'-11.0f\n"; | ||
386 | char *errstr = ""; | ||
387 | int rxq_ = rxq; | ||
388 | |||
389 | /* Last RXQ in map catches overflows */ | ||
390 | if (rxq_ == nr_rxqs - 1) | ||
391 | rxq_ = -1; | ||
392 | |||
393 | rec = &stats_rec->rxq[rxq]; | ||
394 | prev = &stats_prev->rxq[rxq]; | ||
395 | t = calc_period(rec, prev); | ||
396 | for (i = 0; i < nr_cpus; i++) { | ||
397 | struct datarec *r = &rec->cpu[i]; | ||
398 | struct datarec *p = &prev->cpu[i]; | ||
399 | |||
400 | pps = calc_pps (r, p, t); | ||
401 | err = calc_errs_pps(r, p, t); | ||
402 | if (err > 0) { | ||
403 | if (rxq_ == -1) | ||
404 | errstr = "map-overflow-RXQ"; | ||
405 | else | ||
406 | errstr = "err"; | ||
407 | } | ||
408 | if (pps > 0) | ||
409 | printf(fmt_rx, "rx_queue_index", | ||
410 | rxq_, i, pps, err, errstr); | ||
411 | } | ||
412 | pps = calc_pps (&rec->total, &prev->total, t); | ||
413 | err = calc_errs_pps(&rec->total, &prev->total, t); | ||
414 | if (pps || err) | ||
415 | printf(fm2_rx, "rx_queue_index", rxq_, "sum", pps, err); | ||
416 | } | ||
417 | } | ||
418 | |||
419 | |||
420 | /* Pointer swap trick */ | ||
421 | static inline void swap(struct stats_record **a, struct stats_record **b) | ||
422 | { | ||
423 | struct stats_record *tmp; | ||
424 | |||
425 | tmp = *a; | ||
426 | *a = *b; | ||
427 | *b = tmp; | ||
428 | } | ||
429 | |||
430 | static void stats_poll(int interval, int action, __u32 cfg_opt) | ||
431 | { | ||
432 | struct stats_record *record, *prev; | ||
433 | |||
434 | record = alloc_stats_record(); | ||
435 | prev = alloc_stats_record(); | ||
436 | stats_collect(record); | ||
437 | |||
438 | while (1) { | ||
439 | swap(&prev, &record); | ||
440 | stats_collect(record); | ||
441 | stats_print(record, prev, action, cfg_opt); | ||
442 | sleep(interval); | ||
443 | } | ||
444 | |||
445 | free_stats_record(record); | ||
446 | free_stats_record(prev); | ||
447 | } | ||
448 | |||
449 | |||
450 | int main(int argc, char **argv) | ||
451 | { | ||
452 | __u32 cfg_options = NO_TOUCH; /* Default: Don't touch packet memory */ | ||
453 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
454 | struct bpf_prog_load_attr prog_load_attr = { | ||
455 | .prog_type = BPF_PROG_TYPE_XDP, | ||
456 | }; | ||
457 | struct bpf_prog_info info = {}; | ||
458 | __u32 info_len = sizeof(info); | ||
459 | int prog_fd, map_fd, opt, err; | ||
460 | bool use_separators = true; | ||
461 | struct config cfg = { 0 }; | ||
462 | struct bpf_object *obj; | ||
463 | struct bpf_map *map; | ||
464 | char filename[256]; | ||
465 | int longindex = 0; | ||
466 | int interval = 2; | ||
467 | __u32 key = 0; | ||
468 | |||
469 | |||
470 | char action_str_buf[XDP_ACTION_MAX_STRLEN + 1 /* for \0 */] = { 0 }; | ||
471 | int action = XDP_PASS; /* Default action */ | ||
472 | char *action_str = NULL; | ||
473 | |||
474 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
475 | prog_load_attr.file = filename; | ||
476 | |||
477 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
478 | perror("setrlimit(RLIMIT_MEMLOCK)"); | ||
479 | return 1; | ||
480 | } | ||
481 | |||
482 | if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) | ||
483 | return EXIT_FAIL; | ||
484 | |||
485 | map = bpf_object__find_map_by_name(obj, "config_map"); | ||
486 | stats_global_map = bpf_object__find_map_by_name(obj, "stats_global_map"); | ||
487 | rx_queue_index_map = bpf_object__find_map_by_name(obj, "rx_queue_index_map"); | ||
488 | if (!map || !stats_global_map || !rx_queue_index_map) { | ||
489 | printf("finding a map in obj file failed\n"); | ||
490 | return EXIT_FAIL; | ||
491 | } | ||
492 | map_fd = bpf_map__fd(map); | ||
493 | |||
494 | if (!prog_fd) { | ||
495 | fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n", strerror(errno)); | ||
496 | return EXIT_FAIL; | ||
497 | } | ||
498 | |||
499 | /* Parse command line args */ | ||
500 | while ((opt = getopt_long(argc, argv, "FhSrmzd:s:a:", | ||
501 | long_options, &longindex)) != -1) { | ||
502 | switch (opt) { | ||
503 | case 'd': | ||
504 | if (strlen(optarg) >= IF_NAMESIZE) { | ||
505 | fprintf(stderr, "ERR: --dev name too long\n"); | ||
506 | goto error; | ||
507 | } | ||
508 | ifname = (char *)&ifname_buf; | ||
509 | strncpy(ifname, optarg, IF_NAMESIZE); | ||
510 | ifindex = if_nametoindex(ifname); | ||
511 | if (ifindex == 0) { | ||
512 | fprintf(stderr, | ||
513 | "ERR: --dev name unknown err(%d):%s\n", | ||
514 | errno, strerror(errno)); | ||
515 | goto error; | ||
516 | } | ||
517 | break; | ||
518 | case 's': | ||
519 | interval = atoi(optarg); | ||
520 | break; | ||
521 | case 'S': | ||
522 | xdp_flags |= XDP_FLAGS_SKB_MODE; | ||
523 | break; | ||
524 | case 'z': | ||
525 | use_separators = false; | ||
526 | break; | ||
527 | case 'a': | ||
528 | action_str = (char *)&action_str_buf; | ||
529 | strncpy(action_str, optarg, XDP_ACTION_MAX_STRLEN); | ||
530 | break; | ||
531 | case 'r': | ||
532 | cfg_options |= READ_MEM; | ||
533 | break; | ||
534 | case 'm': | ||
535 | cfg_options |= SWAP_MAC; | ||
536 | break; | ||
537 | case 'F': | ||
538 | xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
539 | break; | ||
540 | case 'h': | ||
541 | error: | ||
542 | default: | ||
543 | usage(argv); | ||
544 | return EXIT_FAIL_OPTION; | ||
545 | } | ||
546 | } | ||
547 | |||
548 | if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) | ||
549 | xdp_flags |= XDP_FLAGS_DRV_MODE; | ||
550 | |||
551 | /* Required option */ | ||
552 | if (ifindex == -1) { | ||
553 | fprintf(stderr, "ERR: required option --dev missing\n"); | ||
554 | usage(argv); | ||
555 | return EXIT_FAIL_OPTION; | ||
556 | } | ||
557 | cfg.ifindex = ifindex; | ||
558 | |||
559 | /* Parse action string */ | ||
560 | if (action_str) { | ||
561 | action = parse_xdp_action(action_str); | ||
562 | if (action < 0) { | ||
563 | fprintf(stderr, "ERR: Invalid XDP --action: %s\n", | ||
564 | action_str); | ||
565 | list_xdp_actions(); | ||
566 | return EXIT_FAIL_OPTION; | ||
567 | } | ||
568 | } | ||
569 | cfg.action = action; | ||
570 | |||
571 | /* XDP_TX requires changing MAC-addrs, else HW may drop */ | ||
572 | if (action == XDP_TX) | ||
573 | cfg_options |= SWAP_MAC; | ||
574 | cfg.options = cfg_options; | ||
575 | |||
576 | /* Trick to pretty-print with thousands separators: use %' in the format */ | ||
577 | if (use_separators) | ||
578 | setlocale(LC_NUMERIC, "en_US"); | ||
579 | |||
580 | /* Userspace stores the ifindex (and other config) in config_map */ | ||
581 | err = bpf_map_update_elem(map_fd, &key, &cfg, 0); | ||
582 | if (err) { | ||
583 | fprintf(stderr, "Store config failed (err:%d)\n", err); | ||
584 | exit(EXIT_FAIL_BPF); | ||
585 | } | ||
586 | |||
587 | /* Remove XDP program when program is interrupted or killed */ | ||
588 | signal(SIGINT, int_exit); | ||
589 | signal(SIGTERM, int_exit); | ||
590 | |||
591 | if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { | ||
592 | fprintf(stderr, "link set xdp fd failed\n"); | ||
593 | return EXIT_FAIL_XDP; | ||
594 | } | ||
595 | |||
596 | err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); | ||
597 | if (err) { | ||
598 | printf("can't get prog info - %s\n", strerror(errno)); | ||
599 | return err; | ||
600 | } | ||
601 | prog_id = info.id; | ||
602 | |||
603 | stats_poll(interval, action, cfg_options); | ||
604 | return EXIT_OK; | ||
605 | } | ||
diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c new file mode 100644 index 000000000..9cf76b340 --- /dev/null +++ b/samples/bpf/xdp_sample_pkts_kern.c | |||
@@ -0,0 +1,57 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <linux/ptrace.h> | ||
3 | #include <linux/version.h> | ||
4 | #include <uapi/linux/bpf.h> | ||
5 | #include <bpf/bpf_helpers.h> | ||
6 | |||
7 | #define SAMPLE_SIZE 64ul | ||
8 | |||
9 | struct { | ||
10 | __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); | ||
11 | __uint(key_size, sizeof(int)); | ||
12 | __uint(value_size, sizeof(u32)); | ||
13 | } my_map SEC(".maps"); | ||
14 | |||
15 | SEC("xdp_sample") | ||
16 | int xdp_sample_prog(struct xdp_md *ctx) | ||
17 | { | ||
18 | void *data_end = (void *)(long)ctx->data_end; | ||
19 | void *data = (void *)(long)ctx->data; | ||
20 | |||
21 | /* Metadata will be in the perf event before the packet data. */ | ||
22 | struct S { | ||
23 | u16 cookie; | ||
24 | u16 pkt_len; | ||
25 | } __packed metadata; | ||
26 | |||
27 | if (data < data_end) { | ||
28 | /* The XDP perf_event_output handler will use the upper 32 bits | ||
29 | * of the flags argument as the number of bytes of packet payload | ||
30 | * to include in the event data. If the size is too big, the | ||
31 | * call to bpf_perf_event_output will fail and return -EFAULT. | ||
32 | * | ||
33 | * See bpf_xdp_event_output in net/core/filter.c. | ||
34 | * | ||
35 | * The BPF_F_CURRENT_CPU flag means that the event output fd | ||
36 | * will be indexed by the CPU number in the event map. | ||
37 | */ | ||
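/* For a full-sized sample this makes flags equal to
 * BPF_F_CURRENT_CPU | ((u64)SAMPLE_SIZE << 32).
 */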
38 | u64 flags = BPF_F_CURRENT_CPU; | ||
39 | u16 sample_size; | ||
40 | int ret; | ||
41 | |||
42 | metadata.cookie = 0xdead; | ||
43 | metadata.pkt_len = (u16)(data_end - data); | ||
44 | sample_size = min(metadata.pkt_len, SAMPLE_SIZE); | ||
45 | flags |= (u64)sample_size << 32; | ||
46 | |||
47 | ret = bpf_perf_event_output(ctx, &my_map, flags, | ||
48 | &metadata, sizeof(metadata)); | ||
49 | if (ret) | ||
50 | bpf_printk("perf_event_output failed: %d\n", ret); | ||
51 | } | ||
52 | |||
53 | return XDP_PASS; | ||
54 | } | ||
55 | |||
56 | char _license[] SEC("license") = "GPL"; | ||
57 | u32 _version SEC("version") = LINUX_VERSION_CODE; | ||
diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c new file mode 100644 index 000000000..4b2a300c7 --- /dev/null +++ b/samples/bpf/xdp_sample_pkts_user.c | |||
@@ -0,0 +1,202 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <stdio.h> | ||
3 | #include <stdlib.h> | ||
4 | #include <string.h> | ||
5 | #include <linux/perf_event.h> | ||
6 | #include <linux/bpf.h> | ||
7 | #include <net/if.h> | ||
8 | #include <errno.h> | ||
9 | #include <assert.h> | ||
10 | #include <sys/sysinfo.h> | ||
11 | #include <sys/ioctl.h> | ||
12 | #include <signal.h> | ||
13 | #include <bpf/libbpf.h> | ||
14 | #include <bpf/bpf.h> | ||
15 | #include <sys/resource.h> | ||
16 | #include <libgen.h> | ||
17 | #include <linux/if_link.h> | ||
18 | |||
19 | #include "perf-sys.h" | ||
20 | |||
21 | static int if_idx; | ||
22 | static char *if_name; | ||
23 | static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
24 | static __u32 prog_id; | ||
25 | static struct perf_buffer *pb = NULL; | ||
26 | |||
27 | static int do_attach(int idx, int fd, const char *name) | ||
28 | { | ||
29 | struct bpf_prog_info info = {}; | ||
30 | __u32 info_len = sizeof(info); | ||
31 | int err; | ||
32 | |||
33 | err = bpf_set_link_xdp_fd(idx, fd, xdp_flags); | ||
34 | if (err < 0) { | ||
35 | printf("ERROR: failed to attach program to %s\n", name); | ||
36 | return err; | ||
37 | } | ||
38 | |||
39 | err = bpf_obj_get_info_by_fd(fd, &info, &info_len); | ||
40 | if (err) { | ||
41 | printf("can't get prog info - %s\n", strerror(errno)); | ||
42 | return err; | ||
43 | } | ||
44 | prog_id = info.id; | ||
45 | |||
46 | return err; | ||
47 | } | ||
48 | |||
49 | static int do_detach(int idx, const char *name) | ||
50 | { | ||
51 | __u32 curr_prog_id = 0; | ||
52 | int err = 0; | ||
53 | |||
54 | err = bpf_get_link_xdp_id(idx, &curr_prog_id, xdp_flags); | ||
55 | if (err) { | ||
56 | printf("bpf_get_link_xdp_id failed\n"); | ||
57 | return err; | ||
58 | } | ||
59 | if (prog_id == curr_prog_id) { | ||
60 | err = bpf_set_link_xdp_fd(idx, -1, xdp_flags); | ||
61 | if (err < 0) | ||
62 | printf("ERROR: failed to detach prog from %s\n", name); | ||
63 | } else if (!curr_prog_id) { | ||
64 | printf("couldn't find a prog id on %s\n", name); | ||
65 | } else { | ||
66 | printf("program on interface changed, not removing\n"); | ||
67 | } | ||
68 | |||
69 | return err; | ||
70 | } | ||
71 | |||
72 | #define SAMPLE_SIZE 64 | ||
73 | |||
74 | static void print_bpf_output(void *ctx, int cpu, void *data, __u32 size) | ||
75 | { | ||
76 | struct { | ||
77 | __u16 cookie; | ||
78 | __u16 pkt_len; | ||
79 | __u8 pkt_data[SAMPLE_SIZE]; | ||
80 | } __packed *e = data; | ||
81 | int i; | ||
82 | |||
83 | if (e->cookie != 0xdead) { | ||
84 | printf("BUG cookie %x sized %d\n", e->cookie, size); | ||
85 | return; | ||
86 | } | ||
87 | |||
88 | printf("Pkt len: %-5d bytes. Ethernet hdr: ", e->pkt_len); | ||
89 | for (i = 0; i < 14 && i < e->pkt_len; i++) | ||
90 | printf("%02x ", e->pkt_data[i]); | ||
91 | printf("\n"); | ||
92 | } | ||
93 | |||
94 | static void sig_handler(int signo) | ||
95 | { | ||
96 | do_detach(if_idx, if_name); | ||
97 | perf_buffer__free(pb); | ||
98 | exit(0); | ||
99 | } | ||
100 | |||
101 | static void usage(const char *prog) | ||
102 | { | ||
103 | fprintf(stderr, | ||
104 | "%s: %s [OPTS] <ifname|ifindex>\n\n" | ||
105 | "OPTS:\n" | ||
106 | " -F force loading prog\n", | ||
107 | __func__, prog); | ||
108 | } | ||
109 | |||
110 | int main(int argc, char **argv) | ||
111 | { | ||
112 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
113 | struct bpf_prog_load_attr prog_load_attr = { | ||
114 | .prog_type = BPF_PROG_TYPE_XDP, | ||
115 | }; | ||
116 | struct perf_buffer_opts pb_opts = {}; | ||
117 | const char *optstr = "FS"; | ||
118 | int prog_fd, map_fd, opt; | ||
119 | struct bpf_object *obj; | ||
120 | struct bpf_map *map; | ||
121 | char filename[256]; | ||
122 | int ret, err; | ||
123 | |||
124 | while ((opt = getopt(argc, argv, optstr)) != -1) { | ||
125 | switch (opt) { | ||
126 | case 'F': | ||
127 | xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
128 | break; | ||
129 | case 'S': | ||
130 | xdp_flags |= XDP_FLAGS_SKB_MODE; | ||
131 | break; | ||
132 | default: | ||
133 | usage(basename(argv[0])); | ||
134 | return 1; | ||
135 | } | ||
136 | } | ||
137 | |||
138 | if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) | ||
139 | xdp_flags |= XDP_FLAGS_DRV_MODE; | ||
140 | |||
141 | if (optind == argc) { | ||
142 | usage(basename(argv[0])); | ||
143 | return 1; | ||
144 | } | ||
145 | |||
146 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
147 | perror("setrlimit(RLIMIT_MEMLOCK)"); | ||
148 | return 1; | ||
149 | } | ||
150 | |||
151 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
152 | prog_load_attr.file = filename; | ||
153 | |||
154 | if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) | ||
155 | return 1; | ||
156 | |||
157 | if (!prog_fd) { | ||
158 | printf("bpf_prog_load_xattr: %s\n", strerror(errno)); | ||
159 | return 1; | ||
160 | } | ||
161 | |||
162 | map = bpf_map__next(NULL, obj); | ||
163 | if (!map) { | ||
164 | printf("finding a map in obj file failed\n"); | ||
165 | return 1; | ||
166 | } | ||
167 | map_fd = bpf_map__fd(map); | ||
168 | |||
169 | if_idx = if_nametoindex(argv[optind]); | ||
170 | if (!if_idx) | ||
171 | if_idx = strtoul(argv[optind], NULL, 0); | ||
172 | |||
173 | if (!if_idx) { | ||
174 | fprintf(stderr, "Invalid ifname\n"); | ||
175 | return 1; | ||
176 | } | ||
177 | if_name = argv[optind]; | ||
178 | err = do_attach(if_idx, prog_fd, if_name); | ||
179 | if (err) | ||
180 | return err; | ||
181 | |||
182 | if (signal(SIGINT, sig_handler) || | ||
183 | signal(SIGHUP, sig_handler) || | ||
184 | signal(SIGTERM, sig_handler)) { | ||
185 | perror("signal"); | ||
186 | return 1; | ||
187 | } | ||
188 | |||
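/* 8 pages per per-CPU ring buffer; print_bpf_output() is invoked for
 * every sample the kernel program emits.
 */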
189 | pb_opts.sample_cb = print_bpf_output; | ||
190 | pb = perf_buffer__new(map_fd, 8, &pb_opts); | ||
191 | err = libbpf_get_error(pb); | ||
192 | if (err) { | ||
193 | perror("perf_buffer setup failed"); | ||
194 | return 1; | ||
195 | } | ||
196 | |||
197 | while ((ret = perf_buffer__poll(pb, 1000)) >= 0) { | ||
198 | } | ||
199 | |||
200 | kill(0, SIGINT); | ||
201 | return ret; | ||
202 | } | ||
diff --git a/samples/bpf/xdp_tx_iptunnel_common.h b/samples/bpf/xdp_tx_iptunnel_common.h new file mode 100644 index 000000000..be839892c --- /dev/null +++ b/samples/bpf/xdp_tx_iptunnel_common.h | |||
@@ -0,0 +1,34 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0-only */ | ||
2 | /* Copyright (c) 2016 Facebook | ||
3 | */ | ||
4 | #ifndef _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H | ||
5 | #define _SAMPLES_BPF_XDP_TX_IPTNL_COMMON_H | ||
6 | |||
7 | #include <linux/types.h> | ||
8 | |||
9 | #define MAX_IPTNL_ENTRIES 256U | ||
10 | |||
11 | struct vip { | ||
12 | union { | ||
13 | __u32 v6[4]; | ||
14 | __u32 v4; | ||
15 | } daddr; | ||
16 | __u16 dport; | ||
17 | __u16 family; | ||
18 | __u8 protocol; | ||
19 | }; | ||
20 | |||
21 | struct iptnl_info { | ||
22 | union { | ||
23 | __u32 v6[4]; | ||
24 | __u32 v4; | ||
25 | } saddr; | ||
26 | union { | ||
27 | __u32 v6[4]; | ||
28 | __u32 v4; | ||
29 | } daddr; | ||
30 | __u16 family; | ||
31 | __u8 dmac[6]; | ||
32 | }; | ||
33 | |||
34 | #endif | ||
diff --git a/samples/bpf/xdp_tx_iptunnel_kern.c b/samples/bpf/xdp_tx_iptunnel_kern.c new file mode 100644 index 000000000..575d57e4b --- /dev/null +++ b/samples/bpf/xdp_tx_iptunnel_kern.c | |||
@@ -0,0 +1,237 @@ | |||
1 | /* Copyright (c) 2016 Facebook | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | * | ||
7 | * This program shows how to use bpf_xdp_adjust_head() by | ||
8 | * encapsulating the incoming packet in an IPv4/v6 header | ||
9 | * and then XDP_TX it out. | ||
10 | */ | ||
11 | #define KBUILD_MODNAME "foo" | ||
12 | #include <uapi/linux/bpf.h> | ||
13 | #include <linux/in.h> | ||
14 | #include <linux/if_ether.h> | ||
15 | #include <linux/if_packet.h> | ||
16 | #include <linux/if_vlan.h> | ||
17 | #include <linux/ip.h> | ||
18 | #include <linux/ipv6.h> | ||
19 | #include <bpf/bpf_helpers.h> | ||
20 | #include "xdp_tx_iptunnel_common.h" | ||
21 | |||
22 | struct { | ||
23 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); | ||
24 | __type(key, __u32); | ||
25 | __type(value, __u64); | ||
26 | __uint(max_entries, 256); | ||
27 | } rxcnt SEC(".maps"); | ||
28 | |||
29 | struct { | ||
30 | __uint(type, BPF_MAP_TYPE_HASH); | ||
31 | __type(key, struct vip); | ||
32 | __type(value, struct iptnl_info); | ||
33 | __uint(max_entries, MAX_IPTNL_ENTRIES); | ||
34 | } vip2tnl SEC(".maps"); | ||
35 | |||
36 | static __always_inline void count_tx(u32 protocol) | ||
37 | { | ||
38 | u64 *rxcnt_count; | ||
39 | |||
40 | rxcnt_count = bpf_map_lookup_elem(&rxcnt, &protocol); | ||
41 | if (rxcnt_count) | ||
42 | *rxcnt_count += 1; | ||
43 | } | ||
44 | |||
45 | static __always_inline int get_dport(void *trans_data, void *data_end, | ||
46 | u8 protocol) | ||
47 | { | ||
48 | struct tcphdr *th; | ||
49 | struct udphdr *uh; | ||
50 | |||
51 | switch (protocol) { | ||
52 | case IPPROTO_TCP: | ||
53 | th = (struct tcphdr *)trans_data; | ||
54 | if (th + 1 > data_end) | ||
55 | return -1; | ||
56 | return th->dest; | ||
57 | case IPPROTO_UDP: | ||
58 | uh = (struct udphdr *)trans_data; | ||
59 | if (uh + 1 > data_end) | ||
60 | return -1; | ||
61 | return uh->dest; | ||
62 | default: | ||
63 | return 0; | ||
64 | } | ||
65 | } | ||
66 | |||
67 | static __always_inline void set_ethhdr(struct ethhdr *new_eth, | ||
68 | const struct ethhdr *old_eth, | ||
69 | const struct iptnl_info *tnl, | ||
70 | __be16 h_proto) | ||
71 | { | ||
72 | memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source)); | ||
73 | memcpy(new_eth->h_dest, tnl->dmac, sizeof(new_eth->h_dest)); | ||
74 | new_eth->h_proto = h_proto; | ||
75 | } | ||
76 | |||
77 | static __always_inline int handle_ipv4(struct xdp_md *xdp) | ||
78 | { | ||
79 | void *data_end = (void *)(long)xdp->data_end; | ||
80 | void *data = (void *)(long)xdp->data; | ||
81 | struct iptnl_info *tnl; | ||
82 | struct ethhdr *new_eth; | ||
83 | struct ethhdr *old_eth; | ||
84 | struct iphdr *iph = data + sizeof(struct ethhdr); | ||
85 | u16 *next_iph_u16; | ||
86 | u16 payload_len; | ||
87 | struct vip vip = {}; | ||
88 | int dport; | ||
89 | u32 csum = 0; | ||
90 | int i; | ||
91 | |||
92 | if (iph + 1 > data_end) | ||
93 | return XDP_DROP; | ||
94 | |||
95 | dport = get_dport(iph + 1, data_end, iph->protocol); | ||
96 | if (dport == -1) | ||
97 | return XDP_DROP; | ||
98 | |||
99 | vip.protocol = iph->protocol; | ||
100 | vip.family = AF_INET; | ||
101 | vip.daddr.v4 = iph->daddr; | ||
102 | vip.dport = dport; | ||
103 | payload_len = ntohs(iph->tot_len); | ||
104 | |||
105 | tnl = bpf_map_lookup_elem(&vip2tnl, &vip); | ||
106 | /* It only does v4-in-v4 */ | ||
107 | if (!tnl || tnl->family != AF_INET) | ||
108 | return XDP_PASS; | ||
109 | |||
110 | /* The vip key is found. Add an IP header and send it out */ | ||
111 | |||
112 | if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr))) | ||
113 | return XDP_DROP; | ||
114 | |||
115 | data = (void *)(long)xdp->data; | ||
116 | data_end = (void *)(long)xdp->data_end; | ||
117 | |||
118 | new_eth = data; | ||
119 | iph = data + sizeof(*new_eth); | ||
120 | old_eth = data + sizeof(*iph); | ||
121 | |||
122 | if (new_eth + 1 > data_end || | ||
123 | old_eth + 1 > data_end || | ||
124 | iph + 1 > data_end) | ||
125 | return XDP_DROP; | ||
126 | |||
127 | set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IP)); | ||
128 | |||
129 | iph->version = 4; | ||
130 | iph->ihl = sizeof(*iph) >> 2; | ||
131 | iph->frag_off = 0; | ||
132 | iph->protocol = IPPROTO_IPIP; | ||
133 | iph->check = 0; | ||
134 | iph->tos = 0; | ||
135 | iph->tot_len = htons(payload_len + sizeof(*iph)); | ||
136 | iph->daddr = tnl->daddr.v4; | ||
137 | iph->saddr = tnl->saddr.v4; | ||
138 | iph->ttl = 8; | ||
139 | |||
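/* Compute the RFC 1071 Internet checksum over the new IPv4 header:
 * sum the 16-bit words, fold the carry once and invert.
 */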
140 | next_iph_u16 = (u16 *)iph; | ||
141 | #pragma clang loop unroll(full) | ||
142 | for (i = 0; i < sizeof(*iph) >> 1; i++) | ||
143 | csum += *next_iph_u16++; | ||
144 | |||
145 | iph->check = ~((csum & 0xffff) + (csum >> 16)); | ||
146 | |||
147 | count_tx(vip.protocol); | ||
148 | |||
149 | return XDP_TX; | ||
150 | } | ||
151 | |||
152 | static __always_inline int handle_ipv6(struct xdp_md *xdp) | ||
153 | { | ||
154 | void *data_end = (void *)(long)xdp->data_end; | ||
155 | void *data = (void *)(long)xdp->data; | ||
156 | struct iptnl_info *tnl; | ||
157 | struct ethhdr *new_eth; | ||
158 | struct ethhdr *old_eth; | ||
159 | struct ipv6hdr *ip6h = data + sizeof(struct ethhdr); | ||
160 | __u16 payload_len; | ||
161 | struct vip vip = {}; | ||
162 | int dport; | ||
163 | |||
164 | if (ip6h + 1 > data_end) | ||
165 | return XDP_DROP; | ||
166 | |||
167 | dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr); | ||
168 | if (dport == -1) | ||
169 | return XDP_DROP; | ||
170 | |||
171 | vip.protocol = ip6h->nexthdr; | ||
172 | vip.family = AF_INET6; | ||
173 | memcpy(vip.daddr.v6, ip6h->daddr.s6_addr32, sizeof(vip.daddr)); | ||
174 | vip.dport = dport; | ||
175 | payload_len = ip6h->payload_len; | ||
176 | |||
177 | tnl = bpf_map_lookup_elem(&vip2tnl, &vip); | ||
178 | /* It only does v6-in-v6 */ | ||
179 | if (!tnl || tnl->family != AF_INET6) | ||
180 | return XDP_PASS; | ||
181 | |||
182 | /* The vip key is found. Add an IP header and send it out */ | ||
183 | |||
184 | if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr))) | ||
185 | return XDP_DROP; | ||
186 | |||
187 | data = (void *)(long)xdp->data; | ||
188 | data_end = (void *)(long)xdp->data_end; | ||
189 | |||
190 | new_eth = data; | ||
191 | ip6h = data + sizeof(*new_eth); | ||
192 | old_eth = data + sizeof(*ip6h); | ||
193 | |||
194 | if (new_eth + 1 > data_end || | ||
195 | old_eth + 1 > data_end || | ||
196 | ip6h + 1 > data_end) | ||
197 | return XDP_DROP; | ||
198 | |||
199 | set_ethhdr(new_eth, old_eth, tnl, htons(ETH_P_IPV6)); | ||
200 | |||
201 | ip6h->version = 6; | ||
202 | ip6h->priority = 0; | ||
203 | memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl)); | ||
204 | ip6h->payload_len = htons(ntohs(payload_len) + sizeof(*ip6h)); | ||
205 | ip6h->nexthdr = IPPROTO_IPV6; | ||
206 | ip6h->hop_limit = 8; | ||
207 | memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6)); | ||
208 | memcpy(ip6h->daddr.s6_addr32, tnl->daddr.v6, sizeof(tnl->daddr.v6)); | ||
209 | |||
210 | count_tx(vip.protocol); | ||
211 | |||
212 | return XDP_TX; | ||
213 | } | ||
214 | |||
215 | SEC("xdp_tx_iptunnel") | ||
216 | int _xdp_tx_iptunnel(struct xdp_md *xdp) | ||
217 | { | ||
218 | void *data_end = (void *)(long)xdp->data_end; | ||
219 | void *data = (void *)(long)xdp->data; | ||
220 | struct ethhdr *eth = data; | ||
221 | __u16 h_proto; | ||
222 | |||
223 | if (eth + 1 > data_end) | ||
224 | return XDP_DROP; | ||
225 | |||
226 | h_proto = eth->h_proto; | ||
227 | |||
228 | if (h_proto == htons(ETH_P_IP)) | ||
229 | return handle_ipv4(xdp); | ||
230 | else if (h_proto == htons(ETH_P_IPV6)) | ||
231 | |||
232 | return handle_ipv6(xdp); | ||
233 | else | ||
234 | return XDP_PASS; | ||
235 | } | ||
236 | |||
237 | char _license[] SEC("license") = "GPL"; | ||
diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c new file mode 100644 index 000000000..a419bee15 --- /dev/null +++ b/samples/bpf/xdp_tx_iptunnel_user.c | |||
@@ -0,0 +1,314 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* Copyright (c) 2016 Facebook | ||
3 | */ | ||
4 | #include <linux/bpf.h> | ||
5 | #include <linux/if_link.h> | ||
6 | #include <assert.h> | ||
7 | #include <errno.h> | ||
8 | #include <signal.h> | ||
9 | #include <stdio.h> | ||
10 | #include <stdlib.h> | ||
11 | #include <string.h> | ||
12 | #include <net/if.h> | ||
13 | #include <sys/resource.h> | ||
14 | #include <arpa/inet.h> | ||
15 | #include <netinet/ether.h> | ||
16 | #include <unistd.h> | ||
17 | #include <time.h> | ||
18 | #include <bpf/libbpf.h> | ||
19 | #include <bpf/bpf.h> | ||
20 | #include "bpf_util.h" | ||
21 | #include "xdp_tx_iptunnel_common.h" | ||
22 | |||
23 | #define STATS_INTERVAL_S 2U | ||
24 | |||
25 | static int ifindex = -1; | ||
26 | static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
27 | static int rxcnt_map_fd; | ||
28 | static __u32 prog_id; | ||
29 | |||
30 | static void int_exit(int sig) | ||
31 | { | ||
32 | __u32 curr_prog_id = 0; | ||
33 | |||
34 | if (ifindex > -1) { | ||
35 | if (bpf_get_link_xdp_id(ifindex, &curr_prog_id, xdp_flags)) { | ||
36 | printf("bpf_get_link_xdp_id failed\n"); | ||
37 | exit(1); | ||
38 | } | ||
39 | if (prog_id == curr_prog_id) | ||
40 | bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); | ||
41 | else if (!curr_prog_id) | ||
42 | printf("couldn't find a prog id on the given interface\n"); | ||
43 | else | ||
44 | printf("program on interface changed, not removing\n"); | ||
45 | } | ||
46 | exit(0); | ||
47 | } | ||
48 | |||
49 | /* simple per-protocol counter of packets encapsulated and TX'd | ||
50 | */ | ||
51 | static void poll_stats(unsigned int kill_after_s) | ||
52 | { | ||
53 | const unsigned int nr_protos = 256; | ||
54 | unsigned int nr_cpus = bpf_num_possible_cpus(); | ||
55 | time_t started_at = time(NULL); | ||
56 | __u64 values[nr_cpus], prev[nr_protos][nr_cpus]; | ||
57 | __u32 proto; | ||
58 | int i; | ||
59 | |||
60 | memset(prev, 0, sizeof(prev)); | ||
61 | |||
62 | while (!kill_after_s || time(NULL) - started_at <= kill_after_s) { | ||
63 | sleep(STATS_INTERVAL_S); | ||
64 | |||
65 | for (proto = 0; proto < nr_protos; proto++) { | ||
66 | __u64 sum = 0; | ||
67 | |||
68 | assert(bpf_map_lookup_elem(rxcnt_map_fd, &proto, | ||
69 | values) == 0); | ||
70 | for (i = 0; i < nr_cpus; i++) | ||
71 | sum += (values[i] - prev[proto][i]); | ||
72 | |||
73 | if (sum) | ||
74 | printf("proto %u: sum:%10llu pkts, rate:%10llu pkts/s\n", | ||
75 | proto, sum, sum / STATS_INTERVAL_S); | ||
76 | memcpy(prev[proto], values, sizeof(values)); | ||
77 | } | ||
78 | } | ||
79 | } | ||
80 | |||
81 | static void usage(const char *cmd) | ||
82 | { | ||
83 | printf("Start a XDP prog which encapsulates incoming packets\n" | ||
84 | "in an IPv4/v6 header and XDP_TX it out. The dst <VIP:PORT>\n" | ||
85 | "is used to select packets to encapsulate\n\n"); | ||
86 | printf("Usage: %s [...]\n", cmd); | ||
87 | printf(" -i <ifname|ifindex> Interface\n"); | ||
88 | printf(" -a <vip-service-address> IPv4 or IPv6\n"); | ||
89 | printf(" -p <vip-service-port> A port range (e.g. 433-444) is also allowed\n"); | ||
90 | printf(" -s <source-ip> Used in the IPTunnel header\n"); | ||
91 | printf(" -d <dest-ip> Used in the IPTunnel header\n"); | ||
92 | printf(" -m <dest-MAC> Used in sending the IP Tunneled pkt\n"); | ||
93 | printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n"); | ||
94 | printf(" -P <IP-Protocol> Default is TCP\n"); | ||
95 | printf(" -S use skb-mode\n"); | ||
96 | printf(" -N enforce native mode\n"); | ||
97 | printf(" -F Force loading the XDP prog\n"); | ||
98 | printf(" -h Display this help\n"); | ||
99 | } | ||
100 | |||
101 | static int parse_ipstr(const char *ipstr, unsigned int *addr) | ||
102 | { | ||
103 | if (inet_pton(AF_INET6, ipstr, addr) == 1) { | ||
104 | return AF_INET6; | ||
105 | } else if (inet_pton(AF_INET, ipstr, addr) == 1) { | ||
106 | addr[1] = addr[2] = addr[3] = 0; | ||
107 | return AF_INET; | ||
108 | } | ||
109 | |||
110 | fprintf(stderr, "%s is an invalid IP\n", ipstr); | ||
111 | return AF_UNSPEC; | ||
112 | } | ||
113 | |||
114 | static int parse_ports(const char *port_str, int *min_port, int *max_port) | ||
115 | { | ||
116 | char *end; | ||
117 | long tmp_min_port; | ||
118 | long tmp_max_port; | ||
119 | |||
120 | tmp_min_port = strtol(port_str, &end, 10); | ||
121 | if (tmp_min_port < 1 || tmp_min_port > 65535) { | ||
122 | fprintf(stderr, "Invalid port(s):%s\n", port_str); | ||
123 | return 1; | ||
124 | } | ||
125 | |||
126 | if (*end == '-') { | ||
127 | end++; | ||
128 | tmp_max_port = strtol(end, NULL, 10); | ||
129 | if (tmp_max_port < 1 || tmp_max_port > 65535) { | ||
130 | fprintf(stderr, "Invalid port(s):%s\n", port_str); | ||
131 | return 1; | ||
132 | } | ||
133 | } else { | ||
134 | tmp_max_port = tmp_min_port; | ||
135 | } | ||
136 | |||
137 | if (tmp_min_port > tmp_max_port) { | ||
138 | fprintf(stderr, "Invalid port(s):%s\n", port_str); | ||
139 | return 1; | ||
140 | } | ||
141 | |||
142 | if (tmp_max_port - tmp_min_port + 1 > MAX_IPTNL_ENTRIES) { | ||
143 | fprintf(stderr, "Port range (%s) is larger than %u\n", | ||
144 | port_str, MAX_IPTNL_ENTRIES); | ||
145 | return 1; | ||
146 | } | ||
147 | *min_port = tmp_min_port; | ||
148 | *max_port = tmp_max_port; | ||
149 | |||
150 | return 0; | ||
151 | } | ||
152 | |||
153 | int main(int argc, char **argv) | ||
154 | { | ||
155 | struct bpf_prog_load_attr prog_load_attr = { | ||
156 | .prog_type = BPF_PROG_TYPE_XDP, | ||
157 | }; | ||
158 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
159 | int min_port = 0, max_port = 0, vip2tnl_map_fd; | ||
160 | const char *optstr = "i:a:p:s:d:m:T:P:FSNh"; | ||
161 | unsigned char opt_flags[256] = {}; | ||
162 | struct bpf_prog_info info = {}; | ||
163 | __u32 info_len = sizeof(info); | ||
164 | unsigned int kill_after_s = 0; | ||
165 | struct iptnl_info tnl = {}; | ||
166 | struct bpf_object *obj; | ||
167 | struct vip vip = {}; | ||
168 | char filename[256]; | ||
169 | int opt, prog_fd; | ||
170 | int i, err; | ||
171 | |||
172 | tnl.family = AF_UNSPEC; | ||
173 | vip.protocol = IPPROTO_TCP; | ||
174 | |||
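/* Mark every lower-case option (except -h) as required; each flag is
 * cleared as its option is parsed, so anything still set afterwards
 * is a missing argument.
 */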
175 | for (i = 0; i < strlen(optstr); i++) | ||
176 | if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z') | ||
177 | opt_flags[(unsigned char)optstr[i]] = 1; | ||
178 | |||
179 | while ((opt = getopt(argc, argv, optstr)) != -1) { | ||
180 | unsigned short family; | ||
181 | unsigned int *v6; | ||
182 | |||
183 | switch (opt) { | ||
184 | case 'i': | ||
185 | ifindex = if_nametoindex(optarg); | ||
186 | if (!ifindex) | ||
187 | ifindex = atoi(optarg); | ||
188 | break; | ||
189 | case 'a': | ||
190 | vip.family = parse_ipstr(optarg, vip.daddr.v6); | ||
191 | if (vip.family == AF_UNSPEC) | ||
192 | return 1; | ||
193 | break; | ||
194 | case 'p': | ||
195 | if (parse_ports(optarg, &min_port, &max_port)) | ||
196 | return 1; | ||
197 | break; | ||
198 | case 'P': | ||
199 | vip.protocol = atoi(optarg); | ||
200 | break; | ||
201 | case 's': | ||
202 | case 'd': | ||
203 | if (opt == 's') | ||
204 | v6 = tnl.saddr.v6; | ||
205 | else | ||
206 | v6 = tnl.daddr.v6; | ||
207 | |||
208 | family = parse_ipstr(optarg, v6); | ||
209 | if (family == AF_UNSPEC) | ||
210 | return 1; | ||
211 | if (tnl.family == AF_UNSPEC) { | ||
212 | tnl.family = family; | ||
213 | } else if (tnl.family != family) { | ||
214 | fprintf(stderr, | ||
215 | "The IP version of the src and dst addresses used in the IP encapsulation does not match\n"); | ||
216 | return 1; | ||
217 | } | ||
218 | break; | ||
219 | case 'm': | ||
220 | if (!ether_aton_r(optarg, | ||
221 | (struct ether_addr *)tnl.dmac)) { | ||
222 | fprintf(stderr, "Invalid mac address:%s\n", | ||
223 | optarg); | ||
224 | return 1; | ||
225 | } | ||
226 | break; | ||
227 | case 'T': | ||
228 | kill_after_s = atoi(optarg); | ||
229 | break; | ||
230 | case 'S': | ||
231 | xdp_flags |= XDP_FLAGS_SKB_MODE; | ||
232 | break; | ||
233 | case 'N': | ||
234 | /* default, set below */ | ||
235 | break; | ||
236 | case 'F': | ||
237 | xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
238 | break; | ||
239 | default: | ||
240 | usage(argv[0]); | ||
241 | return 1; | ||
242 | } | ||
243 | opt_flags[opt] = 0; | ||
244 | } | ||
245 | |||
246 | if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) | ||
247 | xdp_flags |= XDP_FLAGS_DRV_MODE; | ||
248 | |||
249 | for (i = 0; i < strlen(optstr); i++) { | ||
250 | if (opt_flags[(unsigned int)optstr[i]]) { | ||
251 | fprintf(stderr, "Missing argument -%c\n", optstr[i]); | ||
252 | usage(argv[0]); | ||
253 | return 1; | ||
254 | } | ||
255 | } | ||
256 | |||
257 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
258 | perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); | ||
259 | return 1; | ||
260 | } | ||
261 | |||
262 | if (!ifindex) { | ||
263 | fprintf(stderr, "Invalid ifname\n"); | ||
264 | return 1; | ||
265 | } | ||
266 | |||
267 | snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); | ||
268 | prog_load_attr.file = filename; | ||
269 | |||
270 | if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) | ||
271 | return 1; | ||
272 | |||
273 | if (!prog_fd) { | ||
274 | printf("bpf_prog_load_xattr: %s\n", strerror(errno)); | ||
275 | return 1; | ||
276 | } | ||
277 | |||
278 | rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); | ||
279 | vip2tnl_map_fd = bpf_object__find_map_fd_by_name(obj, "vip2tnl"); | ||
280 | if (vip2tnl_map_fd < 0 || rxcnt_map_fd < 0) { | ||
281 | printf("bpf_object__find_map_fd_by_name failed\n"); | ||
282 | return 1; | ||
283 | } | ||
284 | |||
285 | signal(SIGINT, int_exit); | ||
286 | signal(SIGTERM, int_exit); | ||
287 | |||
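/* Install one vip2tnl entry per destination port in the requested
 * range; the XDP program only encapsulates packets that match a key.
 */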
288 | while (min_port <= max_port) { | ||
289 | vip.dport = htons(min_port++); | ||
290 | if (bpf_map_update_elem(vip2tnl_map_fd, &vip, &tnl, | ||
291 | BPF_NOEXIST)) { | ||
292 | perror("bpf_map_update_elem(&vip2tnl)"); | ||
293 | return 1; | ||
294 | } | ||
295 | } | ||
296 | |||
297 | if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) { | ||
298 | printf("link set xdp fd failed\n"); | ||
299 | return 1; | ||
300 | } | ||
301 | |||
302 | err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); | ||
303 | if (err) { | ||
304 | printf("can't get prog info - %s\n", strerror(errno)); | ||
305 | return err; | ||
306 | } | ||
307 | prog_id = info.id; | ||
308 | |||
309 | poll_stats(kill_after_s); | ||
310 | |||
311 | bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); | ||
312 | |||
313 | return 0; | ||
314 | } | ||
diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h new file mode 100644 index 000000000..b7eca15c7 --- /dev/null +++ b/samples/bpf/xdpsock.h | |||
@@ -0,0 +1,11 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 | ||
2 | * | ||
3 | * Copyright(c) 2019 Intel Corporation. | ||
4 | */ | ||
5 | |||
6 | #ifndef XDPSOCK_H_ | ||
7 | #define XDPSOCK_H_ | ||
8 | |||
9 | #define MAX_SOCKS 4 | ||
10 | |||
11 | #endif /* XDPSOCK_H_ */ | ||
diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c new file mode 100644 index 000000000..054304843 --- /dev/null +++ b/samples/bpf/xdpsock_kern.c | |||
@@ -0,0 +1,24 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <linux/bpf.h> | ||
3 | #include <bpf/bpf_helpers.h> | ||
4 | #include "xdpsock.h" | ||
5 | |||
6 | /* This XDP program is only needed for the XDP_SHARED_UMEM mode. | ||
7 | * If you do not use this mode, libbpf can supply an XDP program for you. | ||
8 | */ | ||
9 | |||
10 | struct { | ||
11 | __uint(type, BPF_MAP_TYPE_XSKMAP); | ||
12 | __uint(max_entries, MAX_SOCKS); | ||
13 | __uint(key_size, sizeof(int)); | ||
14 | __uint(value_size, sizeof(int)); | ||
15 | } xsks_map SEC(".maps"); | ||
16 | |||
17 | static unsigned int rr; | ||
18 | |||
19 | SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) | ||
20 | { | ||
21 | rr = (rr + 1) & (MAX_SOCKS - 1); | ||
22 | |||
23 | return bpf_redirect_map(&xsks_map, rr, XDP_DROP); | ||
24 | } | ||
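The per-packet round robin above only works because MAX_SOCKS is a power of two, which makes the mask rr & (MAX_SOCKS - 1) equivalent to rr % MAX_SOCKS. A standalone illustration of that equivalence (not part of the sample):

#include <assert.h>

#define MAX_SOCKS 4	/* must remain a power of two for the mask to work */

int main(void)
{
	unsigned int rr = 0, i;

	for (i = 0; i < 1000; i++) {
		rr = (rr + 1) & (MAX_SOCKS - 1);
		/* identical to modulo arithmetic while MAX_SOCKS is 2^n */
		assert(rr == (i + 1) % MAX_SOCKS);
	}
	return 0;
}

If MAX_SOCKS were ever changed to a value that is not a power of two, the mask would skip some map indices instead of cycling through all of them.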
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c new file mode 100644 index 000000000..cf5b0a895 --- /dev/null +++ b/samples/bpf/xdpsock_user.c | |||
@@ -0,0 +1,1550 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* Copyright(c) 2017 - 2018 Intel Corporation. */ | ||
3 | |||
4 | #include <asm/barrier.h> | ||
5 | #include <errno.h> | ||
6 | #include <getopt.h> | ||
7 | #include <libgen.h> | ||
8 | #include <linux/bpf.h> | ||
9 | #include <linux/compiler.h> | ||
10 | #include <linux/if_link.h> | ||
11 | #include <linux/if_xdp.h> | ||
12 | #include <linux/if_ether.h> | ||
13 | #include <linux/ip.h> | ||
14 | #include <linux/limits.h> | ||
15 | #include <linux/udp.h> | ||
16 | #include <arpa/inet.h> | ||
17 | #include <locale.h> | ||
18 | #include <net/ethernet.h> | ||
19 | #include <net/if.h> | ||
20 | #include <poll.h> | ||
21 | #include <pthread.h> | ||
22 | #include <signal.h> | ||
23 | #include <stdbool.h> | ||
24 | #include <stdio.h> | ||
25 | #include <stdlib.h> | ||
26 | #include <string.h> | ||
27 | #include <sys/mman.h> | ||
28 | #include <sys/resource.h> | ||
29 | #include <sys/socket.h> | ||
30 | #include <sys/types.h> | ||
31 | #include <time.h> | ||
32 | #include <unistd.h> | ||
33 | |||
34 | #include <bpf/libbpf.h> | ||
35 | #include <bpf/xsk.h> | ||
36 | #include <bpf/bpf.h> | ||
37 | #include "xdpsock.h" | ||
38 | |||
39 | #ifndef SOL_XDP | ||
40 | #define SOL_XDP 283 | ||
41 | #endif | ||
42 | |||
43 | #ifndef AF_XDP | ||
44 | #define AF_XDP 44 | ||
45 | #endif | ||
46 | |||
47 | #ifndef PF_XDP | ||
48 | #define PF_XDP AF_XDP | ||
49 | #endif | ||
50 | |||
51 | #define NUM_FRAMES (4 * 1024) | ||
52 | #define MIN_PKT_SIZE 64 | ||
53 | |||
54 | #define DEBUG_HEXDUMP 0 | ||
55 | |||
56 | typedef __u64 u64; | ||
57 | typedef __u32 u32; | ||
58 | typedef __u16 u16; | ||
59 | typedef __u8 u8; | ||
60 | |||
61 | static unsigned long prev_time; | ||
62 | |||
63 | enum benchmark_type { | ||
64 | BENCH_RXDROP = 0, | ||
65 | BENCH_TXONLY = 1, | ||
66 | BENCH_L2FWD = 2, | ||
67 | }; | ||
68 | |||
69 | static enum benchmark_type opt_bench = BENCH_RXDROP; | ||
70 | static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
71 | static const char *opt_if = ""; | ||
72 | static int opt_ifindex; | ||
73 | static int opt_queue; | ||
74 | static unsigned long opt_duration; | ||
75 | static unsigned long start_time; | ||
76 | static bool benchmark_done; | ||
77 | static u32 opt_batch_size = 64; | ||
78 | static int opt_pkt_count; | ||
79 | static u16 opt_pkt_size = MIN_PKT_SIZE; | ||
80 | static u32 opt_pkt_fill_pattern = 0x12345678; | ||
81 | static bool opt_extra_stats; | ||
82 | static bool opt_quiet; | ||
83 | static bool opt_app_stats; | ||
84 | static const char *opt_irq_str = ""; | ||
85 | static u32 irq_no; | ||
86 | static int irqs_at_init = -1; | ||
87 | static int opt_poll; | ||
88 | static int opt_interval = 1; | ||
89 | static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; | ||
90 | static u32 opt_umem_flags; | ||
91 | static int opt_unaligned_chunks; | ||
92 | static int opt_mmap_flags; | ||
93 | static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; | ||
94 | static int opt_timeout = 1000; | ||
95 | static bool opt_need_wakeup = true; | ||
96 | static u32 opt_num_xsks = 1; | ||
97 | static u32 prog_id; | ||
98 | |||
99 | struct xsk_ring_stats { | ||
100 | unsigned long rx_npkts; | ||
101 | unsigned long tx_npkts; | ||
102 | unsigned long rx_dropped_npkts; | ||
103 | unsigned long rx_invalid_npkts; | ||
104 | unsigned long tx_invalid_npkts; | ||
105 | unsigned long rx_full_npkts; | ||
106 | unsigned long rx_fill_empty_npkts; | ||
107 | unsigned long tx_empty_npkts; | ||
108 | unsigned long prev_rx_npkts; | ||
109 | unsigned long prev_tx_npkts; | ||
110 | unsigned long prev_rx_dropped_npkts; | ||
111 | unsigned long prev_rx_invalid_npkts; | ||
112 | unsigned long prev_tx_invalid_npkts; | ||
113 | unsigned long prev_rx_full_npkts; | ||
114 | unsigned long prev_rx_fill_empty_npkts; | ||
115 | unsigned long prev_tx_empty_npkts; | ||
116 | }; | ||
117 | |||
118 | struct xsk_driver_stats { | ||
119 | unsigned long intrs; | ||
120 | unsigned long prev_intrs; | ||
121 | }; | ||
122 | |||
123 | struct xsk_app_stats { | ||
124 | unsigned long rx_empty_polls; | ||
125 | unsigned long fill_fail_polls; | ||
126 | unsigned long copy_tx_sendtos; | ||
127 | unsigned long tx_wakeup_sendtos; | ||
128 | unsigned long opt_polls; | ||
129 | unsigned long prev_rx_empty_polls; | ||
130 | unsigned long prev_fill_fail_polls; | ||
131 | unsigned long prev_copy_tx_sendtos; | ||
132 | unsigned long prev_tx_wakeup_sendtos; | ||
133 | unsigned long prev_opt_polls; | ||
134 | }; | ||
135 | |||
136 | struct xsk_umem_info { | ||
137 | struct xsk_ring_prod fq; | ||
138 | struct xsk_ring_cons cq; | ||
139 | struct xsk_umem *umem; | ||
140 | void *buffer; | ||
141 | }; | ||
142 | |||
143 | struct xsk_socket_info { | ||
144 | struct xsk_ring_cons rx; | ||
145 | struct xsk_ring_prod tx; | ||
146 | struct xsk_umem_info *umem; | ||
147 | struct xsk_socket *xsk; | ||
148 | struct xsk_ring_stats ring_stats; | ||
149 | struct xsk_app_stats app_stats; | ||
150 | struct xsk_driver_stats drv_stats; | ||
151 | u32 outstanding_tx; | ||
152 | }; | ||
153 | |||
154 | static int num_socks; | ||
155 | struct xsk_socket_info *xsks[MAX_SOCKS]; | ||
156 | |||
157 | static unsigned long get_nsecs(void) | ||
158 | { | ||
159 | struct timespec ts; | ||
160 | |||
161 | clock_gettime(CLOCK_MONOTONIC, &ts); | ||
162 | return ts.tv_sec * 1000000000UL + ts.tv_nsec; | ||
163 | } | ||
164 | |||
165 | static void print_benchmark(bool running) | ||
166 | { | ||
167 | const char *bench_str = "INVALID"; | ||
168 | |||
169 | if (opt_bench == BENCH_RXDROP) | ||
170 | bench_str = "rxdrop"; | ||
171 | else if (opt_bench == BENCH_TXONLY) | ||
172 | bench_str = "txonly"; | ||
173 | else if (opt_bench == BENCH_L2FWD) | ||
174 | bench_str = "l2fwd"; | ||
175 | |||
176 | printf("%s:%d %s ", opt_if, opt_queue, bench_str); | ||
177 | if (opt_xdp_flags & XDP_FLAGS_SKB_MODE) | ||
178 | printf("xdp-skb "); | ||
179 | else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE) | ||
180 | printf("xdp-drv "); | ||
181 | else | ||
182 | printf(" "); | ||
183 | |||
184 | if (opt_poll) | ||
185 | printf("poll() "); | ||
186 | |||
187 | if (running) { | ||
188 | printf("running..."); | ||
189 | fflush(stdout); | ||
190 | } | ||
191 | } | ||
192 | |||
193 | static int xsk_get_xdp_stats(int fd, struct xsk_socket_info *xsk) | ||
194 | { | ||
195 | struct xdp_statistics stats; | ||
196 | socklen_t optlen; | ||
197 | int err; | ||
198 | |||
199 | optlen = sizeof(stats); | ||
200 | err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen); | ||
201 | if (err) | ||
202 | return err; | ||
203 | |||
204 | if (optlen == sizeof(struct xdp_statistics)) { | ||
205 | xsk->ring_stats.rx_dropped_npkts = stats.rx_dropped; | ||
206 | xsk->ring_stats.rx_invalid_npkts = stats.rx_invalid_descs; | ||
207 | xsk->ring_stats.tx_invalid_npkts = stats.tx_invalid_descs; | ||
208 | xsk->ring_stats.rx_full_npkts = stats.rx_ring_full; | ||
209 | xsk->ring_stats.rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs; | ||
210 | xsk->ring_stats.tx_empty_npkts = stats.tx_ring_empty_descs; | ||
211 | return 0; | ||
212 | } | ||
213 | |||
214 | return -EINVAL; | ||
215 | } | ||
216 | |||
217 | static void dump_app_stats(long dt) | ||
218 | { | ||
219 | int i; | ||
220 | |||
221 | for (i = 0; i < num_socks && xsks[i]; i++) { | ||
222 | char *fmt = "%-18s %'-14.0f %'-14lu\n"; | ||
223 | double rx_empty_polls_ps, fill_fail_polls_ps, copy_tx_sendtos_ps, | ||
224 | tx_wakeup_sendtos_ps, opt_polls_ps; | ||
225 | |||
226 | rx_empty_polls_ps = (xsks[i]->app_stats.rx_empty_polls - | ||
227 | xsks[i]->app_stats.prev_rx_empty_polls) * 1000000000. / dt; | ||
228 | fill_fail_polls_ps = (xsks[i]->app_stats.fill_fail_polls - | ||
229 | xsks[i]->app_stats.prev_fill_fail_polls) * 1000000000. / dt; | ||
230 | copy_tx_sendtos_ps = (xsks[i]->app_stats.copy_tx_sendtos - | ||
231 | xsks[i]->app_stats.prev_copy_tx_sendtos) * 1000000000. / dt; | ||
232 | tx_wakeup_sendtos_ps = (xsks[i]->app_stats.tx_wakeup_sendtos - | ||
233 | xsks[i]->app_stats.prev_tx_wakeup_sendtos) | ||
234 | * 1000000000. / dt; | ||
235 | opt_polls_ps = (xsks[i]->app_stats.opt_polls - | ||
236 | xsks[i]->app_stats.prev_opt_polls) * 1000000000. / dt; | ||
237 | |||
238 | printf("\n%-18s %-14s %-14s\n", "", "calls/s", "count"); | ||
239 | printf(fmt, "rx empty polls", rx_empty_polls_ps, xsks[i]->app_stats.rx_empty_polls); | ||
240 | printf(fmt, "fill fail polls", fill_fail_polls_ps, | ||
241 | xsks[i]->app_stats.fill_fail_polls); | ||
242 | printf(fmt, "copy tx sendtos", copy_tx_sendtos_ps, | ||
243 | xsks[i]->app_stats.copy_tx_sendtos); | ||
244 | printf(fmt, "tx wakeup sendtos", tx_wakeup_sendtos_ps, | ||
245 | xsks[i]->app_stats.tx_wakeup_sendtos); | ||
246 | printf(fmt, "opt polls", opt_polls_ps, xsks[i]->app_stats.opt_polls); | ||
247 | |||
248 | xsks[i]->app_stats.prev_rx_empty_polls = xsks[i]->app_stats.rx_empty_polls; | ||
249 | xsks[i]->app_stats.prev_fill_fail_polls = xsks[i]->app_stats.fill_fail_polls; | ||
250 | xsks[i]->app_stats.prev_copy_tx_sendtos = xsks[i]->app_stats.copy_tx_sendtos; | ||
251 | xsks[i]->app_stats.prev_tx_wakeup_sendtos = xsks[i]->app_stats.tx_wakeup_sendtos; | ||
252 | xsks[i]->app_stats.prev_opt_polls = xsks[i]->app_stats.opt_polls; | ||
253 | } | ||
254 | } | ||
255 | |||
256 | static bool get_interrupt_number(void) | ||
257 | { | ||
258 | FILE *f_int_proc; | ||
259 | char line[4096]; | ||
260 | bool found = false; | ||
261 | |||
262 | f_int_proc = fopen("/proc/interrupts", "r"); | ||
263 | if (f_int_proc == NULL) { | ||
264 | printf("Failed to open /proc/interrupts.\n"); | ||
265 | return found; | ||
266 | } | ||
267 | |||
268 | while (!feof(f_int_proc) && !found) { | ||
269 | /* Make sure to read a full line at a time */ | ||
270 | if (fgets(line, sizeof(line), f_int_proc) == NULL || | ||
271 | line[strlen(line) - 1] != '\n') { | ||
272 | printf("Error reading from interrupts file\n"); | ||
273 | break; | ||
274 | } | ||
275 | |||
276 | /* Extract interrupt number from line */ | ||
277 | if (strstr(line, opt_irq_str) != NULL) { | ||
278 | irq_no = atoi(line); | ||
279 | found = true; | ||
280 | break; | ||
281 | } | ||
282 | } | ||
283 | |||
284 | fclose(f_int_proc); | ||
285 | |||
286 | return found; | ||
287 | } | ||
288 | |||
289 | static int get_irqs(void) | ||
290 | { | ||
291 | char count_path[PATH_MAX]; | ||
292 | int total_intrs = -1; | ||
293 | FILE *f_count_proc; | ||
294 | char line[4096]; | ||
295 | |||
296 | snprintf(count_path, sizeof(count_path), | ||
297 | "/sys/kernel/irq/%i/per_cpu_count", irq_no); | ||
298 | f_count_proc = fopen(count_path, "r"); | ||
299 | if (f_count_proc == NULL) { | ||
300 | printf("Failed to open %s\n", count_path); | ||
301 | return total_intrs; | ||
302 | } | ||
303 | |||
304 | if (fgets(line, sizeof(line), f_count_proc) == NULL || | ||
305 | line[strlen(line) - 1] != '\n') { | ||
306 | printf("Error reading from %s\n", count_path); | ||
307 | } else { | ||
308 | static const char com[2] = ","; | ||
309 | char *token; | ||
310 | |||
311 | total_intrs = 0; | ||
312 | token = strtok(line, com); | ||
313 | while (token != NULL) { | ||
314 | /* sum up interrupts across all cores */ | ||
315 | total_intrs += atoi(token); | ||
316 | token = strtok(NULL, com); | ||
317 | } | ||
318 | } | ||
319 | |||
320 | fclose(f_count_proc); | ||
321 | |||
322 | return total_intrs; | ||
323 | } | ||
324 | |||
325 | static void dump_driver_stats(long dt) | ||
326 | { | ||
327 | int i; | ||
328 | |||
329 | for (i = 0; i < num_socks && xsks[i]; i++) { | ||
330 | char *fmt = "%-18s %'-14.0f %'-14lu\n"; | ||
331 | double intrs_ps; | ||
332 | int n_ints = get_irqs(); | ||
333 | |||
334 | if (n_ints < 0) { | ||
335 | printf("error getting intr info for intr %i\n", irq_no); | ||
336 | return; | ||
337 | } | ||
338 | xsks[i]->drv_stats.intrs = n_ints - irqs_at_init; | ||
339 | |||
340 | intrs_ps = (xsks[i]->drv_stats.intrs - xsks[i]->drv_stats.prev_intrs) * | ||
341 | 1000000000. / dt; | ||
342 | |||
343 | printf("\n%-18s %-14s %-14s\n", "", "intrs/s", "count"); | ||
344 | printf(fmt, "irqs", intrs_ps, xsks[i]->drv_stats.intrs); | ||
345 | |||
346 | xsks[i]->drv_stats.prev_intrs = xsks[i]->drv_stats.intrs; | ||
347 | } | ||
348 | } | ||
349 | |||
350 | static void dump_stats(void) | ||
351 | { | ||
352 | unsigned long now = get_nsecs(); | ||
353 | long dt = now - prev_time; | ||
354 | int i; | ||
355 | |||
356 | prev_time = now; | ||
357 | |||
358 | for (i = 0; i < num_socks && xsks[i]; i++) { | ||
359 | char *fmt = "%-18s %'-14.0f %'-14lu\n"; | ||
360 | double rx_pps, tx_pps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps, | ||
361 | tx_invalid_pps, tx_empty_pps; | ||
362 | |||
363 | rx_pps = (xsks[i]->ring_stats.rx_npkts - xsks[i]->ring_stats.prev_rx_npkts) * | ||
364 | 1000000000. / dt; | ||
365 | tx_pps = (xsks[i]->ring_stats.tx_npkts - xsks[i]->ring_stats.prev_tx_npkts) * | ||
366 | 1000000000. / dt; | ||
367 | |||
368 | printf("\n sock%d@", i); | ||
369 | print_benchmark(false); | ||
370 | printf("\n"); | ||
371 | |||
372 | printf("%-18s %-14s %-14s %-14.2f\n", "", "pps", "pkts", | ||
373 | dt / 1000000000.); | ||
374 | printf(fmt, "rx", rx_pps, xsks[i]->ring_stats.rx_npkts); | ||
375 | printf(fmt, "tx", tx_pps, xsks[i]->ring_stats.tx_npkts); | ||
376 | |||
377 | xsks[i]->ring_stats.prev_rx_npkts = xsks[i]->ring_stats.rx_npkts; | ||
378 | xsks[i]->ring_stats.prev_tx_npkts = xsks[i]->ring_stats.tx_npkts; | ||
379 | |||
380 | if (opt_extra_stats) { | ||
381 | if (!xsk_get_xdp_stats(xsk_socket__fd(xsks[i]->xsk), xsks[i])) { | ||
382 | dropped_pps = (xsks[i]->ring_stats.rx_dropped_npkts - | ||
383 | xsks[i]->ring_stats.prev_rx_dropped_npkts) * | ||
384 | 1000000000. / dt; | ||
385 | rx_invalid_pps = (xsks[i]->ring_stats.rx_invalid_npkts - | ||
386 | xsks[i]->ring_stats.prev_rx_invalid_npkts) * | ||
387 | 1000000000. / dt; | ||
388 | tx_invalid_pps = (xsks[i]->ring_stats.tx_invalid_npkts - | ||
389 | xsks[i]->ring_stats.prev_tx_invalid_npkts) * | ||
390 | 1000000000. / dt; | ||
391 | full_pps = (xsks[i]->ring_stats.rx_full_npkts - | ||
392 | xsks[i]->ring_stats.prev_rx_full_npkts) * | ||
393 | 1000000000. / dt; | ||
394 | fill_empty_pps = (xsks[i]->ring_stats.rx_fill_empty_npkts - | ||
395 | xsks[i]->ring_stats.prev_rx_fill_empty_npkts) * | ||
396 | 1000000000. / dt; | ||
397 | tx_empty_pps = (xsks[i]->ring_stats.tx_empty_npkts - | ||
398 | xsks[i]->ring_stats.prev_tx_empty_npkts) * | ||
399 | 1000000000. / dt; | ||
400 | |||
401 | printf(fmt, "rx dropped", dropped_pps, | ||
402 | xsks[i]->ring_stats.rx_dropped_npkts); | ||
403 | printf(fmt, "rx invalid", rx_invalid_pps, | ||
404 | xsks[i]->ring_stats.rx_invalid_npkts); | ||
405 | printf(fmt, "tx invalid", tx_invalid_pps, | ||
406 | xsks[i]->ring_stats.tx_invalid_npkts); | ||
407 | printf(fmt, "rx queue full", full_pps, | ||
408 | xsks[i]->ring_stats.rx_full_npkts); | ||
409 | printf(fmt, "fill ring empty", fill_empty_pps, | ||
410 | xsks[i]->ring_stats.rx_fill_empty_npkts); | ||
411 | printf(fmt, "tx ring empty", tx_empty_pps, | ||
412 | xsks[i]->ring_stats.tx_empty_npkts); | ||
413 | |||
414 | xsks[i]->ring_stats.prev_rx_dropped_npkts = | ||
415 | xsks[i]->ring_stats.rx_dropped_npkts; | ||
416 | xsks[i]->ring_stats.prev_rx_invalid_npkts = | ||
417 | xsks[i]->ring_stats.rx_invalid_npkts; | ||
418 | xsks[i]->ring_stats.prev_tx_invalid_npkts = | ||
419 | xsks[i]->ring_stats.tx_invalid_npkts; | ||
420 | xsks[i]->ring_stats.prev_rx_full_npkts = | ||
421 | xsks[i]->ring_stats.rx_full_npkts; | ||
422 | xsks[i]->ring_stats.prev_rx_fill_empty_npkts = | ||
423 | xsks[i]->ring_stats.rx_fill_empty_npkts; | ||
424 | xsks[i]->ring_stats.prev_tx_empty_npkts = | ||
425 | xsks[i]->ring_stats.tx_empty_npkts; | ||
426 | } else { | ||
427 | printf("%-15s\n", "Error retrieving extra stats"); | ||
428 | } | ||
429 | } | ||
430 | } | ||
431 | |||
432 | if (opt_app_stats) | ||
433 | dump_app_stats(dt); | ||
434 | if (irq_no) | ||
435 | dump_driver_stats(dt); | ||
436 | } | ||
437 | |||
438 | static bool is_benchmark_done(void) | ||
439 | { | ||
440 | if (opt_duration > 0) { | ||
441 | unsigned long dt = (get_nsecs() - start_time); | ||
442 | |||
443 | if (dt >= opt_duration) | ||
444 | benchmark_done = true; | ||
445 | } | ||
446 | return benchmark_done; | ||
447 | } | ||
448 | |||
449 | static void *poller(void *arg) | ||
450 | { | ||
451 | (void)arg; | ||
452 | while (!is_benchmark_done()) { | ||
453 | sleep(opt_interval); | ||
454 | dump_stats(); | ||
455 | } | ||
456 | |||
457 | return NULL; | ||
458 | } | ||
459 | |||
460 | static void remove_xdp_program(void) | ||
461 | { | ||
462 | u32 curr_prog_id = 0; | ||
463 | |||
464 | if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) { | ||
465 | printf("bpf_get_link_xdp_id failed\n"); | ||
466 | exit(EXIT_FAILURE); | ||
467 | } | ||
468 | if (prog_id == curr_prog_id) | ||
469 | bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags); | ||
470 | else if (!curr_prog_id) | ||
471 | printf("couldn't find a prog id on a given interface\n"); | ||
472 | else | ||
473 | printf("program on interface changed, not removing\n"); | ||
474 | } | ||
475 | |||
476 | static void int_exit(int sig) | ||
477 | { | ||
478 | benchmark_done = true; | ||
479 | } | ||
480 | |||
481 | static void xdpsock_cleanup(void) | ||
482 | { | ||
483 | struct xsk_umem *umem = xsks[0]->umem->umem; | ||
484 | int i; | ||
485 | |||
486 | dump_stats(); | ||
487 | for (i = 0; i < num_socks; i++) | ||
488 | xsk_socket__delete(xsks[i]->xsk); | ||
489 | (void)xsk_umem__delete(umem); | ||
490 | remove_xdp_program(); | ||
491 | } | ||
492 | |||
493 | static void __exit_with_error(int error, const char *file, const char *func, | ||
494 | int line) | ||
495 | { | ||
496 | fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func, | ||
497 | line, error, strerror(error)); | ||
498 | dump_stats(); | ||
499 | remove_xdp_program(); | ||
500 | exit(EXIT_FAILURE); | ||
501 | } | ||
502 | |||
503 | #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \ | ||
504 | __LINE__) | ||
505 | static void swap_mac_addresses(void *data) | ||
506 | { | ||
507 | struct ether_header *eth = (struct ether_header *)data; | ||
508 | struct ether_addr *src_addr = (struct ether_addr *)ð->ether_shost; | ||
509 | struct ether_addr *dst_addr = (struct ether_addr *)ð->ether_dhost; | ||
510 | struct ether_addr tmp; | ||
511 | |||
512 | tmp = *src_addr; | ||
513 | *src_addr = *dst_addr; | ||
514 | *dst_addr = tmp; | ||
515 | } | ||
516 | |||
517 | static void hex_dump(void *pkt, size_t length, u64 addr) | ||
518 | { | ||
519 | const unsigned char *address = (unsigned char *)pkt; | ||
520 | const unsigned char *line = address; | ||
521 | size_t line_size = 32; | ||
522 | unsigned char c; | ||
523 | char buf[32]; | ||
524 | int i = 0; | ||
525 | |||
526 | if (!DEBUG_HEXDUMP) | ||
527 | return; | ||
528 | |||
529 | sprintf(buf, "addr=%llu", addr); | ||
530 | printf("length = %zu\n", length); | ||
531 | printf("%s | ", buf); | ||
532 | while (length-- > 0) { | ||
533 | printf("%02X ", *address++); | ||
534 | if (!(++i % line_size) || (length == 0 && i % line_size)) { | ||
535 | if (length == 0) { | ||
536 | while (i++ % line_size) | ||
537 | printf("__ "); | ||
538 | } | ||
539 | printf(" | "); /* right close */ | ||
540 | while (line < address) { | ||
541 | c = *line++; | ||
542 | printf("%c", (c < 33 || c == 255) ? 0x2E : c); | ||
543 | } | ||
544 | printf("\n"); | ||
545 | if (length > 0) | ||
546 | printf("%s | ", buf); | ||
547 | } | ||
548 | } | ||
549 | printf("\n"); | ||
550 | } | ||
551 | |||
552 | static void *memset32_htonl(void *dest, u32 val, u32 size) | ||
553 | { | ||
554 | u32 *ptr = (u32 *)dest; | ||
555 | int i; | ||
556 | |||
557 | val = htonl(val); | ||
558 | |||
559 | for (i = 0; i < (size & (~0x3)); i += 4) | ||
560 | ptr[i >> 2] = val; | ||
561 | |||
562 | for (; i < size; i++) | ||
563 | ((char *)dest)[i] = ((char *)&val)[i & 3]; | ||
564 | |||
565 | return dest; | ||
566 | } | ||
567 | |||
568 | /* | ||
569 | * This function code has been taken from | ||
570 | * Linux kernel lib/checksum.c | ||
571 | */ | ||
572 | static inline unsigned short from32to16(unsigned int x) | ||
573 | { | ||
574 | /* add up 16-bit and 16-bit for 16+c bit */ | ||
575 | x = (x & 0xffff) + (x >> 16); | ||
576 | /* add up carry.. */ | ||
577 | x = (x & 0xffff) + (x >> 16); | ||
578 | return x; | ||
579 | } | ||
580 | |||
581 | /* | ||
582 | * This function code has been taken from | ||
583 | * Linux kernel lib/checksum.c | ||
584 | */ | ||
585 | static unsigned int do_csum(const unsigned char *buff, int len) | ||
586 | { | ||
587 | unsigned int result = 0; | ||
588 | int odd; | ||
589 | |||
590 | if (len <= 0) | ||
591 | goto out; | ||
592 | odd = 1 & (unsigned long)buff; | ||
593 | if (odd) { | ||
594 | #ifdef __LITTLE_ENDIAN | ||
595 | result += (*buff << 8); | ||
596 | #else | ||
597 | result = *buff; | ||
598 | #endif | ||
599 | len--; | ||
600 | buff++; | ||
601 | } | ||
602 | if (len >= 2) { | ||
603 | if (2 & (unsigned long)buff) { | ||
604 | result += *(unsigned short *)buff; | ||
605 | len -= 2; | ||
606 | buff += 2; | ||
607 | } | ||
608 | if (len >= 4) { | ||
609 | const unsigned char *end = buff + | ||
610 | ((unsigned int)len & ~3); | ||
611 | unsigned int carry = 0; | ||
612 | |||
613 | do { | ||
614 | unsigned int w = *(unsigned int *)buff; | ||
615 | |||
616 | buff += 4; | ||
617 | result += carry; | ||
618 | result += w; | ||
619 | carry = (w > result); | ||
620 | } while (buff < end); | ||
621 | result += carry; | ||
622 | result = (result & 0xffff) + (result >> 16); | ||
623 | } | ||
624 | if (len & 2) { | ||
625 | result += *(unsigned short *)buff; | ||
626 | buff += 2; | ||
627 | } | ||
628 | } | ||
629 | if (len & 1) | ||
630 | #ifdef __LITTLE_ENDIAN | ||
631 | result += *buff; | ||
632 | #else | ||
633 | result += (*buff << 8); | ||
634 | #endif | ||
635 | result = from32to16(result); | ||
636 | if (odd) | ||
637 | result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); | ||
638 | out: | ||
639 | return result; | ||
640 | } | ||
641 | |||
642 | __sum16 ip_fast_csum(const void *iph, unsigned int ihl); | ||
643 | |||
644 | /* | ||
645 | * This is a version of ip_compute_csum() optimized for IP headers, | ||
646 | * which always checksum on 4 octet boundaries. | ||
647 | * This function code has been taken from | ||
648 | * Linux kernel lib/checksum.c | ||
649 | */ | ||
650 | __sum16 ip_fast_csum(const void *iph, unsigned int ihl) | ||
651 | { | ||
652 | return (__force __sum16)~do_csum(iph, ihl * 4); | ||
653 | } | ||
654 | |||
655 | /* | ||
656 | * Fold a partial checksum | ||
657 | * This function code has been taken from | ||
658 | * Linux kernel include/asm-generic/checksum.h | ||
659 | */ | ||
660 | static inline __sum16 csum_fold(__wsum csum) | ||
661 | { | ||
662 | u32 sum = (__force u32)csum; | ||
663 | |||
664 | sum = (sum & 0xffff) + (sum >> 16); | ||
665 | sum = (sum & 0xffff) + (sum >> 16); | ||
666 | return (__force __sum16)~sum; | ||
667 | } | ||
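As a quick numeric check of the fold above (a standalone sketch, independent of the kernel-derived helpers): 0x00012345 folds to 0x2346, and its ones' complement is 0xdcb9.

#include <assert.h>
#include <stdint.h>

/* Mirrors the folding logic of csum_fold() above on plain C types. */
static uint16_t fold(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);	/* 0x00012345 -> 0x2346 */
	sum = (sum & 0xffff) + (sum >> 16);	/* carry already absorbed */
	return (uint16_t)~sum;			/* ~0x2346 -> 0xdcb9 */
}

int main(void)
{
	assert(fold(0x00012345) == 0xdcb9);
	return 0;
}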
668 | |||
669 | /* | ||
670 | * This function code has been taken from | ||
671 | * Linux kernel lib/checksum.c | ||
672 | */ | ||
673 | static inline u32 from64to32(u64 x) | ||
674 | { | ||
675 | /* add up 32-bit and 32-bit for 32+c bit */ | ||
676 | x = (x & 0xffffffff) + (x >> 32); | ||
677 | /* add up carry.. */ | ||
678 | x = (x & 0xffffffff) + (x >> 32); | ||
679 | return (u32)x; | ||
680 | } | ||
681 | |||
682 | __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, | ||
683 | __u32 len, __u8 proto, __wsum sum); | ||
684 | |||
685 | /* | ||
686 | * This function code has been taken from | ||
687 | * Linux kernel lib/checksum.c | ||
688 | */ | ||
689 | __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, | ||
690 | __u32 len, __u8 proto, __wsum sum) | ||
691 | { | ||
692 | unsigned long long s = (__force u32)sum; | ||
693 | |||
694 | s += (__force u32)saddr; | ||
695 | s += (__force u32)daddr; | ||
696 | #ifdef __BIG_ENDIAN__ | ||
697 | s += proto + len; | ||
698 | #else | ||
699 | s += (proto + len) << 8; | ||
700 | #endif | ||
701 | return (__force __wsum)from64to32(s); | ||
702 | } | ||
703 | |||
704 | /* | ||
705 | * This function has been taken from | ||
706 | * Linux kernel include/asm-generic/checksum.h | ||
707 | */ | ||
708 | static inline __sum16 | ||
709 | csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, | ||
710 | __u8 proto, __wsum sum) | ||
711 | { | ||
712 | return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); | ||
713 | } | ||
714 | |||
715 | static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, | ||
716 | u8 proto, u16 *udp_pkt) | ||
717 | { | ||
718 | u32 csum = 0; | ||
719 | u32 cnt = 0; | ||
720 | |||
721 | /* udp hdr and data */ | ||
722 | for (; cnt < len; cnt += 2) | ||
723 | csum += udp_pkt[cnt >> 1]; | ||
724 | |||
725 | return csum_tcpudp_magic(saddr, daddr, len, proto, csum); | ||
726 | } | ||
727 | |||
728 | #define ETH_FCS_SIZE 4 | ||
729 | |||
730 | #define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ | ||
731 | sizeof(struct udphdr)) | ||
732 | |||
733 | #define PKT_SIZE (opt_pkt_size - ETH_FCS_SIZE) | ||
734 | #define IP_PKT_SIZE (PKT_SIZE - sizeof(struct ethhdr)) | ||
735 | #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) | ||
736 | #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) | ||
737 | |||
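A worked example of these size macros for the minimum packet size of 64 bytes (standalone sketch; 14, 20 and 8 are the usual sizes of struct ethhdr, struct iphdr and struct udphdr):

#include <assert.h>

int main(void)
{
	const int opt_pkt_size = 64;			/* MIN_PKT_SIZE */
	const int pkt_size = opt_pkt_size - 4;		/* minus ETH_FCS_SIZE -> 60 */
	const int ip_pkt_size = pkt_size - 14;		/* minus ethhdr       -> 46 */
	const int udp_pkt_size = ip_pkt_size - 20;	/* minus iphdr        -> 26 */
	const int udp_pkt_data_size = udp_pkt_size - 8;	/* minus udphdr       -> 18 */

	assert(pkt_size == 60 && ip_pkt_size == 46);
	assert(udp_pkt_size == 26 && udp_pkt_data_size == 18);
	return 0;
}

So a 64-byte txonly packet carries 18 bytes of UDP payload filled with opt_pkt_fill_pattern.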
738 | static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE]; | ||
739 | |||
740 | static void gen_eth_hdr_data(void) | ||
741 | { | ||
742 | struct udphdr *udp_hdr = (struct udphdr *)(pkt_data + | ||
743 | sizeof(struct ethhdr) + | ||
744 | sizeof(struct iphdr)); | ||
745 | struct iphdr *ip_hdr = (struct iphdr *)(pkt_data + | ||
746 | sizeof(struct ethhdr)); | ||
747 | struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data; | ||
748 | |||
749 | /* ethernet header */ | ||
750 | memcpy(eth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN); | ||
751 | memcpy(eth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN); | ||
752 | eth_hdr->h_proto = htons(ETH_P_IP); | ||
753 | |||
754 | /* IP header */ | ||
755 | ip_hdr->version = IPVERSION; | ||
756 | ip_hdr->ihl = 0x5; /* 20 byte header */ | ||
757 | ip_hdr->tos = 0x0; | ||
758 | ip_hdr->tot_len = htons(IP_PKT_SIZE); | ||
759 | ip_hdr->id = 0; | ||
760 | ip_hdr->frag_off = 0; | ||
761 | ip_hdr->ttl = IPDEFTTL; | ||
762 | ip_hdr->protocol = IPPROTO_UDP; | ||
763 | ip_hdr->saddr = htonl(0x0a0a0a10); | ||
764 | ip_hdr->daddr = htonl(0x0a0a0a20); | ||
765 | |||
766 | /* IP header checksum */ | ||
767 | ip_hdr->check = 0; | ||
768 | ip_hdr->check = ip_fast_csum((const void *)ip_hdr, ip_hdr->ihl); | ||
769 | |||
770 | /* UDP header */ | ||
771 | udp_hdr->source = htons(0x1000); | ||
772 | udp_hdr->dest = htons(0x1000); | ||
773 | udp_hdr->len = htons(UDP_PKT_SIZE); | ||
774 | |||
775 | /* UDP data */ | ||
776 | memset32_htonl(pkt_data + PKT_HDR_SIZE, opt_pkt_fill_pattern, | ||
777 | UDP_PKT_DATA_SIZE); | ||
778 | |||
779 | /* UDP header checksum */ | ||
780 | udp_hdr->check = 0; | ||
781 | udp_hdr->check = udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE, | ||
782 | IPPROTO_UDP, (u16 *)udp_hdr); | ||
783 | } | ||
784 | |||
785 | static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) | ||
786 | { | ||
787 | memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, | ||
788 | PKT_SIZE); | ||
789 | } | ||
790 | |||
791 | static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) | ||
792 | { | ||
793 | struct xsk_umem_info *umem; | ||
794 | struct xsk_umem_config cfg = { | ||
795 | /* We recommend that you set the fill ring size >= HW RX ring size + | ||
796 | * AF_XDP RX ring size. Make sure you fill up the fill ring | ||
797 | * with buffers at regular intervals, and with this setting you will | ||
798 | * avoid allocation failures in the driver. These are usually quite | ||
799 | * expensive since drivers have not been written to assume that | ||
800 | * allocation failures are common. For regular sockets, kernel | ||
801 | * allocated memory is used that only runs out in OOM situations | ||
802 | * that should be rare. | ||
803 | */ | ||
804 | .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, | ||
805 | .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, | ||
806 | .frame_size = opt_xsk_frame_size, | ||
807 | .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, | ||
808 | .flags = opt_umem_flags | ||
809 | }; | ||
810 | int ret; | ||
811 | |||
812 | umem = calloc(1, sizeof(*umem)); | ||
813 | if (!umem) | ||
814 | exit_with_error(errno); | ||
815 | |||
816 | ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, | ||
817 | &cfg); | ||
818 | if (ret) | ||
819 | exit_with_error(-ret); | ||
820 | |||
821 | umem->buffer = buffer; | ||
822 | return umem; | ||
823 | } | ||
824 | |||
825 | static void xsk_populate_fill_ring(struct xsk_umem_info *umem) | ||
826 | { | ||
827 | int ret, i; | ||
828 | u32 idx; | ||
829 | |||
830 | ret = xsk_ring_prod__reserve(&umem->fq, | ||
831 | XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, &idx); | ||
832 | if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS * 2) | ||
833 | exit_with_error(-ret); | ||
834 | for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS * 2; i++) | ||
835 | *xsk_ring_prod__fill_addr(&umem->fq, idx++) = | ||
836 | i * opt_xsk_frame_size; | ||
837 | xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS * 2); | ||
838 | } | ||
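For reference, with the libbpf defaults assumed here (XSK_RING_PROD__DEFAULT_NUM_DESCS is 2048 and XSK_UMEM__DEFAULT_FRAME_SIZE is 4096 at the time of writing; treat the exact numbers as an assumption), the doubled fill ring holds 4096 descriptors, which matches NUM_FRAMES, so the loop above hands every UMEM frame to the kernel up front. A standalone arithmetic sketch:

#include <stdio.h>

int main(void)
{
	const unsigned int num_descs = 2048;		/* XSK_RING_PROD__DEFAULT_NUM_DESCS (assumed) */
	const unsigned int frame_size = 4096;		/* XSK_UMEM__DEFAULT_FRAME_SIZE (assumed) */
	const unsigned int num_frames = 4 * 1024;	/* NUM_FRAMES in this sample */

	printf("fill ring entries: %u\n", num_descs * 2);		/* 4096 */
	printf("umem size: %u MiB\n", (num_frames * frame_size) >> 20);	/* 16 */
	return 0;
}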
839 | |||
840 | static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem, | ||
841 | bool rx, bool tx) | ||
842 | { | ||
843 | struct xsk_socket_config cfg; | ||
844 | struct xsk_socket_info *xsk; | ||
845 | struct xsk_ring_cons *rxr; | ||
846 | struct xsk_ring_prod *txr; | ||
847 | int ret; | ||
848 | |||
849 | xsk = calloc(1, sizeof(*xsk)); | ||
850 | if (!xsk) | ||
851 | exit_with_error(errno); | ||
852 | |||
853 | xsk->umem = umem; | ||
854 | cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; | ||
855 | cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; | ||
856 | if (opt_num_xsks > 1) | ||
857 | cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD; | ||
858 | else | ||
859 | cfg.libbpf_flags = 0; | ||
860 | cfg.xdp_flags = opt_xdp_flags; | ||
861 | cfg.bind_flags = opt_xdp_bind_flags; | ||
862 | |||
863 | rxr = rx ? &xsk->rx : NULL; | ||
864 | txr = tx ? &xsk->tx : NULL; | ||
865 | ret = xsk_socket__create(&xsk->xsk, opt_if, opt_queue, umem->umem, | ||
866 | rxr, txr, &cfg); | ||
867 | if (ret) | ||
868 | exit_with_error(-ret); | ||
869 | |||
870 | ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags); | ||
871 | if (ret) | ||
872 | exit_with_error(-ret); | ||
873 | |||
874 | xsk->app_stats.rx_empty_polls = 0; | ||
875 | xsk->app_stats.fill_fail_polls = 0; | ||
876 | xsk->app_stats.copy_tx_sendtos = 0; | ||
877 | xsk->app_stats.tx_wakeup_sendtos = 0; | ||
878 | xsk->app_stats.opt_polls = 0; | ||
879 | xsk->app_stats.prev_rx_empty_polls = 0; | ||
880 | xsk->app_stats.prev_fill_fail_polls = 0; | ||
881 | xsk->app_stats.prev_copy_tx_sendtos = 0; | ||
882 | xsk->app_stats.prev_tx_wakeup_sendtos = 0; | ||
883 | xsk->app_stats.prev_opt_polls = 0; | ||
884 | |||
885 | return xsk; | ||
886 | } | ||
887 | |||
888 | static struct option long_options[] = { | ||
889 | {"rxdrop", no_argument, 0, 'r'}, | ||
890 | {"txonly", no_argument, 0, 't'}, | ||
891 | {"l2fwd", no_argument, 0, 'l'}, | ||
892 | {"interface", required_argument, 0, 'i'}, | ||
893 | {"queue", required_argument, 0, 'q'}, | ||
894 | {"poll", no_argument, 0, 'p'}, | ||
895 | {"xdp-skb", no_argument, 0, 'S'}, | ||
896 | {"xdp-native", no_argument, 0, 'N'}, | ||
897 | {"interval", required_argument, 0, 'n'}, | ||
898 | {"zero-copy", no_argument, 0, 'z'}, | ||
899 | {"copy", no_argument, 0, 'c'}, | ||
900 | {"frame-size", required_argument, 0, 'f'}, | ||
901 | {"no-need-wakeup", no_argument, 0, 'm'}, | ||
902 | {"unaligned", no_argument, 0, 'u'}, | ||
903 | {"shared-umem", no_argument, 0, 'M'}, | ||
904 | {"force", no_argument, 0, 'F'}, | ||
905 | {"duration", required_argument, 0, 'd'}, | ||
906 | {"batch-size", required_argument, 0, 'b'}, | ||
907 | {"tx-pkt-count", required_argument, 0, 'C'}, | ||
908 | {"tx-pkt-size", required_argument, 0, 's'}, | ||
909 | {"tx-pkt-pattern", required_argument, 0, 'P'}, | ||
910 | {"extra-stats", no_argument, 0, 'x'}, | ||
911 | {"quiet", no_argument, 0, 'Q'}, | ||
912 | {"app-stats", no_argument, 0, 'a'}, | ||
913 | {"irq-string", no_argument, 0, 'I'}, | ||
914 | {0, 0, 0, 0} | ||
915 | }; | ||
916 | |||
917 | static void usage(const char *prog) | ||
918 | { | ||
919 | const char *str = | ||
920 | " Usage: %s [OPTIONS]\n" | ||
921 | " Options:\n" | ||
922 | " -r, --rxdrop Discard all incoming packets (default)\n" | ||
923 | " -t, --txonly Only send packets\n" | ||
924 | " -l, --l2fwd MAC swap L2 forwarding\n" | ||
925 | " -i, --interface=n Run on interface n\n" | ||
926 | " -q, --queue=n Use queue n (default 0)\n" | ||
927 | " -p, --poll Use poll syscall\n" | ||
928 | " -S, --xdp-skb=n Use XDP skb-mod\n" | ||
929 | " -N, --xdp-native=n Enforce XDP native mode\n" | ||
930 | " -n, --interval=n Specify statistics update interval (default 1 sec).\n" | ||
931 | " -z, --zero-copy Force zero-copy mode.\n" | ||
932 | " -c, --copy Force copy mode.\n" | ||
933 | " -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n" | ||
934 | " -f, --frame-size=n Set the frame size (must be a power of two in aligned mode, default is %d).\n" | ||
935 | " -u, --unaligned Enable unaligned chunk placement\n" | ||
936 | " -M, --shared-umem Enable XDP_SHARED_UMEM\n" | ||
937 | " -F, --force Force loading the XDP prog\n" | ||
938 | " -d, --duration=n Duration in secs to run command.\n" | ||
939 | " Default: forever.\n" | ||
940 | " -b, --batch-size=n Batch size for sending or receiving\n" | ||
941 | " packets. Default: %d\n" | ||
942 | " -C, --tx-pkt-count=n Number of packets to send.\n" | ||
943 | " Default: Continuous packets.\n" | ||
944 | " -s, --tx-pkt-size=n Transmit packet size.\n" | ||
945 | " (Default: %d bytes)\n" | ||
946 | " Min size: %d, Max size %d.\n" | ||
947 | " -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n" | ||
948 | " -x, --extra-stats Display extra statistics.\n" | ||
949 | " -Q, --quiet Do not display any stats.\n" | ||
950 | " -a, --app-stats Display application (syscall) statistics.\n" | ||
951 | " -I, --irq-string Display driver interrupt statistics for interface associated with irq-string.\n" | ||
952 | "\n"; | ||
953 | fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, | ||
954 | opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, | ||
955 | XSK_UMEM__DEFAULT_FRAME_SIZE, opt_pkt_fill_pattern); | ||
956 | |||
957 | exit(EXIT_FAILURE); | ||
958 | } | ||
959 | |||
960 | static void parse_command_line(int argc, char **argv) | ||
961 | { | ||
962 | int option_index, c; | ||
963 | |||
964 | opterr = 0; | ||
965 | |||
966 | for (;;) { | ||
967 | c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:", | ||
968 | long_options, &option_index); | ||
969 | if (c == -1) | ||
970 | break; | ||
971 | |||
972 | switch (c) { | ||
973 | case 'r': | ||
974 | opt_bench = BENCH_RXDROP; | ||
975 | break; | ||
976 | case 't': | ||
977 | opt_bench = BENCH_TXONLY; | ||
978 | break; | ||
979 | case 'l': | ||
980 | opt_bench = BENCH_L2FWD; | ||
981 | break; | ||
982 | case 'i': | ||
983 | opt_if = optarg; | ||
984 | break; | ||
985 | case 'q': | ||
986 | opt_queue = atoi(optarg); | ||
987 | break; | ||
988 | case 'p': | ||
989 | opt_poll = 1; | ||
990 | break; | ||
991 | case 'S': | ||
992 | opt_xdp_flags |= XDP_FLAGS_SKB_MODE; | ||
993 | opt_xdp_bind_flags |= XDP_COPY; | ||
994 | break; | ||
995 | case 'N': | ||
996 | /* default, set below */ | ||
997 | break; | ||
998 | case 'n': | ||
999 | opt_interval = atoi(optarg); | ||
1000 | break; | ||
1001 | case 'z': | ||
1002 | opt_xdp_bind_flags |= XDP_ZEROCOPY; | ||
1003 | break; | ||
1004 | case 'c': | ||
1005 | opt_xdp_bind_flags |= XDP_COPY; | ||
1006 | break; | ||
1007 | case 'u': | ||
1008 | opt_umem_flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG; | ||
1009 | opt_unaligned_chunks = 1; | ||
1010 | opt_mmap_flags = MAP_HUGETLB; | ||
1011 | break; | ||
1012 | case 'F': | ||
1013 | opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; | ||
1014 | break; | ||
1015 | case 'f': | ||
1016 | opt_xsk_frame_size = atoi(optarg); | ||
1017 | break; | ||
1018 | case 'm': | ||
1019 | opt_need_wakeup = false; | ||
1020 | opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP; | ||
1021 | break; | ||
1022 | case 'M': | ||
1023 | opt_num_xsks = MAX_SOCKS; | ||
1024 | break; | ||
1025 | case 'd': | ||
1026 | opt_duration = atoi(optarg); | ||
1027 | opt_duration *= 1000000000; | ||
1028 | break; | ||
1029 | case 'b': | ||
1030 | opt_batch_size = atoi(optarg); | ||
1031 | break; | ||
1032 | case 'C': | ||
1033 | opt_pkt_count = atoi(optarg); | ||
1034 | break; | ||
1035 | case 's': | ||
1036 | opt_pkt_size = atoi(optarg); | ||
1037 | if (opt_pkt_size > (XSK_UMEM__DEFAULT_FRAME_SIZE) || | ||
1038 | opt_pkt_size < MIN_PKT_SIZE) { | ||
1039 | fprintf(stderr, | ||
1040 | "ERROR: Invalid frame size %d\n", | ||
1041 | opt_pkt_size); | ||
1042 | usage(basename(argv[0])); | ||
1043 | } | ||
1044 | break; | ||
1045 | case 'P': | ||
1046 | opt_pkt_fill_pattern = strtol(optarg, NULL, 16); | ||
1047 | break; | ||
1048 | case 'x': | ||
1049 | opt_extra_stats = 1; | ||
1050 | break; | ||
1051 | case 'Q': | ||
1052 | opt_quiet = 1; | ||
1053 | break; | ||
1054 | case 'a': | ||
1055 | opt_app_stats = 1; | ||
1056 | break; | ||
1057 | case 'I': | ||
1058 | opt_irq_str = optarg; | ||
1059 | if (get_interrupt_number()) | ||
1060 | irqs_at_init = get_irqs(); | ||
1061 | if (irqs_at_init < 0) { | ||
1062 | fprintf(stderr, "ERROR: Failed to get irqs for %s\n", opt_irq_str); | ||
1063 | usage(basename(argv[0])); | ||
1064 | } | ||
1065 | |||
1066 | break; | ||
1067 | default: | ||
1068 | usage(basename(argv[0])); | ||
1069 | } | ||
1070 | } | ||
1071 | |||
1072 | if (!(opt_xdp_flags & XDP_FLAGS_SKB_MODE)) | ||
1073 | opt_xdp_flags |= XDP_FLAGS_DRV_MODE; | ||
1074 | |||
1075 | opt_ifindex = if_nametoindex(opt_if); | ||
1076 | if (!opt_ifindex) { | ||
1077 | fprintf(stderr, "ERROR: interface \"%s\" does not exist\n", | ||
1078 | opt_if); | ||
1079 | usage(basename(argv[0])); | ||
1080 | } | ||
1081 | |||
1082 | if ((opt_xsk_frame_size & (opt_xsk_frame_size - 1)) && | ||
1083 | !opt_unaligned_chunks) { | ||
1084 | fprintf(stderr, "--frame-size=%d is not a power of two\n", | ||
1085 | opt_xsk_frame_size); | ||
1086 | usage(basename(argv[0])); | ||
1087 | } | ||
1088 | } | ||
1089 | |||
1090 | static void kick_tx(struct xsk_socket_info *xsk) | ||
1091 | { | ||
1092 | int ret; | ||
1093 | |||
1094 | ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); | ||
1095 | if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || | ||
1096 | errno == EBUSY || errno == ENETDOWN) | ||
1097 | return; | ||
1098 | exit_with_error(errno); | ||
1099 | } | ||
1100 | |||
1101 | static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, | ||
1102 | struct pollfd *fds) | ||
1103 | { | ||
1104 | struct xsk_umem_info *umem = xsk->umem; | ||
1105 | u32 idx_cq = 0, idx_fq = 0; | ||
1106 | unsigned int rcvd; | ||
1107 | size_t ndescs; | ||
1108 | |||
1109 | if (!xsk->outstanding_tx) | ||
1110 | return; | ||
1111 | |||
1112 | /* In copy mode, Tx is driven by a syscall so we need to use e.g. sendto() to | ||
1113 | * really send the packets. In zero-copy mode we do not have to do this, since Tx | ||
1114 | * is driven by the NAPI loop. So as an optimization, we do not have to call | ||
1115 | * sendto() all the time in zero-copy mode for l2fwd. | ||
1116 | */ | ||
1117 | if (opt_xdp_bind_flags & XDP_COPY) { | ||
1118 | xsk->app_stats.copy_tx_sendtos++; | ||
1119 | kick_tx(xsk); | ||
1120 | } | ||
1121 | |||
1122 | ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size : | ||
1123 | xsk->outstanding_tx; | ||
1124 | |||
1125 | /* re-add completed Tx buffers */ | ||
1126 | rcvd = xsk_ring_cons__peek(&umem->cq, ndescs, &idx_cq); | ||
1127 | if (rcvd > 0) { | ||
1128 | unsigned int i; | ||
1129 | int ret; | ||
1130 | |||
1131 | ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); | ||
1132 | while (ret != rcvd) { | ||
1133 | if (ret < 0) | ||
1134 | exit_with_error(-ret); | ||
1135 | if (xsk_ring_prod__needs_wakeup(&umem->fq)) { | ||
1136 | xsk->app_stats.fill_fail_polls++; | ||
1137 | ret = poll(fds, num_socks, opt_timeout); | ||
1138 | } | ||
1139 | ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); | ||
1140 | } | ||
1141 | |||
1142 | for (i = 0; i < rcvd; i++) | ||
1143 | *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = | ||
1144 | *xsk_ring_cons__comp_addr(&umem->cq, idx_cq++); | ||
1145 | |||
1146 | xsk_ring_prod__submit(&xsk->umem->fq, rcvd); | ||
1147 | xsk_ring_cons__release(&xsk->umem->cq, rcvd); | ||
1148 | xsk->outstanding_tx -= rcvd; | ||
1149 | xsk->ring_stats.tx_npkts += rcvd; | ||
1150 | } | ||
1151 | } | ||
1152 | |||
1153 | static inline void complete_tx_only(struct xsk_socket_info *xsk, | ||
1154 | int batch_size) | ||
1155 | { | ||
1156 | unsigned int rcvd; | ||
1157 | u32 idx; | ||
1158 | |||
1159 | if (!xsk->outstanding_tx) | ||
1160 | return; | ||
1161 | |||
1162 | if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) { | ||
1163 | xsk->app_stats.tx_wakeup_sendtos++; | ||
1164 | kick_tx(xsk); | ||
1165 | } | ||
1166 | |||
1167 | rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); | ||
1168 | if (rcvd > 0) { | ||
1169 | xsk_ring_cons__release(&xsk->umem->cq, rcvd); | ||
1170 | xsk->outstanding_tx -= rcvd; | ||
1171 | xsk->ring_stats.tx_npkts += rcvd; | ||
1172 | } | ||
1173 | } | ||
1174 | |||
1175 | static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) | ||
1176 | { | ||
1177 | unsigned int rcvd, i; | ||
1178 | u32 idx_rx = 0, idx_fq = 0; | ||
1179 | int ret; | ||
1180 | |||
1181 | rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); | ||
1182 | if (!rcvd) { | ||
1183 | if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { | ||
1184 | xsk->app_stats.rx_empty_polls++; | ||
1185 | ret = poll(fds, num_socks, opt_timeout); | ||
1186 | } | ||
1187 | return; | ||
1188 | } | ||
1189 | |||
1190 | ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); | ||
1191 | while (ret != rcvd) { | ||
1192 | if (ret < 0) | ||
1193 | exit_with_error(-ret); | ||
1194 | if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { | ||
1195 | xsk->app_stats.fill_fail_polls++; | ||
1196 | ret = poll(fds, num_socks, opt_timeout); | ||
1197 | } | ||
1198 | ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); | ||
1199 | } | ||
1200 | |||
1201 | for (i = 0; i < rcvd; i++) { | ||
1202 | u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; | ||
1203 | u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; | ||
1204 | u64 orig = xsk_umem__extract_addr(addr); | ||
1205 | |||
1206 | addr = xsk_umem__add_offset_to_addr(addr); | ||
1207 | char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); | ||
1208 | |||
1209 | hex_dump(pkt, len, addr); | ||
1210 | *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig; | ||
1211 | } | ||
1212 | |||
1213 | xsk_ring_prod__submit(&xsk->umem->fq, rcvd); | ||
1214 | xsk_ring_cons__release(&xsk->rx, rcvd); | ||
1215 | xsk->ring_stats.rx_npkts += rcvd; | ||
1216 | } | ||
1217 | |||
1218 | static void rx_drop_all(void) | ||
1219 | { | ||
1220 | struct pollfd fds[MAX_SOCKS] = {}; | ||
1221 | int i, ret; | ||
1222 | |||
1223 | for (i = 0; i < num_socks; i++) { | ||
1224 | fds[i].fd = xsk_socket__fd(xsks[i]->xsk); | ||
1225 | fds[i].events = POLLIN; | ||
1226 | } | ||
1227 | |||
1228 | for (;;) { | ||
1229 | if (opt_poll) { | ||
1230 | for (i = 0; i < num_socks; i++) | ||
1231 | xsks[i]->app_stats.opt_polls++; | ||
1232 | ret = poll(fds, num_socks, opt_timeout); | ||
1233 | if (ret <= 0) | ||
1234 | continue; | ||
1235 | } | ||
1236 | |||
1237 | for (i = 0; i < num_socks; i++) | ||
1238 | rx_drop(xsks[i], fds); | ||
1239 | |||
1240 | if (benchmark_done) | ||
1241 | break; | ||
1242 | } | ||
1243 | } | ||
1244 | |||
1245 | static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) | ||
1246 | { | ||
1247 | u32 idx; | ||
1248 | unsigned int i; | ||
1249 | |||
1250 | while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) < | ||
1251 | batch_size) { | ||
1252 | complete_tx_only(xsk, batch_size); | ||
1253 | if (benchmark_done) | ||
1254 | return; | ||
1255 | } | ||
1256 | |||
1257 | for (i = 0; i < batch_size; i++) { | ||
1258 | struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, | ||
1259 | idx + i); | ||
1260 | tx_desc->addr = (*frame_nb + i) * opt_xsk_frame_size; | ||
1261 | tx_desc->len = PKT_SIZE; | ||
1262 | } | ||
1263 | |||
1264 | xsk_ring_prod__submit(&xsk->tx, batch_size); | ||
1265 | xsk->outstanding_tx += batch_size; | ||
1266 | *frame_nb += batch_size; | ||
1267 | *frame_nb %= NUM_FRAMES; | ||
1268 | complete_tx_only(xsk, batch_size); | ||
1269 | } | ||
1270 | |||
1271 | static inline int get_batch_size(int pkt_cnt) | ||
1272 | { | ||
1273 | if (!opt_pkt_count) | ||
1274 | return opt_batch_size; | ||
1275 | |||
1276 | if (pkt_cnt + opt_batch_size <= opt_pkt_count) | ||
1277 | return opt_batch_size; | ||
1278 | |||
1279 | return opt_pkt_count - pkt_cnt; | ||
1280 | } | ||
1281 | |||
1282 | static void complete_tx_only_all(void) | ||
1283 | { | ||
1284 | bool pending; | ||
1285 | int i; | ||
1286 | |||
1287 | do { | ||
1288 | pending = false; | ||
1289 | for (i = 0; i < num_socks; i++) { | ||
1290 | if (xsks[i]->outstanding_tx) { | ||
1291 | complete_tx_only(xsks[i], opt_batch_size); | ||
1292 | pending = !!xsks[i]->outstanding_tx; | ||
1293 | } | ||
1294 | } | ||
1295 | } while (pending); | ||
1296 | } | ||
1297 | |||
1298 | static void tx_only_all(void) | ||
1299 | { | ||
1300 | struct pollfd fds[MAX_SOCKS] = {}; | ||
1301 | u32 frame_nb[MAX_SOCKS] = {}; | ||
1302 | int pkt_cnt = 0; | ||
1303 | int i, ret; | ||
1304 | |||
1305 | for (i = 0; i < num_socks; i++) { | ||
1306 | fds[i].fd = xsk_socket__fd(xsks[i]->xsk); | ||
1307 | fds[i].events = POLLOUT; | ||
1308 | } | ||
1309 | |||
1310 | while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) { | ||
1311 | int batch_size = get_batch_size(pkt_cnt); | ||
1312 | |||
1313 | if (opt_poll) { | ||
1314 | for (i = 0; i < num_socks; i++) | ||
1315 | xsks[i]->app_stats.opt_polls++; | ||
1316 | ret = poll(fds, num_socks, opt_timeout); | ||
1317 | if (ret <= 0) | ||
1318 | continue; | ||
1319 | |||
1320 | if (!(fds[0].revents & POLLOUT)) | ||
1321 | continue; | ||
1322 | } | ||
1323 | |||
1324 | for (i = 0; i < num_socks; i++) | ||
1325 | tx_only(xsks[i], &frame_nb[i], batch_size); | ||
1326 | |||
1327 | pkt_cnt += batch_size; | ||
1328 | |||
1329 | if (benchmark_done) | ||
1330 | break; | ||
1331 | } | ||
1332 | |||
1333 | if (opt_pkt_count) | ||
1334 | complete_tx_only_all(); | ||
1335 | } | ||
1336 | |||
1337 | static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) | ||
1338 | { | ||
1339 | unsigned int rcvd, i; | ||
1340 | u32 idx_rx = 0, idx_tx = 0; | ||
1341 | int ret; | ||
1342 | |||
1343 | complete_tx_l2fwd(xsk, fds); | ||
1344 | |||
1345 | rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx); | ||
1346 | if (!rcvd) { | ||
1347 | if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) { | ||
1348 | xsk->app_stats.rx_empty_polls++; | ||
1349 | ret = poll(fds, num_socks, opt_timeout); | ||
1350 | } | ||
1351 | return; | ||
1352 | } | ||
1353 | |||
1354 | ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); | ||
1355 | while (ret != rcvd) { | ||
1356 | if (ret < 0) | ||
1357 | exit_with_error(-ret); | ||
1358 | complete_tx_l2fwd(xsk, fds); | ||
1359 | if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { | ||
1360 | xsk->app_stats.tx_wakeup_sendtos++; | ||
1361 | kick_tx(xsk); | ||
1362 | } | ||
1363 | ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); | ||
1364 | } | ||
1365 | |||
1366 | for (i = 0; i < rcvd; i++) { | ||
1367 | u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; | ||
1368 | u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; | ||
1369 | u64 orig = addr; | ||
1370 | |||
1371 | addr = xsk_umem__add_offset_to_addr(addr); | ||
1372 | char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); | ||
1373 | |||
1374 | swap_mac_addresses(pkt); | ||
1375 | |||
1376 | hex_dump(pkt, len, addr); | ||
1377 | xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = orig; | ||
1378 | xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len; | ||
1379 | } | ||
1380 | |||
1381 | xsk_ring_prod__submit(&xsk->tx, rcvd); | ||
1382 | xsk_ring_cons__release(&xsk->rx, rcvd); | ||
1383 | |||
1384 | xsk->ring_stats.rx_npkts += rcvd; | ||
1385 | xsk->outstanding_tx += rcvd; | ||
1386 | } | ||
1387 | |||
1388 | static void l2fwd_all(void) | ||
1389 | { | ||
1390 | struct pollfd fds[MAX_SOCKS] = {}; | ||
1391 | int i, ret; | ||
1392 | |||
1393 | for (i = 0; i < num_socks; i++) { | ||
1394 | fds[i].fd = xsk_socket__fd(xsks[i]->xsk); | ||
1395 | fds[i].events = POLLOUT | POLLIN; | ||
1396 | } | ||
1397 | |||
1398 | for (;;) { | ||
1399 | if (opt_poll) { | ||
1400 | for (i = 0; i < num_socks; i++) | ||
1401 | xsks[i]->app_stats.opt_polls++; | ||
1402 | ret = poll(fds, num_socks, opt_timeout); | ||
1403 | if (ret <= 0) | ||
1404 | continue; | ||
1405 | } | ||
1406 | |||
1407 | for (i = 0; i < num_socks; i++) | ||
1408 | l2fwd(xsks[i], fds); | ||
1409 | |||
1410 | if (benchmark_done) | ||
1411 | break; | ||
1412 | } | ||
1413 | } | ||
1414 | |||
1415 | static void load_xdp_program(char **argv, struct bpf_object **obj) | ||
1416 | { | ||
1417 | struct bpf_prog_load_attr prog_load_attr = { | ||
1418 | .prog_type = BPF_PROG_TYPE_XDP, | ||
1419 | }; | ||
1420 | char xdp_filename[256]; | ||
1421 | int prog_fd; | ||
1422 | |||
1423 | snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]); | ||
1424 | prog_load_attr.file = xdp_filename; | ||
1425 | |||
1426 | if (bpf_prog_load_xattr(&prog_load_attr, obj, &prog_fd)) | ||
1427 | exit(EXIT_FAILURE); | ||
1428 | if (prog_fd < 0) { | ||
1429 | fprintf(stderr, "ERROR: no program found: %s\n", | ||
1430 | strerror(-prog_fd)); | ||
1431 | exit(EXIT_FAILURE); | ||
1432 | } | ||
1433 | |||
1434 | if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) { | ||
1435 | fprintf(stderr, "ERROR: link set xdp fd failed\n"); | ||
1436 | exit(EXIT_FAILURE); | ||
1437 | } | ||
1438 | } | ||
1439 | |||
1440 | static void enter_xsks_into_map(struct bpf_object *obj) | ||
1441 | { | ||
1442 | struct bpf_map *map; | ||
1443 | int i, xsks_map; | ||
1444 | |||
1445 | map = bpf_object__find_map_by_name(obj, "xsks_map"); | ||
1446 | xsks_map = bpf_map__fd(map); | ||
1447 | if (xsks_map < 0) { | ||
1448 | fprintf(stderr, "ERROR: no xsks map found: %s\n", | ||
1449 | strerror(-xsks_map)); | ||
1450 | exit(EXIT_FAILURE); | ||
1451 | } | ||
1452 | |||
1453 | for (i = 0; i < num_socks; i++) { | ||
1454 | int fd = xsk_socket__fd(xsks[i]->xsk); | ||
1455 | int key, ret; | ||
1456 | |||
1457 | key = i; | ||
1458 | ret = bpf_map_update_elem(xsks_map, &key, &fd, 0); | ||
1459 | if (ret) { | ||
1460 | fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i); | ||
1461 | exit(EXIT_FAILURE); | ||
1462 | } | ||
1463 | } | ||
1464 | } | ||
1465 | |||
1466 | int main(int argc, char **argv) | ||
1467 | { | ||
1468 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
1469 | bool rx = false, tx = false; | ||
1470 | struct xsk_umem_info *umem; | ||
1471 | struct bpf_object *obj; | ||
1472 | pthread_t pt; | ||
1473 | int i, ret; | ||
1474 | void *bufs; | ||
1475 | |||
1476 | parse_command_line(argc, argv); | ||
1477 | |||
1478 | if (setrlimit(RLIMIT_MEMLOCK, &r)) { | ||
1479 | fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n", | ||
1480 | strerror(errno)); | ||
1481 | exit(EXIT_FAILURE); | ||
1482 | } | ||
1483 | |||
1484 | if (opt_num_xsks > 1) | ||
1485 | load_xdp_program(argv, &obj); | ||
1486 | |||
1487 | /* Reserve memory for the umem. Use hugepages if unaligned chunk mode */ | ||
1488 | bufs = mmap(NULL, NUM_FRAMES * opt_xsk_frame_size, | ||
1489 | PROT_READ | PROT_WRITE, | ||
1490 | MAP_PRIVATE | MAP_ANONYMOUS | opt_mmap_flags, -1, 0); | ||
1491 | if (bufs == MAP_FAILED) { | ||
1492 | printf("ERROR: mmap failed\n"); | ||
1493 | exit(EXIT_FAILURE); | ||
1494 | } | ||
1495 | |||
1496 | /* Create sockets... */ | ||
1497 | umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size); | ||
1498 | if (opt_bench == BENCH_RXDROP || opt_bench == BENCH_L2FWD) { | ||
1499 | rx = true; | ||
1500 | xsk_populate_fill_ring(umem); | ||
1501 | } | ||
1502 | if (opt_bench == BENCH_L2FWD || opt_bench == BENCH_TXONLY) | ||
1503 | tx = true; | ||
1504 | for (i = 0; i < opt_num_xsks; i++) | ||
1505 | xsks[num_socks++] = xsk_configure_socket(umem, rx, tx); | ||
1506 | |||
1507 | if (opt_bench == BENCH_TXONLY) { | ||
1508 | gen_eth_hdr_data(); | ||
1509 | |||
1510 | for (i = 0; i < NUM_FRAMES; i++) | ||
1511 | gen_eth_frame(umem, i * opt_xsk_frame_size); | ||
1512 | } | ||
1513 | |||
1514 | if (opt_num_xsks > 1 && opt_bench != BENCH_TXONLY) | ||
1515 | enter_xsks_into_map(obj); | ||
1516 | |||
1517 | signal(SIGINT, int_exit); | ||
1518 | signal(SIGTERM, int_exit); | ||
1519 | signal(SIGABRT, int_exit); | ||
1520 | |||
1521 | setlocale(LC_ALL, ""); | ||
1522 | |||
1523 | prev_time = get_nsecs(); | ||
1524 | start_time = prev_time; | ||
1525 | |||
1526 | if (!opt_quiet) { | ||
1527 | ret = pthread_create(&pt, NULL, poller, NULL); | ||
1528 | if (ret) | ||
1529 | exit_with_error(ret); | ||
1530 | } | ||
1531 | |||
1532 | |||
1533 | if (opt_bench == BENCH_RXDROP) | ||
1534 | rx_drop_all(); | ||
1535 | else if (opt_bench == BENCH_TXONLY) | ||
1536 | tx_only_all(); | ||
1537 | else | ||
1538 | l2fwd_all(); | ||
1539 | |||
1540 | benchmark_done = true; | ||
1541 | |||
1542 | if (!opt_quiet) | ||
1543 | pthread_join(pt, NULL); | ||
1544 | |||
1545 | xdpsock_cleanup(); | ||
1546 | |||
1547 | munmap(bufs, NUM_FRAMES * opt_xsk_frame_size); | ||
1548 | |||
1549 | return 0; | ||
1550 | } | ||
diff --git a/samples/bpf/xsk_fwd.c b/samples/bpf/xsk_fwd.c new file mode 100644 index 000000000..1cd97c84c --- /dev/null +++ b/samples/bpf/xsk_fwd.c | |||
@@ -0,0 +1,1085 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* Copyright(c) 2020 Intel Corporation. */ | ||
3 | |||
4 | #define _GNU_SOURCE | ||
5 | #include <poll.h> | ||
6 | #include <pthread.h> | ||
7 | #include <signal.h> | ||
8 | #include <sched.h> | ||
9 | #include <stdio.h> | ||
10 | #include <stdlib.h> | ||
11 | #include <string.h> | ||
12 | #include <sys/mman.h> | ||
13 | #include <sys/resource.h> | ||
14 | #include <sys/socket.h> | ||
15 | #include <sys/types.h> | ||
16 | #include <time.h> | ||
17 | #include <unistd.h> | ||
18 | #include <getopt.h> | ||
19 | #include <netinet/ether.h> | ||
20 | #include <net/if.h> | ||
21 | |||
22 | #include <linux/bpf.h> | ||
23 | #include <linux/if_link.h> | ||
24 | #include <linux/if_xdp.h> | ||
25 | |||
26 | #include <bpf/libbpf.h> | ||
27 | #include <bpf/xsk.h> | ||
28 | #include <bpf/bpf.h> | ||
29 | |||
30 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) | ||
31 | |||
32 | typedef __u64 u64; | ||
33 | typedef __u32 u32; | ||
34 | typedef __u16 u16; | ||
35 | typedef __u8 u8; | ||
36 | |||
37 | /* This program illustrates the packet forwarding between multiple AF_XDP | ||
38 | * sockets in multi-threaded environment. All threads are sharing a common | ||
39 | * buffer pool, with each socket having its own private buffer cache. | ||
40 | * | ||
41 | * Example 1: Single thread handling two sockets. The packets received by socket | ||
42 | * A (interface IFA, queue QA) are forwarded to socket B (interface IFB, queue | ||
43 | * QB), while the packets received by socket B are forwarded to socket A. The | ||
44 | * thread is running on CPU core X: | ||
45 | * | ||
46 | * ./xsk_fwd -i IFA -q QA -i IFB -q QB -c X | ||
47 | * | ||
48 | * Example 2: Two threads, each handling two sockets. The thread running on CPU | ||
49 | * core X forwards all the packets received by socket A to socket B, and all the | ||
50 | * packets received by socket B to socket A. The thread running on CPU core Y is | ||
51 | * performing the same packet forwarding between sockets C and D: | ||
52 | * | ||
53 | * ./xsk_fwd -i IFA -q QA -i IFB -q QB -i IFC -q QC -i IFD -q QD | ||
54 | * -c CX -c CY | ||
55 | */ | ||
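To connect Example 2 with the code: main() below splits the ports evenly across the threads and, within each thread, pairs every RX port with the next port owned by the same thread (ports_rx[j] = ports[i*n + j], ports_tx[j] = ports[i*n + (j + 1) % n], where n = n_ports / n_threads). For the four ports and two cores of Example 2, this works out to:

        thread on core CX:  (IFA, QA) -> (IFB, QB),  (IFB, QB) -> (IFA, QA)
        thread on core CY:  (IFC, QC) -> (IFD, QD),  (IFD, QD) -> (IFC, QC)

which is exactly the forwarding pattern described above.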
56 | |||
57 | /* | ||
58 | * Buffer pool and buffer cache | ||
59 | * | ||
60 | * For packet forwarding, the packet buffers are typically allocated from the | ||
61 | * pool for packet reception and freed back to the pool for further reuse once | ||
62 | * the packet transmission is completed. | ||
63 | * | ||
64 | * The buffer pool is shared between multiple threads. In order to minimize the | ||
65 | * access latency to the shared buffer pool, each thread creates one (or | ||
66 | * several) buffer caches, which, unlike the buffer pool, are private to the | ||
67 | * thread that creates them and therefore cannot be shared with other threads. | ||
68 | * The access to the shared pool is only needed either (A) when the cache gets | ||
69 | * empty due to repeated buffer allocations and it needs to be replenished from | ||
70 | * the pool, or (B) when the cache gets full due to repeated buffer frees and | ||
71 | * it needs to be flushed back to the pool. | ||
72 | * | ||
73 | * In a packet forwarding system, a packet received on any input port can | ||
74 | * potentially be transmitted on any output port, depending on the forwarding | ||
75 | * configuration. For AF_XDP sockets, for this to work with zero-copy of the | ||
76 | * packet buffers, it is required that the buffer pool memory fits into the | ||
77 | * UMEM area shared by all the sockets. | ||
78 | */ | ||
79 | |||
80 | struct bpool_params { | ||
81 | u32 n_buffers; | ||
82 | u32 buffer_size; | ||
83 | int mmap_flags; | ||
84 | |||
85 | u32 n_users_max; | ||
86 | u32 n_buffers_per_slab; | ||
87 | }; | ||
88 | |||
89 | /* This buffer pool implementation organizes the buffers into equally sized | ||
90 | * slabs of *n_buffers_per_slab*. Initially, there are *n_slabs* slabs in the | ||
91 | * pool that are completely filled with buffer pointers (full slabs). | ||
92 | * | ||
93 | * Each buffer cache has a slab for buffer allocation and a slab for buffer | ||
94 | * free, with both of these slabs initially empty. When the cache's allocation | ||
95 | * slab goes empty, it is swapped with one of the available full slabs from the | ||
96 | * pool, if any is available. When the cache's free slab goes full, it is | ||
97 | * swapped for one of the empty slabs from the pool, which is guaranteed to | ||
98 | * succeed. | ||
99 | * | ||
100 | * Partially filled slabs never get traded between the cache and the pool | ||
101 | * (except when the cache itself is destroyed), which enables fast operation | ||
102 | * through pointer swapping. | ||
103 | */ | ||
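For a concrete sense of the dimensioning done by bpool_init() below, take the default parameters defined near the bottom of this file (bpool_params_default: n_buffers = 64 * 1024, n_users_max = 16, n_buffers_per_slab = 2 * XSK_RING_PROD__DEFAULT_NUM_DESCS) and assume the usual libbpf header values of 2048 default ring descriptors and a 4096-byte default frame size (those two constants come from libbpf, not from this patch):

        n_buffers_per_slab = 2 * 2048           = 4096 buffers
        n_slabs            = 65536 / 4096       = 16 full slabs
        n_slabs_reserved   = 2 * n_users_max    = 32 slabs (one allocation + one free slab per cache)
        UMEM size          = 65536 * 4096 bytes = 256 MiB

Since bcache_init() takes two reserved slabs per cache, at most n_users_max = 16 caches (and therefore 16 ports) can be created against one pool with these defaults.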
104 | struct bpool { | ||
105 | struct bpool_params params; | ||
106 | pthread_mutex_t lock; | ||
107 | void *addr; | ||
108 | |||
109 | u64 **slabs; | ||
110 | u64 **slabs_reserved; | ||
111 | u64 *buffers; | ||
112 | u64 *buffers_reserved; | ||
113 | |||
114 | u64 n_slabs; | ||
115 | u64 n_slabs_reserved; | ||
116 | u64 n_buffers; | ||
117 | |||
118 | u64 n_slabs_available; | ||
119 | u64 n_slabs_reserved_available; | ||
120 | |||
121 | struct xsk_umem_config umem_cfg; | ||
122 | struct xsk_ring_prod umem_fq; | ||
123 | struct xsk_ring_cons umem_cq; | ||
124 | struct xsk_umem *umem; | ||
125 | }; | ||
126 | |||
127 | static struct bpool * | ||
128 | bpool_init(struct bpool_params *params, | ||
129 | struct xsk_umem_config *umem_cfg) | ||
130 | { | ||
131 | struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; | ||
132 | u64 n_slabs, n_slabs_reserved, n_buffers, n_buffers_reserved; | ||
133 | u64 slabs_size, slabs_reserved_size; | ||
134 | u64 buffers_size, buffers_reserved_size; | ||
135 | u64 total_size, i; | ||
136 | struct bpool *bp; | ||
137 | u8 *p; | ||
138 | int status; | ||
139 | |||
140 | /* mmap prep. */ | ||
141 | if (setrlimit(RLIMIT_MEMLOCK, &r)) | ||
142 | return NULL; | ||
143 | |||
144 | /* bpool internals dimensioning. */ | ||
145 | n_slabs = (params->n_buffers + params->n_buffers_per_slab - 1) / | ||
146 | params->n_buffers_per_slab; | ||
147 | n_slabs_reserved = params->n_users_max * 2; | ||
148 | n_buffers = n_slabs * params->n_buffers_per_slab; | ||
149 | n_buffers_reserved = n_slabs_reserved * params->n_buffers_per_slab; | ||
150 | |||
151 | slabs_size = n_slabs * sizeof(u64 *); | ||
152 | slabs_reserved_size = n_slabs_reserved * sizeof(u64 *); | ||
153 | buffers_size = n_buffers * sizeof(u64); | ||
154 | buffers_reserved_size = n_buffers_reserved * sizeof(u64); | ||
155 | |||
156 | total_size = sizeof(struct bpool) + | ||
157 | slabs_size + slabs_reserved_size + | ||
158 | buffers_size + buffers_reserved_size; | ||
159 | |||
160 | /* bpool memory allocation. */ | ||
161 | p = calloc(total_size, sizeof(u8)); | ||
162 | if (!p) | ||
163 | return NULL; | ||
164 | |||
165 | /* bpool memory initialization. */ | ||
166 | bp = (struct bpool *)p; | ||
167 | memcpy(&bp->params, params, sizeof(*params)); | ||
168 | bp->params.n_buffers = n_buffers; | ||
169 | |||
170 | bp->slabs = (u64 **)&p[sizeof(struct bpool)]; | ||
171 | bp->slabs_reserved = (u64 **)&p[sizeof(struct bpool) + | ||
172 | slabs_size]; | ||
173 | bp->buffers = (u64 *)&p[sizeof(struct bpool) + | ||
174 | slabs_size + slabs_reserved_size]; | ||
175 | bp->buffers_reserved = (u64 *)&p[sizeof(struct bpool) + | ||
176 | slabs_size + slabs_reserved_size + buffers_size]; | ||
177 | |||
178 | bp->n_slabs = n_slabs; | ||
179 | bp->n_slabs_reserved = n_slabs_reserved; | ||
180 | bp->n_buffers = n_buffers; | ||
181 | |||
182 | for (i = 0; i < n_slabs; i++) | ||
183 | bp->slabs[i] = &bp->buffers[i * params->n_buffers_per_slab]; | ||
184 | bp->n_slabs_available = n_slabs; | ||
185 | |||
186 | for (i = 0; i < n_slabs_reserved; i++) | ||
187 | bp->slabs_reserved[i] = &bp->buffers_reserved[i * | ||
188 | params->n_buffers_per_slab]; | ||
189 | bp->n_slabs_reserved_available = n_slabs_reserved; | ||
190 | |||
191 | for (i = 0; i < n_buffers; i++) | ||
192 | bp->buffers[i] = i * params->buffer_size; | ||
193 | |||
194 | /* lock. */ | ||
195 | status = pthread_mutex_init(&bp->lock, NULL); | ||
196 | if (status) { | ||
197 | free(p); | ||
198 | return NULL; | ||
199 | } | ||
200 | |||
201 | /* mmap. */ | ||
202 | bp->addr = mmap(NULL, | ||
203 | n_buffers * params->buffer_size, | ||
204 | PROT_READ | PROT_WRITE, | ||
205 | MAP_PRIVATE | MAP_ANONYMOUS | params->mmap_flags, | ||
206 | -1, | ||
207 | 0); | ||
208 | if (bp->addr == MAP_FAILED) { | ||
209 | pthread_mutex_destroy(&bp->lock); | ||
210 | free(p); | ||
211 | return NULL; | ||
212 | } | ||
213 | |||
214 | /* umem. */ | ||
215 | status = xsk_umem__create(&bp->umem, | ||
216 | bp->addr, | ||
217 | bp->params.n_buffers * bp->params.buffer_size, | ||
218 | &bp->umem_fq, | ||
219 | &bp->umem_cq, | ||
220 | umem_cfg); | ||
221 | if (status) { | ||
222 | munmap(bp->addr, bp->params.n_buffers * bp->params.buffer_size); | ||
223 | pthread_mutex_destroy(&bp->lock); | ||
224 | free(p); | ||
225 | return NULL; | ||
226 | } | ||
227 | memcpy(&bp->umem_cfg, umem_cfg, sizeof(*umem_cfg)); | ||
228 | |||
229 | return bp; | ||
230 | } | ||
231 | |||
232 | static void | ||
233 | bpool_free(struct bpool *bp) | ||
234 | { | ||
235 | if (!bp) | ||
236 | return; | ||
237 | |||
238 | xsk_umem__delete(bp->umem); | ||
239 | munmap(bp->addr, bp->params.n_buffers * bp->params.buffer_size); | ||
240 | pthread_mutex_destroy(&bp->lock); | ||
241 | free(bp); | ||
242 | } | ||
243 | |||
244 | struct bcache { | ||
245 | struct bpool *bp; | ||
246 | |||
247 | u64 *slab_cons; | ||
248 | u64 *slab_prod; | ||
249 | |||
250 | u64 n_buffers_cons; | ||
251 | u64 n_buffers_prod; | ||
252 | }; | ||
253 | |||
254 | static u32 | ||
255 | bcache_slab_size(struct bcache *bc) | ||
256 | { | ||
257 | struct bpool *bp = bc->bp; | ||
258 | |||
259 | return bp->params.n_buffers_per_slab; | ||
260 | } | ||
261 | |||
262 | static struct bcache * | ||
263 | bcache_init(struct bpool *bp) | ||
264 | { | ||
265 | struct bcache *bc; | ||
266 | |||
267 | bc = calloc(1, sizeof(struct bcache)); | ||
268 | if (!bc) | ||
269 | return NULL; | ||
270 | |||
271 | bc->bp = bp; | ||
272 | bc->n_buffers_cons = 0; | ||
273 | bc->n_buffers_prod = 0; | ||
274 | |||
275 | pthread_mutex_lock(&bp->lock); | ||
276 | if (bp->n_slabs_reserved_available == 0) { | ||
277 | pthread_mutex_unlock(&bp->lock); | ||
278 | free(bc); | ||
279 | return NULL; | ||
280 | } | ||
281 | |||
282 | bc->slab_cons = bp->slabs_reserved[bp->n_slabs_reserved_available - 1]; | ||
283 | bc->slab_prod = bp->slabs_reserved[bp->n_slabs_reserved_available - 2]; | ||
284 | bp->n_slabs_reserved_available -= 2; | ||
285 | pthread_mutex_unlock(&bp->lock); | ||
286 | |||
287 | return bc; | ||
288 | } | ||
289 | |||
290 | static void | ||
291 | bcache_free(struct bcache *bc) | ||
292 | { | ||
293 | struct bpool *bp; | ||
294 | |||
295 | if (!bc) | ||
296 | return; | ||
297 | |||
298 | /* In order to keep this example simple, the case of freeing any | ||
299 | * existing buffers from the cache back to the pool is ignored. | ||
300 | */ | ||
301 | |||
302 | bp = bc->bp; | ||
303 | pthread_mutex_lock(&bp->lock); | ||
304 | bp->slabs_reserved[bp->n_slabs_reserved_available] = bc->slab_prod; | ||
305 | bp->slabs_reserved[bp->n_slabs_reserved_available + 1] = bc->slab_cons; | ||
306 | bp->n_slabs_reserved_available += 2; | ||
307 | pthread_mutex_unlock(&bp->lock); | ||
308 | |||
309 | free(bc); | ||
310 | } | ||
311 | |||
312 | /* To work correctly, the implementation requires that the *n_buffers* input | ||
313 | * argument is never greater than the buffer pool's *n_buffers_per_slab*. This | ||
314 | * is typically the case, with one exception taking place when a large number of | ||
315 | * buffers are allocated at init time (e.g. for the UMEM fill queue setup). | ||
316 | */ | ||
317 | static inline u32 | ||
318 | bcache_cons_check(struct bcache *bc, u32 n_buffers) | ||
319 | { | ||
320 | struct bpool *bp = bc->bp; | ||
321 | u64 n_buffers_per_slab = bp->params.n_buffers_per_slab; | ||
322 | u64 n_buffers_cons = bc->n_buffers_cons; | ||
323 | u64 n_slabs_available; | ||
324 | u64 *slab_full; | ||
325 | |||
326 | /* | ||
327 | * Consumer slab is not empty: Use what's available locally. Do not | ||
328 | * look for more buffers from the pool when the ask can only be | ||
329 | * partially satisfied. | ||
330 | */ | ||
331 | if (n_buffers_cons) | ||
332 | return (n_buffers_cons < n_buffers) ? | ||
333 | n_buffers_cons : | ||
334 | n_buffers; | ||
335 | |||
336 | /* | ||
337 | * Consumer slab is empty: look to trade the current consumer slab | ||
338 | * (empty) for a full slab from the pool, if any is available. | ||
339 | */ | ||
340 | pthread_mutex_lock(&bp->lock); | ||
341 | n_slabs_available = bp->n_slabs_available; | ||
342 | if (!n_slabs_available) { | ||
343 | pthread_mutex_unlock(&bp->lock); | ||
344 | return 0; | ||
345 | } | ||
346 | |||
347 | n_slabs_available--; | ||
348 | slab_full = bp->slabs[n_slabs_available]; | ||
349 | bp->slabs[n_slabs_available] = bc->slab_cons; | ||
350 | bp->n_slabs_available = n_slabs_available; | ||
351 | pthread_mutex_unlock(&bp->lock); | ||
352 | |||
353 | bc->slab_cons = slab_full; | ||
354 | bc->n_buffers_cons = n_buffers_per_slab; | ||
355 | return n_buffers; | ||
356 | } | ||
357 | |||
358 | static inline u64 | ||
359 | bcache_cons(struct bcache *bc) | ||
360 | { | ||
361 | u64 n_buffers_cons = bc->n_buffers_cons - 1; | ||
362 | u64 buffer; | ||
363 | |||
364 | buffer = bc->slab_cons[n_buffers_cons]; | ||
365 | bc->n_buffers_cons = n_buffers_cons; | ||
366 | return buffer; | ||
367 | } | ||
368 | |||
369 | static inline void | ||
370 | bcache_prod(struct bcache *bc, u64 buffer) | ||
371 | { | ||
372 | struct bpool *bp = bc->bp; | ||
373 | u64 n_buffers_per_slab = bp->params.n_buffers_per_slab; | ||
374 | u64 n_buffers_prod = bc->n_buffers_prod; | ||
375 | u64 n_slabs_available; | ||
376 | u64 *slab_empty; | ||
377 | |||
378 | /* | ||
379 | * Producer slab is not yet full: store the current buffer to it. | ||
380 | */ | ||
381 | if (n_buffers_prod < n_buffers_per_slab) { | ||
382 | bc->slab_prod[n_buffers_prod] = buffer; | ||
383 | bc->n_buffers_prod = n_buffers_prod + 1; | ||
384 | return; | ||
385 | } | ||
386 | |||
387 | /* | ||
388 | * Producer slab is full: trade the cache's current producer slab | ||
389 | * (full) for an empty slab from the pool, then store the current | ||
390 | * buffer to the new producer slab. As one full slab exists in the | ||
391 | * cache, it is guaranteed that there is at least one empty slab | ||
392 | * available in the pool. | ||
393 | */ | ||
394 | pthread_mutex_lock(&bp->lock); | ||
395 | n_slabs_available = bp->n_slabs_available; | ||
396 | slab_empty = bp->slabs[n_slabs_available]; | ||
397 | bp->slabs[n_slabs_available] = bc->slab_prod; | ||
398 | bp->n_slabs_available = n_slabs_available + 1; | ||
399 | pthread_mutex_unlock(&bp->lock); | ||
400 | |||
401 | slab_empty[0] = buffer; | ||
402 | bc->slab_prod = slab_empty; | ||
403 | bc->n_buffers_prod = 1; | ||
404 | } | ||
405 | |||
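Pulling the helpers above together, here is a minimal, illustrative usage sketch of the allocate/free fast paths described in the buffer pool / buffer cache comment near the top of this file. The function is not part of the sample and is not called anywhere in it; it only shows the intended calling pattern:

static inline void bcache_usage_sketch(struct bcache *bc, u64 completed_addr)
{
        u32 i, n;

        /* Allocation path (e.g. UMEM fill queue replenish): ask the cache
         * how many buffers it can provide, then pop them one by one. The
         * shared pool lock is only taken when the allocation slab is empty
         * and a full slab has to be pulled in from the pool.
         */
        n = bcache_cons_check(bc, 64);
        for (i = 0; i < n; i++) {
                u64 addr = bcache_cons(bc);

                (void)addr; /* ... post addr to the UMEM fill queue here ... */
        }

        /* Free path (e.g. UMEM completion queue drain): hand the buffer back
         * to the cache. The shared pool lock is only taken when the free slab
         * is full and has to be traded for an empty slab from the pool.
         */
        bcache_prod(bc, completed_addr);
}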
406 | /* | ||
407 | * Port | ||
408 | * | ||
409 | * Each of the forwarding ports sits on top of an AF_XDP socket. In order for | ||
410 | * packet forwarding to happen with no packet buffer copy, all the sockets need | ||
411 | * to share the same UMEM area, which is used as the buffer pool memory. | ||
412 | */ | ||
413 | #ifndef MAX_BURST_RX | ||
414 | #define MAX_BURST_RX 64 | ||
415 | #endif | ||
416 | |||
417 | #ifndef MAX_BURST_TX | ||
418 | #define MAX_BURST_TX 64 | ||
419 | #endif | ||
420 | |||
421 | struct burst_rx { | ||
422 | u64 addr[MAX_BURST_RX]; | ||
423 | u32 len[MAX_BURST_RX]; | ||
424 | }; | ||
425 | |||
426 | struct burst_tx { | ||
427 | u64 addr[MAX_BURST_TX]; | ||
428 | u32 len[MAX_BURST_TX]; | ||
429 | u32 n_pkts; | ||
430 | }; | ||
431 | |||
432 | struct port_params { | ||
433 | struct xsk_socket_config xsk_cfg; | ||
434 | struct bpool *bp; | ||
435 | const char *iface; | ||
436 | u32 iface_queue; | ||
437 | }; | ||
438 | |||
439 | struct port { | ||
440 | struct port_params params; | ||
441 | |||
442 | struct bcache *bc; | ||
443 | |||
444 | struct xsk_ring_cons rxq; | ||
445 | struct xsk_ring_prod txq; | ||
446 | struct xsk_ring_prod umem_fq; | ||
447 | struct xsk_ring_cons umem_cq; | ||
448 | struct xsk_socket *xsk; | ||
449 | int umem_fq_initialized; | ||
450 | |||
451 | u64 n_pkts_rx; | ||
452 | u64 n_pkts_tx; | ||
453 | }; | ||
454 | |||
455 | static void | ||
456 | port_free(struct port *p) | ||
457 | { | ||
458 | if (!p) | ||
459 | return; | ||
460 | |||
461 | /* To keep this example simple, the code to free the buffers from the | ||
462 | * socket's receive and transmit queues, as well as from the UMEM fill | ||
463 | * and completion queues, is not included. | ||
464 | */ | ||
465 | |||
466 | if (p->xsk) | ||
467 | xsk_socket__delete(p->xsk); | ||
468 | |||
469 | bcache_free(p->bc); | ||
470 | |||
471 | free(p); | ||
472 | } | ||
473 | |||
474 | static struct port * | ||
475 | port_init(struct port_params *params) | ||
476 | { | ||
477 | struct port *p; | ||
478 | u32 umem_fq_size, pos = 0; | ||
479 | int status, i; | ||
480 | |||
481 | /* Memory allocation and initialization. */ | ||
482 | p = calloc(sizeof(struct port), 1); | ||
483 | if (!p) | ||
484 | return NULL; | ||
485 | |||
486 | memcpy(&p->params, params, sizeof(p->params)); | ||
487 | umem_fq_size = params->bp->umem_cfg.fill_size; | ||
488 | |||
489 | /* bcache. */ | ||
490 | p->bc = bcache_init(params->bp); | ||
491 | if (!p->bc || | ||
492 | (bcache_slab_size(p->bc) < umem_fq_size) || | ||
493 | (bcache_cons_check(p->bc, umem_fq_size) < umem_fq_size)) { | ||
494 | port_free(p); | ||
495 | return NULL; | ||
496 | } | ||
497 | |||
498 | /* xsk socket. */ | ||
499 | status = xsk_socket__create_shared(&p->xsk, | ||
500 | params->iface, | ||
501 | params->iface_queue, | ||
502 | params->bp->umem, | ||
503 | &p->rxq, | ||
504 | &p->txq, | ||
505 | &p->umem_fq, | ||
506 | &p->umem_cq, | ||
507 | ¶ms->xsk_cfg); | ||
508 | if (status) { | ||
509 | port_free(p); | ||
510 | return NULL; | ||
511 | } | ||
512 | |||
513 | /* umem fq. */ | ||
514 | xsk_ring_prod__reserve(&p->umem_fq, umem_fq_size, &pos); | ||
515 | |||
516 | for (i = 0; i < umem_fq_size; i++) | ||
517 | *xsk_ring_prod__fill_addr(&p->umem_fq, pos + i) = | ||
518 | bcache_cons(p->bc); | ||
519 | |||
520 | xsk_ring_prod__submit(&p->umem_fq, umem_fq_size); | ||
521 | p->umem_fq_initialized = 1; | ||
522 | |||
523 | return p; | ||
524 | } | ||
525 | |||
526 | static inline u32 | ||
527 | port_rx_burst(struct port *p, struct burst_rx *b) | ||
528 | { | ||
529 | u32 n_pkts, pos, i; | ||
530 | |||
531 | /* Free buffers for FQ replenish. */ | ||
532 | n_pkts = ARRAY_SIZE(b->addr); | ||
533 | |||
534 | n_pkts = bcache_cons_check(p->bc, n_pkts); | ||
535 | if (!n_pkts) | ||
536 | return 0; | ||
537 | |||
538 | /* RXQ. */ | ||
539 | n_pkts = xsk_ring_cons__peek(&p->rxq, n_pkts, &pos); | ||
540 | if (!n_pkts) { | ||
541 | if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) { | ||
542 | struct pollfd pollfd = { | ||
543 | .fd = xsk_socket__fd(p->xsk), | ||
544 | .events = POLLIN, | ||
545 | }; | ||
546 | |||
547 | poll(&pollfd, 1, 0); | ||
548 | } | ||
549 | return 0; | ||
550 | } | ||
551 | |||
552 | for (i = 0; i < n_pkts; i++) { | ||
553 | b->addr[i] = xsk_ring_cons__rx_desc(&p->rxq, pos + i)->addr; | ||
554 | b->len[i] = xsk_ring_cons__rx_desc(&p->rxq, pos + i)->len; | ||
555 | } | ||
556 | |||
557 | xsk_ring_cons__release(&p->rxq, n_pkts); | ||
558 | p->n_pkts_rx += n_pkts; | ||
559 | |||
560 | /* UMEM FQ. */ | ||
561 | for ( ; ; ) { | ||
562 | int status; | ||
563 | |||
564 | status = xsk_ring_prod__reserve(&p->umem_fq, n_pkts, &pos); | ||
565 | if (status == n_pkts) | ||
566 | break; | ||
567 | |||
568 | if (xsk_ring_prod__needs_wakeup(&p->umem_fq)) { | ||
569 | struct pollfd pollfd = { | ||
570 | .fd = xsk_socket__fd(p->xsk), | ||
571 | .events = POLLIN, | ||
572 | }; | ||
573 | |||
574 | poll(&pollfd, 1, 0); | ||
575 | } | ||
576 | } | ||
577 | |||
578 | for (i = 0; i < n_pkts; i++) | ||
579 | *xsk_ring_prod__fill_addr(&p->umem_fq, pos + i) = | ||
580 | bcache_cons(p->bc); | ||
581 | |||
582 | xsk_ring_prod__submit(&p->umem_fq, n_pkts); | ||
583 | |||
584 | return n_pkts; | ||
585 | } | ||
586 | |||
587 | static inline void | ||
588 | port_tx_burst(struct port *p, struct burst_tx *b) | ||
589 | { | ||
590 | u32 n_pkts, pos, i; | ||
591 | int status; | ||
592 | |||
593 | /* UMEM CQ. */ | ||
594 | n_pkts = p->params.bp->umem_cfg.comp_size; | ||
595 | |||
596 | n_pkts = xsk_ring_cons__peek(&p->umem_cq, n_pkts, &pos); | ||
597 | |||
598 | for (i = 0; i < n_pkts; i++) { | ||
599 | u64 addr = *xsk_ring_cons__comp_addr(&p->umem_cq, pos + i); | ||
600 | |||
601 | bcache_prod(p->bc, addr); | ||
602 | } | ||
603 | |||
604 | xsk_ring_cons__release(&p->umem_cq, n_pkts); | ||
605 | |||
606 | /* TXQ. */ | ||
607 | n_pkts = b->n_pkts; | ||
608 | |||
609 | for ( ; ; ) { | ||
610 | status = xsk_ring_prod__reserve(&p->txq, n_pkts, &pos); | ||
611 | if (status == n_pkts) | ||
612 | break; | ||
613 | |||
614 | if (xsk_ring_prod__needs_wakeup(&p->txq)) | ||
615 | sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, | ||
616 | NULL, 0); | ||
617 | } | ||
618 | |||
619 | for (i = 0; i < n_pkts; i++) { | ||
620 | xsk_ring_prod__tx_desc(&p->txq, pos + i)->addr = b->addr[i]; | ||
621 | xsk_ring_prod__tx_desc(&p->txq, pos + i)->len = b->len[i]; | ||
622 | } | ||
623 | |||
624 | xsk_ring_prod__submit(&p->txq, n_pkts); | ||
625 | if (xsk_ring_prod__needs_wakeup(&p->txq)) | ||
626 | sendto(xsk_socket__fd(p->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); | ||
627 | p->n_pkts_tx += n_pkts; | ||
628 | } | ||
629 | |||
630 | /* | ||
631 | * Thread | ||
632 | * | ||
633 | * Packet forwarding threads. | ||
634 | */ | ||
635 | #ifndef MAX_PORTS_PER_THREAD | ||
636 | #define MAX_PORTS_PER_THREAD 16 | ||
637 | #endif | ||
638 | |||
639 | struct thread_data { | ||
640 | struct port *ports_rx[MAX_PORTS_PER_THREAD]; | ||
641 | struct port *ports_tx[MAX_PORTS_PER_THREAD]; | ||
642 | u32 n_ports_rx; | ||
643 | struct burst_rx burst_rx; | ||
644 | struct burst_tx burst_tx[MAX_PORTS_PER_THREAD]; | ||
645 | u32 cpu_core_id; | ||
646 | int quit; | ||
647 | }; | ||
648 | |||
649 | static void swap_mac_addresses(void *data) | ||
650 | { | ||
651 | struct ether_header *eth = (struct ether_header *)data; | ||
652 | struct ether_addr *src_addr = (struct ether_addr *)ð->ether_shost; | ||
653 | struct ether_addr *dst_addr = (struct ether_addr *)ð->ether_dhost; | ||
654 | struct ether_addr tmp; | ||
655 | |||
656 | tmp = *src_addr; | ||
657 | *src_addr = *dst_addr; | ||
658 | *dst_addr = tmp; | ||
659 | } | ||
660 | |||
661 | static void * | ||
662 | thread_func(void *arg) | ||
663 | { | ||
664 | struct thread_data *t = arg; | ||
665 | cpu_set_t cpu_cores; | ||
666 | u32 i; | ||
667 | |||
668 | CPU_ZERO(&cpu_cores); | ||
669 | CPU_SET(t->cpu_core_id, &cpu_cores); | ||
670 | pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_cores); | ||
671 | |||
672 | for (i = 0; !t->quit; i = (i + 1) & (t->n_ports_rx - 1)) { | ||
673 | struct port *port_rx = t->ports_rx[i]; | ||
674 | struct port *port_tx = t->ports_tx[i]; | ||
675 | struct burst_rx *brx = &t->burst_rx; | ||
676 | struct burst_tx *btx = &t->burst_tx[i]; | ||
677 | u32 n_pkts, j; | ||
678 | |||
679 | /* RX. */ | ||
680 | n_pkts = port_rx_burst(port_rx, brx); | ||
681 | if (!n_pkts) | ||
682 | continue; | ||
683 | |||
684 | /* Process & TX. */ | ||
685 | for (j = 0; j < n_pkts; j++) { | ||
686 | u64 addr = xsk_umem__add_offset_to_addr(brx->addr[j]); | ||
687 | u8 *pkt = xsk_umem__get_data(port_rx->params.bp->addr, | ||
688 | addr); | ||
689 | |||
690 | swap_mac_addresses(pkt); | ||
691 | |||
692 | btx->addr[btx->n_pkts] = brx->addr[j]; | ||
693 | btx->len[btx->n_pkts] = brx->len[j]; | ||
694 | btx->n_pkts++; | ||
695 | |||
696 | if (btx->n_pkts == MAX_BURST_TX) { | ||
697 | port_tx_burst(port_tx, btx); | ||
698 | btx->n_pkts = 0; | ||
699 | } | ||
700 | } | ||
701 | } | ||
702 | |||
703 | return NULL; | ||
704 | } | ||
705 | |||
706 | /* | ||
707 | * Process | ||
708 | */ | ||
709 | static const struct bpool_params bpool_params_default = { | ||
710 | .n_buffers = 64 * 1024, | ||
711 | .buffer_size = XSK_UMEM__DEFAULT_FRAME_SIZE, | ||
712 | .mmap_flags = 0, | ||
713 | |||
714 | .n_users_max = 16, | ||
715 | .n_buffers_per_slab = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, | ||
716 | }; | ||
717 | |||
718 | static const struct xsk_umem_config umem_cfg_default = { | ||
719 | .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, | ||
720 | .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, | ||
721 | .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE, | ||
722 | .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, | ||
723 | .flags = 0, | ||
724 | }; | ||
725 | |||
726 | static const struct port_params port_params_default = { | ||
727 | .xsk_cfg = { | ||
728 | .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, | ||
729 | .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, | ||
730 | .libbpf_flags = 0, | ||
731 | .xdp_flags = XDP_FLAGS_DRV_MODE, | ||
732 | .bind_flags = XDP_USE_NEED_WAKEUP | XDP_ZEROCOPY, | ||
733 | }, | ||
734 | |||
735 | .bp = NULL, | ||
736 | .iface = NULL, | ||
737 | .iface_queue = 0, | ||
738 | }; | ||
739 | |||
740 | #ifndef MAX_PORTS | ||
741 | #define MAX_PORTS 64 | ||
742 | #endif | ||
743 | |||
744 | #ifndef MAX_THREADS | ||
745 | #define MAX_THREADS 64 | ||
746 | #endif | ||
747 | |||
748 | static struct bpool_params bpool_params; | ||
749 | static struct xsk_umem_config umem_cfg; | ||
750 | static struct bpool *bp; | ||
751 | |||
752 | static struct port_params port_params[MAX_PORTS]; | ||
753 | static struct port *ports[MAX_PORTS]; | ||
754 | static u64 n_pkts_rx[MAX_PORTS]; | ||
755 | static u64 n_pkts_tx[MAX_PORTS]; | ||
756 | static int n_ports; | ||
757 | |||
758 | static pthread_t threads[MAX_THREADS]; | ||
759 | static struct thread_data thread_data[MAX_THREADS]; | ||
760 | static int n_threads; | ||
761 | |||
762 | static void | ||
763 | print_usage(char *prog_name) | ||
764 | { | ||
765 | const char *usage = | ||
766 | "Usage:\n" | ||
767 | "\t%s [ -b SIZE ] -c CORE -i INTERFACE [ -q QUEUE ]\n" | ||
768 | "\n" | ||
769 | "-c CORE CPU core to run a packet forwarding thread\n" | ||
770 | " on. May be invoked multiple times.\n" | ||
771 | "\n" | ||
772 | "-b SIZE Number of buffers in the buffer pool shared\n" | ||
773 | " by all the forwarding threads. Default: %u.\n" | ||
774 | "\n" | ||
775 | "-i INTERFACE Network interface. Each (INTERFACE, QUEUE)\n" | ||
776 | " pair specifies one forwarding port. May be\n" | ||
777 | " invoked multiple times.\n" | ||
778 | "\n" | ||
779 | "-q QUEUE Network interface queue for RX and TX. Each\n" | ||
780 | " (INTERFACE, QUEUE) pair specified one\n" | ||
781 | " forwarding port. Default: %u. May be invoked\n" | ||
782 | " multiple times.\n" | ||
783 | "\n"; | ||
784 | printf(usage, | ||
785 | prog_name, | ||
786 | bpool_params_default.n_buffers, | ||
787 | port_params_default.iface_queue); | ||
788 | } | ||
789 | |||
790 | static int | ||
791 | parse_args(int argc, char **argv) | ||
792 | { | ||
793 | struct option lgopts[] = { | ||
794 | { NULL, 0, 0, 0 } | ||
795 | }; | ||
796 | int opt, option_index; | ||
797 | |||
798 | /* Parse the input arguments. */ | ||
799 | for ( ; ;) { | ||
800 | opt = getopt_long(argc, argv, "b:c:i:q:", lgopts, &option_index); | ||
801 | if (opt == EOF) | ||
802 | break; | ||
803 | |||
804 | switch (opt) { | ||
805 | case 'b': | ||
806 | bpool_params.n_buffers = atoi(optarg); | ||
807 | break; | ||
808 | |||
809 | case 'c': | ||
810 | if (n_threads == MAX_THREADS) { | ||
811 | printf("Max number of threads (%d) reached.\n", | ||
812 | MAX_THREADS); | ||
813 | return -1; | ||
814 | } | ||
815 | |||
816 | thread_data[n_threads].cpu_core_id = atoi(optarg); | ||
817 | n_threads++; | ||
818 | break; | ||
819 | |||
820 | case 'i': | ||
821 | if (n_ports == MAX_PORTS) { | ||
822 | printf("Max number of ports (%d) reached.\n", | ||
823 | MAX_PORTS); | ||
824 | return -1; | ||
825 | } | ||
826 | |||
827 | port_params[n_ports].iface = optarg; | ||
828 | port_params[n_ports].iface_queue = 0; | ||
829 | n_ports++; | ||
830 | break; | ||
831 | |||
832 | case 'q': | ||
833 | if (n_ports == 0) { | ||
834 | printf("No port specified for queue.\n"); | ||
835 | return -1; | ||
836 | } | ||
837 | port_params[n_ports - 1].iface_queue = atoi(optarg); | ||
838 | break; | ||
839 | |||
840 | default: | ||
841 | printf("Illegal argument.\n"); | ||
842 | return -1; | ||
843 | } | ||
844 | } | ||
845 | |||
846 | optind = 1; /* reset getopt lib */ | ||
847 | |||
848 | /* Check the input arguments. */ | ||
849 | if (!n_ports) { | ||
850 | printf("No ports specified.\n"); | ||
851 | return -1; | ||
852 | } | ||
853 | |||
854 | if (!n_threads) { | ||
855 | printf("No threads specified.\n"); | ||
856 | return -1; | ||
857 | } | ||
858 | |||
859 | if (n_ports % n_threads) { | ||
860 | printf("Ports cannot be evenly distributed to threads.\n"); | ||
861 | return -1; | ||
862 | } | ||
863 | |||
864 | return 0; | ||
865 | } | ||
866 | |||
867 | static void | ||
868 | print_port(u32 port_id) | ||
869 | { | ||
870 | struct port *port = ports[port_id]; | ||
871 | |||
872 | printf("Port %u: interface = %s, queue = %u\n", | ||
873 | port_id, port->params.iface, port->params.iface_queue); | ||
874 | } | ||
875 | |||
876 | static void | ||
877 | print_thread(u32 thread_id) | ||
878 | { | ||
879 | struct thread_data *t = &thread_data[thread_id]; | ||
880 | u32 i; | ||
881 | |||
882 | printf("Thread %u (CPU core %u): ", | ||
883 | thread_id, t->cpu_core_id); | ||
884 | |||
885 | for (i = 0; i < t->n_ports_rx; i++) { | ||
886 | struct port *port_rx = t->ports_rx[i]; | ||
887 | struct port *port_tx = t->ports_tx[i]; | ||
888 | |||
889 | printf("(%s, %u) -> (%s, %u), ", | ||
890 | port_rx->params.iface, | ||
891 | port_rx->params.iface_queue, | ||
892 | port_tx->params.iface, | ||
893 | port_tx->params.iface_queue); | ||
894 | } | ||
895 | |||
896 | printf("\n"); | ||
897 | } | ||
898 | |||
899 | static void | ||
900 | print_port_stats_separator(void) | ||
901 | { | ||
902 | printf("+-%4s-+-%12s-+-%13s-+-%12s-+-%13s-+\n", | ||
903 | "----", | ||
904 | "------------", | ||
905 | "-------------", | ||
906 | "------------", | ||
907 | "-------------"); | ||
908 | } | ||
909 | |||
910 | static void | ||
911 | print_port_stats_header(void) | ||
912 | { | ||
913 | print_port_stats_separator(); | ||
914 | printf("| %4s | %12s | %13s | %12s | %13s |\n", | ||
915 | "Port", | ||
916 | "RX packets", | ||
917 | "RX rate (pps)", | ||
918 | "TX packets", | ||
919 | "TX_rate (pps)"); | ||
920 | print_port_stats_separator(); | ||
921 | } | ||
922 | |||
923 | static void | ||
924 | print_port_stats_trailer(void) | ||
925 | { | ||
926 | print_port_stats_separator(); | ||
927 | printf("\n"); | ||
928 | } | ||
929 | |||
930 | static void | ||
931 | print_port_stats(int port_id, u64 ns_diff) | ||
932 | { | ||
933 | struct port *p = ports[port_id]; | ||
934 | double rx_pps, tx_pps; | ||
935 | |||
936 | rx_pps = (p->n_pkts_rx - n_pkts_rx[port_id]) * 1000000000. / ns_diff; | ||
937 | tx_pps = (p->n_pkts_tx - n_pkts_tx[port_id]) * 1000000000. / ns_diff; | ||
938 | |||
939 | printf("| %4d | %12llu | %13.0f | %12llu | %13.0f |\n", | ||
940 | port_id, | ||
941 | p->n_pkts_rx, | ||
942 | rx_pps, | ||
943 | p->n_pkts_tx, | ||
944 | tx_pps); | ||
945 | |||
946 | n_pkts_rx[port_id] = p->n_pkts_rx; | ||
947 | n_pkts_tx[port_id] = p->n_pkts_tx; | ||
948 | } | ||
949 | |||
950 | static void | ||
951 | print_port_stats_all(u64 ns_diff) | ||
952 | { | ||
953 | int i; | ||
954 | |||
955 | print_port_stats_header(); | ||
956 | for (i = 0; i < n_ports; i++) | ||
957 | print_port_stats(i, ns_diff); | ||
958 | print_port_stats_trailer(); | ||
959 | } | ||
960 | |||
961 | static int quit; | ||
962 | |||
963 | static void | ||
964 | signal_handler(int sig) | ||
965 | { | ||
966 | quit = 1; | ||
967 | } | ||
968 | |||
969 | static void remove_xdp_program(void) | ||
970 | { | ||
971 | int i; | ||
972 | |||
973 | for (i = 0 ; i < n_ports; i++) | ||
974 | bpf_set_link_xdp_fd(if_nametoindex(port_params[i].iface), -1, | ||
975 | port_params[i].xsk_cfg.xdp_flags); | ||
976 | } | ||
977 | |||
978 | int main(int argc, char **argv) | ||
979 | { | ||
980 | struct timespec time; | ||
981 | u64 ns0; | ||
982 | int i; | ||
983 | |||
984 | /* Parse args. */ | ||
985 | memcpy(&bpool_params, &bpool_params_default, | ||
986 | sizeof(struct bpool_params)); | ||
987 | memcpy(&umem_cfg, &umem_cfg_default, | ||
988 | sizeof(struct xsk_umem_config)); | ||
989 | for (i = 0; i < MAX_PORTS; i++) | ||
990 | memcpy(&port_params[i], &port_params_default, | ||
991 | sizeof(struct port_params)); | ||
992 | |||
993 | if (parse_args(argc, argv)) { | ||
994 | print_usage(argv[0]); | ||
995 | return -1; | ||
996 | } | ||
997 | |||
998 | /* Buffer pool initialization. */ | ||
999 | bp = bpool_init(&bpool_params, &umem_cfg); | ||
1000 | if (!bp) { | ||
1001 | printf("Buffer pool initialization failed.\n"); | ||
1002 | return -1; | ||
1003 | } | ||
1004 | printf("Buffer pool created successfully.\n"); | ||
1005 | |||
1006 | /* Ports initialization. */ | ||
1007 | for (i = 0; i < MAX_PORTS; i++) | ||
1008 | port_params[i].bp = bp; | ||
1009 | |||
1010 | for (i = 0; i < n_ports; i++) { | ||
1011 | ports[i] = port_init(&port_params[i]); | ||
1012 | if (!ports[i]) { | ||
1013 | printf("Port %d initialization failed.\n", i); | ||
1014 | return -1; | ||
1015 | } | ||
1016 | print_port(i); | ||
1017 | } | ||
1018 | printf("All ports created successfully.\n"); | ||
1019 | |||
1020 | /* Threads. */ | ||
1021 | for (i = 0; i < n_threads; i++) { | ||
1022 | struct thread_data *t = &thread_data[i]; | ||
1023 | u32 n_ports_per_thread = n_ports / n_threads, j; | ||
1024 | |||
1025 | for (j = 0; j < n_ports_per_thread; j++) { | ||
1026 | t->ports_rx[j] = ports[i * n_ports_per_thread + j]; | ||
1027 | t->ports_tx[j] = ports[i * n_ports_per_thread + | ||
1028 | (j + 1) % n_ports_per_thread]; | ||
1029 | } | ||
1030 | |||
1031 | t->n_ports_rx = n_ports_per_thread; | ||
1032 | |||
1033 | print_thread(i); | ||
1034 | } | ||
1035 | |||
1036 | for (i = 0; i < n_threads; i++) { | ||
1037 | int status; | ||
1038 | |||
1039 | status = pthread_create(&threads[i], | ||
1040 | NULL, | ||
1041 | thread_func, | ||
1042 | &thread_data[i]); | ||
1043 | if (status) { | ||
1044 | printf("Thread %d creation failed.\n", i); | ||
1045 | return -1; | ||
1046 | } | ||
1047 | } | ||
1048 | printf("All threads created successfully.\n"); | ||
1049 | |||
1050 | /* Print statistics. */ | ||
1051 | signal(SIGINT, signal_handler); | ||
1052 | signal(SIGTERM, signal_handler); | ||
1053 | signal(SIGABRT, signal_handler); | ||
1054 | |||
1055 | clock_gettime(CLOCK_MONOTONIC, &time); | ||
1056 | ns0 = time.tv_sec * 1000000000UL + time.tv_nsec; | ||
1057 | for ( ; !quit; ) { | ||
1058 | u64 ns1, ns_diff; | ||
1059 | |||
1060 | sleep(1); | ||
1061 | clock_gettime(CLOCK_MONOTONIC, &time); | ||
1062 | ns1 = time.tv_sec * 1000000000UL + time.tv_nsec; | ||
1063 | ns_diff = ns1 - ns0; | ||
1064 | ns0 = ns1; | ||
1065 | |||
1066 | print_port_stats_all(ns_diff); | ||
1067 | } | ||
1068 | |||
1069 | /* Threads completion. */ | ||
1070 | printf("Quit.\n"); | ||
1071 | for (i = 0; i < n_threads; i++) | ||
1072 | thread_data[i].quit = 1; | ||
1073 | |||
1074 | for (i = 0; i < n_threads; i++) | ||
1075 | pthread_join(threads[i], NULL); | ||
1076 | |||
1077 | for (i = 0; i < n_ports; i++) | ||
1078 | port_free(ports[i]); | ||
1079 | |||
1080 | bpool_free(bp); | ||
1081 | |||
1082 | remove_xdp_program(); | ||
1083 | |||
1084 | return 0; | ||
1085 | } | ||
diff --git a/samples/configfs/Makefile b/samples/configfs/Makefile new file mode 100644 index 000000000..92d661fcb --- /dev/null +++ b/samples/configfs/Makefile | |||
@@ -0,0 +1,3 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | |||
3 | obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs_sample.o | ||
diff --git a/samples/configfs/configfs_sample.c b/samples/configfs/configfs_sample.c new file mode 100644 index 000000000..f9008be7a --- /dev/null +++ b/samples/configfs/configfs_sample.c | |||
@@ -0,0 +1,369 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
2 | /* | ||
3 | * vim: noexpandtab ts=8 sts=0 sw=8: | ||
4 | * | ||
5 | * configfs_sample.c - This file is a demonstration module | ||
6 | * containing a number of configfs subsystems. It uses the helper | ||
7 | * macros defined by configfs.h | ||
8 | * | ||
9 | * Based on sysfs: | ||
10 | * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel | ||
11 | * | ||
12 | * configfs Copyright (C) 2005 Oracle. All rights reserved. | ||
13 | */ | ||
14 | |||
15 | #include <linux/init.h> | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/slab.h> | ||
19 | #include <linux/configfs.h> | ||
20 | |||
21 | /* | ||
22 | * 01-childless | ||
23 | * | ||
24 | * This first example is a childless subsystem. It cannot create | ||
25 | * any config_items. It just has attributes. | ||
26 | * | ||
27 | * Note that we are enclosing the configfs_subsystem inside a container. | ||
28 | * This is not necessary if a subsystem has no attributes directly | ||
29 | * on the subsystem. See the next example, 02-simple-children, for | ||
30 | * such a subsystem. | ||
31 | */ | ||
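From userspace, this subsystem shows up as a /sys/kernel/config/01-childless/ directory containing one file per attribute. As a rough sketch (assuming configfs is mounted at that conventional location and this module is loaded; error handling kept minimal), a standalone program could exercise the attributes like this:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[64];
        ssize_t n;
        int fd;

        /* Store a value through the read-write "storeme" attribute. */
        fd = open("/sys/kernel/config/01-childless/storeme", O_WRONLY);
        if (fd < 0)
                return 1;
        if (write(fd, "42\n", 3) != 3) {
                close(fd);
                return 1;
        }
        close(fd);

        /* Read it back; "showme" and "description" are read the same way. */
        fd = open("/sys/kernel/config/01-childless/storeme", O_RDONLY);
        if (fd < 0)
                return 1;
        n = read(fd, buf, sizeof(buf) - 1);
        close(fd);
        if (n > 0) {
                buf[n] = '\0';
                printf("storeme = %s", buf);
        }
        return 0;
}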
32 | |||
33 | struct childless { | ||
34 | struct configfs_subsystem subsys; | ||
35 | int showme; | ||
36 | int storeme; | ||
37 | }; | ||
38 | |||
39 | static inline struct childless *to_childless(struct config_item *item) | ||
40 | { | ||
41 | return container_of(to_configfs_subsystem(to_config_group(item)), | ||
42 | struct childless, subsys); | ||
43 | } | ||
44 | |||
45 | static ssize_t childless_showme_show(struct config_item *item, char *page) | ||
46 | { | ||
47 | struct childless *childless = to_childless(item); | ||
48 | ssize_t pos; | ||
49 | |||
50 | pos = sprintf(page, "%d\n", childless->showme); | ||
51 | childless->showme++; | ||
52 | |||
53 | return pos; | ||
54 | } | ||
55 | |||
56 | static ssize_t childless_storeme_show(struct config_item *item, char *page) | ||
57 | { | ||
58 | return sprintf(page, "%d\n", to_childless(item)->storeme); | ||
59 | } | ||
60 | |||
61 | static ssize_t childless_storeme_store(struct config_item *item, | ||
62 | const char *page, size_t count) | ||
63 | { | ||
64 | struct childless *childless = to_childless(item); | ||
65 | int ret; | ||
66 | |||
67 | ret = kstrtoint(page, 10, &childless->storeme); | ||
68 | if (ret) | ||
69 | return ret; | ||
70 | |||
71 | return count; | ||
72 | } | ||
73 | |||
74 | static ssize_t childless_description_show(struct config_item *item, char *page) | ||
75 | { | ||
76 | return sprintf(page, | ||
77 | "[01-childless]\n" | ||
78 | "\n" | ||
79 | "The childless subsystem is the simplest possible subsystem in\n" | ||
80 | "configfs. It does not support the creation of child config_items.\n" | ||
81 | "It only has a few attributes. In fact, it isn't much different\n" | ||
82 | "than a directory in /proc.\n"); | ||
83 | } | ||
84 | |||
85 | CONFIGFS_ATTR_RO(childless_, showme); | ||
86 | CONFIGFS_ATTR(childless_, storeme); | ||
87 | CONFIGFS_ATTR_RO(childless_, description); | ||
88 | |||
89 | static struct configfs_attribute *childless_attrs[] = { | ||
90 | &childless_attr_showme, | ||
91 | &childless_attr_storeme, | ||
92 | &childless_attr_description, | ||
93 | NULL, | ||
94 | }; | ||
95 | |||
96 | static const struct config_item_type childless_type = { | ||
97 | .ct_attrs = childless_attrs, | ||
98 | .ct_owner = THIS_MODULE, | ||
99 | }; | ||
100 | |||
101 | static struct childless childless_subsys = { | ||
102 | .subsys = { | ||
103 | .su_group = { | ||
104 | .cg_item = { | ||
105 | .ci_namebuf = "01-childless", | ||
106 | .ci_type = &childless_type, | ||
107 | }, | ||
108 | }, | ||
109 | }, | ||
110 | }; | ||
111 | |||
112 | /* ----------------------------------------------------------------- */ | ||
113 | |||
114 | /* | ||
115 | * 02-simple-children | ||
116 | * | ||
117 | * This example merely has a simple one-attribute child. Note that | ||
118 | * there is no extra attribute structure, as the child's attribute is | ||
119 | * known from the get-go. Also, there is no container for the | ||
120 | * subsystem, as it has no attributes of its own. | ||
121 | */ | ||
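From userspace (same assumptions as the sketch for 01-childless above: configfs mounted at /sys/kernel/config, module loaded; the item name "foo" below is arbitrary), creating a child item is simply a mkdir inside the subsystem directory, which invokes simple_children_make_item(); removing it with rmdir eventually releases it via simple_child_release():

#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
        int fd;

        /* Create a child item; its "storeme" attribute appears as a file. */
        if (mkdir("/sys/kernel/config/02-simple-children/foo", 0755) &&
            errno != EEXIST)
                return 1;

        fd = open("/sys/kernel/config/02-simple-children/foo/storeme", O_WRONLY);
        if (fd < 0)
                return 1;
        if (write(fd, "7\n", 2) != 2) {
                close(fd);
                return 1;
        }
        close(fd);

        /* Drop the item again. */
        rmdir("/sys/kernel/config/02-simple-children/foo");
        return 0;
}

The 03-group-children subsystem further down works the same way, except that the first mkdir creates a whole simple_children group, inside which further mkdir calls create simple_child items.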
122 | |||
123 | struct simple_child { | ||
124 | struct config_item item; | ||
125 | int storeme; | ||
126 | }; | ||
127 | |||
128 | static inline struct simple_child *to_simple_child(struct config_item *item) | ||
129 | { | ||
130 | return container_of(item, struct simple_child, item); | ||
131 | } | ||
132 | |||
133 | static ssize_t simple_child_storeme_show(struct config_item *item, char *page) | ||
134 | { | ||
135 | return sprintf(page, "%d\n", to_simple_child(item)->storeme); | ||
136 | } | ||
137 | |||
138 | static ssize_t simple_child_storeme_store(struct config_item *item, | ||
139 | const char *page, size_t count) | ||
140 | { | ||
141 | struct simple_child *simple_child = to_simple_child(item); | ||
142 | int ret; | ||
143 | |||
144 | ret = kstrtoint(page, 10, &simple_child->storeme); | ||
145 | if (ret) | ||
146 | return ret; | ||
147 | |||
148 | return count; | ||
149 | } | ||
150 | |||
151 | CONFIGFS_ATTR(simple_child_, storeme); | ||
152 | |||
153 | static struct configfs_attribute *simple_child_attrs[] = { | ||
154 | &simple_child_attr_storeme, | ||
155 | NULL, | ||
156 | }; | ||
157 | |||
158 | static void simple_child_release(struct config_item *item) | ||
159 | { | ||
160 | kfree(to_simple_child(item)); | ||
161 | } | ||
162 | |||
163 | static struct configfs_item_operations simple_child_item_ops = { | ||
164 | .release = simple_child_release, | ||
165 | }; | ||
166 | |||
167 | static const struct config_item_type simple_child_type = { | ||
168 | .ct_item_ops = &simple_child_item_ops, | ||
169 | .ct_attrs = simple_child_attrs, | ||
170 | .ct_owner = THIS_MODULE, | ||
171 | }; | ||
172 | |||
173 | struct simple_children { | ||
174 | struct config_group group; | ||
175 | }; | ||
176 | |||
177 | static inline struct simple_children *to_simple_children(struct config_item *item) | ||
178 | { | ||
179 | return container_of(to_config_group(item), | ||
180 | struct simple_children, group); | ||
181 | } | ||
182 | |||
183 | static struct config_item *simple_children_make_item(struct config_group *group, | ||
184 | const char *name) | ||
185 | { | ||
186 | struct simple_child *simple_child; | ||
187 | |||
188 | simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL); | ||
189 | if (!simple_child) | ||
190 | return ERR_PTR(-ENOMEM); | ||
191 | |||
192 | config_item_init_type_name(&simple_child->item, name, | ||
193 | &simple_child_type); | ||
194 | |||
195 | return &simple_child->item; | ||
196 | } | ||
197 | |||
198 | static ssize_t simple_children_description_show(struct config_item *item, | ||
199 | char *page) | ||
200 | { | ||
201 | return sprintf(page, | ||
202 | "[02-simple-children]\n" | ||
203 | "\n" | ||
204 | "This subsystem allows the creation of child config_items. These\n" | ||
205 | "items have only one attribute that is readable and writeable.\n"); | ||
206 | } | ||
207 | |||
208 | CONFIGFS_ATTR_RO(simple_children_, description); | ||
209 | |||
210 | static struct configfs_attribute *simple_children_attrs[] = { | ||
211 | &simple_children_attr_description, | ||
212 | NULL, | ||
213 | }; | ||
214 | |||
215 | static void simple_children_release(struct config_item *item) | ||
216 | { | ||
217 | kfree(to_simple_children(item)); | ||
218 | } | ||
219 | |||
220 | static struct configfs_item_operations simple_children_item_ops = { | ||
221 | .release = simple_children_release, | ||
222 | }; | ||
223 | |||
224 | /* | ||
225 | * Note that, since no extra work is required on ->drop_item(), | ||
226 | * no ->drop_item() is provided. | ||
227 | */ | ||
228 | static struct configfs_group_operations simple_children_group_ops = { | ||
229 | .make_item = simple_children_make_item, | ||
230 | }; | ||
231 | |||
232 | static const struct config_item_type simple_children_type = { | ||
233 | .ct_item_ops = &simple_children_item_ops, | ||
234 | .ct_group_ops = &simple_children_group_ops, | ||
235 | .ct_attrs = simple_children_attrs, | ||
236 | .ct_owner = THIS_MODULE, | ||
237 | }; | ||
238 | |||
239 | static struct configfs_subsystem simple_children_subsys = { | ||
240 | .su_group = { | ||
241 | .cg_item = { | ||
242 | .ci_namebuf = "02-simple-children", | ||
243 | .ci_type = &simple_children_type, | ||
244 | }, | ||
245 | }, | ||
246 | }; | ||
247 | |||
248 | /* ----------------------------------------------------------------- */ | ||
249 | |||
250 | /* | ||
251 | * 03-group-children | ||
252 | * | ||
253 | * This example reuses the simple_children group from above. However, | ||
254 | * the simple_children group is not the subsystem itself; it is a | ||
255 | * child of the subsystem. Creation of a group in the subsystem creates | ||
256 | * a new simple_children group. That group can then have simple_child | ||
257 | * children of its own. | ||
258 | */ | ||
259 | |||
260 | static struct config_group *group_children_make_group( | ||
261 | struct config_group *group, const char *name) | ||
262 | { | ||
263 | struct simple_children *simple_children; | ||
264 | |||
265 | simple_children = kzalloc(sizeof(struct simple_children), | ||
266 | GFP_KERNEL); | ||
267 | if (!simple_children) | ||
268 | return ERR_PTR(-ENOMEM); | ||
269 | |||
270 | config_group_init_type_name(&simple_children->group, name, | ||
271 | &simple_children_type); | ||
272 | |||
273 | return &simple_children->group; | ||
274 | } | ||
275 | |||
276 | static ssize_t group_children_description_show(struct config_item *item, | ||
277 | char *page) | ||
278 | { | ||
279 | return sprintf(page, | ||
280 | "[03-group-children]\n" | ||
281 | "\n" | ||
282 | "This subsystem allows the creation of child config_groups. These\n" | ||
283 | "groups are like the subsystem simple-children.\n"); | ||
284 | } | ||
285 | |||
286 | CONFIGFS_ATTR_RO(group_children_, description); | ||
287 | |||
288 | static struct configfs_attribute *group_children_attrs[] = { | ||
289 | &group_children_attr_description, | ||
290 | NULL, | ||
291 | }; | ||
292 | |||
293 | /* | ||
294 | * Note that, since no extra work is required on ->drop_item(), | ||
295 | * no ->drop_item() is provided. | ||
296 | */ | ||
297 | static struct configfs_group_operations group_children_group_ops = { | ||
298 | .make_group = group_children_make_group, | ||
299 | }; | ||
300 | |||
301 | static const struct config_item_type group_children_type = { | ||
302 | .ct_group_ops = &group_children_group_ops, | ||
303 | .ct_attrs = group_children_attrs, | ||
304 | .ct_owner = THIS_MODULE, | ||
305 | }; | ||
306 | |||
307 | static struct configfs_subsystem group_children_subsys = { | ||
308 | .su_group = { | ||
309 | .cg_item = { | ||
310 | .ci_namebuf = "03-group-children", | ||
311 | .ci_type = &group_children_type, | ||
312 | }, | ||
313 | }, | ||
314 | }; | ||
315 | |||
316 | /* ----------------------------------------------------------------- */ | ||
317 | |||
318 | /* | ||
319 | * We're now done with our subsystem definitions. | ||
320 | * For convenience in this module, here's a list of them all. It | ||
321 | * allows the init function to easily register them. Most modules | ||
322 | * will only have one subsystem, and will only call register_subsystem | ||
323 | * on it directly. | ||
324 | */ | ||
325 | static struct configfs_subsystem *example_subsys[] = { | ||
326 | &childless_subsys.subsys, | ||
327 | &simple_children_subsys, | ||
328 | &group_children_subsys, | ||
329 | NULL, | ||
330 | }; | ||
331 | |||
332 | static int __init configfs_example_init(void) | ||
333 | { | ||
334 | struct configfs_subsystem *subsys; | ||
335 | int ret, i; | ||
336 | |||
337 | for (i = 0; example_subsys[i]; i++) { | ||
338 | subsys = example_subsys[i]; | ||
339 | |||
340 | config_group_init(&subsys->su_group); | ||
341 | mutex_init(&subsys->su_mutex); | ||
342 | ret = configfs_register_subsystem(subsys); | ||
343 | if (ret) { | ||
344 | pr_err("Error %d while registering subsystem %s\n", | ||
345 | ret, subsys->su_group.cg_item.ci_namebuf); | ||
346 | goto out_unregister; | ||
347 | } | ||
348 | } | ||
349 | |||
350 | return 0; | ||
351 | |||
352 | out_unregister: | ||
353 | for (i--; i >= 0; i--) | ||
354 | configfs_unregister_subsystem(example_subsys[i]); | ||
355 | |||
356 | return ret; | ||
357 | } | ||
358 | |||
359 | static void __exit configfs_example_exit(void) | ||
360 | { | ||
361 | int i; | ||
362 | |||
363 | for (i = 0; example_subsys[i]; i++) | ||
364 | configfs_unregister_subsystem(example_subsys[i]); | ||
365 | } | ||
366 | |||
367 | module_init(configfs_example_init); | ||
368 | module_exit(configfs_example_exit); | ||
369 | MODULE_LICENSE("GPL"); | ||
diff --git a/samples/connector/.gitignore b/samples/connector/.gitignore new file mode 100644 index 000000000..d86f2ff9c --- /dev/null +++ b/samples/connector/.gitignore | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | ucon | ||
diff --git a/samples/connector/Makefile b/samples/connector/Makefile new file mode 100644 index 000000000..d98a9e047 --- /dev/null +++ b/samples/connector/Makefile | |||
@@ -0,0 +1,6 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | ||
2 | obj-$(CONFIG_SAMPLE_CONNECTOR) += cn_test.o | ||
3 | |||
4 | userprogs-always-$(CONFIG_CC_CAN_LINK) += ucon | ||
5 | |||
6 | userccflags += -I usr/include | ||
diff --git a/samples/connector/cn_test.c b/samples/connector/cn_test.c new file mode 100644 index 000000000..0958a171d --- /dev/null +++ b/samples/connector/cn_test.c | |||
@@ -0,0 +1,188 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
2 | /* | ||
3 | * cn_test.c | ||
4 | * | ||
5 | * 2004+ Copyright (c) Evgeniy Polyakov <zbr@ioremap.net> | ||
6 | * All rights reserved. | ||
7 | */ | ||
8 | |||
9 | #define pr_fmt(fmt) "cn_test: " fmt | ||
10 | |||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/moduleparam.h> | ||
14 | #include <linux/skbuff.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/timer.h> | ||
17 | |||
18 | #include <linux/connector.h> | ||
19 | |||
20 | static struct cb_id cn_test_id = { CN_NETLINK_USERS + 3, 0x456 }; | ||
21 | static char cn_test_name[] = "cn_test"; | ||
22 | static struct sock *nls; | ||
23 | static struct timer_list cn_test_timer; | ||
24 | |||
25 | static void cn_test_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) | ||
26 | { | ||
27 | pr_info("%s: %lu: idx=%x, val=%x, seq=%u, ack=%u, len=%d: %s.\n", | ||
28 | __func__, jiffies, msg->id.idx, msg->id.val, | ||
29 | msg->seq, msg->ack, msg->len, | ||
30 | msg->len ? (char *)msg->data : ""); | ||
31 | } | ||
32 | |||
33 | /* | ||
34 | * Do not remove this function even if no one is using it, as | ||
35 | * this is an example of how to get notifications about new | ||
36 | * connector user registration. | ||
37 | */ | ||
38 | #if 0 | ||
39 | static int cn_test_want_notify(void) | ||
40 | { | ||
41 | struct cn_ctl_msg *ctl; | ||
42 | struct cn_notify_req *req; | ||
43 | struct cn_msg *msg = NULL; | ||
44 | int size, size0; | ||
45 | struct sk_buff *skb; | ||
46 | struct nlmsghdr *nlh; | ||
47 | u32 group = 1; | ||
48 | |||
49 | size0 = sizeof(*msg) + sizeof(*ctl) + 3 * sizeof(*req); | ||
50 | |||
51 | size = NLMSG_SPACE(size0); | ||
52 | |||
53 | skb = alloc_skb(size, GFP_ATOMIC); | ||
54 | if (!skb) { | ||
55 | pr_err("failed to allocate new skb with size=%u\n", size); | ||
56 | return -ENOMEM; | ||
57 | } | ||
58 | |||
59 | nlh = nlmsg_put(skb, 0, 0x123, NLMSG_DONE, size - sizeof(*nlh), 0); | ||
60 | if (!nlh) { | ||
61 | kfree_skb(skb); | ||
62 | return -EMSGSIZE; | ||
63 | } | ||
64 | |||
65 | msg = nlmsg_data(nlh); | ||
66 | |||
67 | memset(msg, 0, size0); | ||
68 | |||
69 | msg->id.idx = -1; | ||
70 | msg->id.val = -1; | ||
71 | msg->seq = 0x123; | ||
72 | msg->ack = 0x345; | ||
73 | msg->len = size0 - sizeof(*msg); | ||
74 | |||
75 | ctl = (struct cn_ctl_msg *)(msg + 1); | ||
76 | |||
77 | ctl->idx_notify_num = 1; | ||
78 | ctl->val_notify_num = 2; | ||
79 | ctl->group = group; | ||
80 | ctl->len = msg->len - sizeof(*ctl); | ||
81 | |||
82 | req = (struct cn_notify_req *)(ctl + 1); | ||
83 | |||
84 | /* | ||
85 | * Idx. | ||
86 | */ | ||
87 | req->first = cn_test_id.idx; | ||
88 | req->range = 10; | ||
89 | |||
90 | /* | ||
91 | * Val 0. | ||
92 | */ | ||
93 | req++; | ||
94 | req->first = cn_test_id.val; | ||
95 | req->range = 10; | ||
96 | |||
97 | /* | ||
98 | * Val 1. | ||
99 | */ | ||
100 | req++; | ||
101 | req->first = cn_test_id.val + 20; | ||
102 | req->range = 10; | ||
103 | |||
104 | NETLINK_CB(skb).dst_group = ctl->group; | ||
105 | //netlink_broadcast(nls, skb, 0, ctl->group, GFP_ATOMIC); | ||
106 | netlink_unicast(nls, skb, 0, 0); | ||
107 | |||
108 | pr_info("request was sent: group=0x%x\n", ctl->group); | ||
109 | |||
110 | return 0; | ||
111 | } | ||
112 | #endif | ||
113 | |||
114 | static u32 cn_test_timer_counter; | ||
115 | static void cn_test_timer_func(struct timer_list *unused) | ||
116 | { | ||
117 | struct cn_msg *m; | ||
118 | char data[32]; | ||
119 | |||
120 | pr_debug("%s: timer fired\n", __func__); | ||
121 | |||
122 | m = kzalloc(sizeof(*m) + sizeof(data), GFP_ATOMIC); | ||
123 | if (m) { | ||
124 | |||
125 | memcpy(&m->id, &cn_test_id, sizeof(m->id)); | ||
126 | m->seq = cn_test_timer_counter; | ||
127 | m->len = sizeof(data); | ||
128 | |||
129 | m->len = | ||
130 | scnprintf(data, sizeof(data), "counter = %u", | ||
131 | cn_test_timer_counter) + 1; | ||
132 | |||
133 | memcpy(m + 1, data, m->len); | ||
134 | |||
135 | cn_netlink_send(m, 0, 0, GFP_ATOMIC); | ||
136 | kfree(m); | ||
137 | } | ||
138 | |||
139 | cn_test_timer_counter++; | ||
140 | |||
141 | mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000)); | ||
142 | } | ||
143 | |||
144 | static int cn_test_init(void) | ||
145 | { | ||
146 | int err; | ||
147 | |||
148 | err = cn_add_callback(&cn_test_id, cn_test_name, cn_test_callback); | ||
149 | if (err) | ||
150 | goto err_out; | ||
151 | cn_test_id.val++; | ||
152 | err = cn_add_callback(&cn_test_id, cn_test_name, cn_test_callback); | ||
153 | if (err) { | ||
154 | cn_del_callback(&cn_test_id); | ||
155 | goto err_out; | ||
156 | } | ||
157 | |||
158 | timer_setup(&cn_test_timer, cn_test_timer_func, 0); | ||
159 | mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000)); | ||
160 | |||
161 | pr_info("initialized with id={%u.%u}\n", | ||
162 | cn_test_id.idx, cn_test_id.val); | ||
163 | |||
164 | return 0; | ||
165 | |||
166 | err_out: | ||
167 | if (nls && nls->sk_socket) | ||
168 | sock_release(nls->sk_socket); | ||
169 | |||
170 | return err; | ||
171 | } | ||
172 | |||
173 | static void cn_test_fini(void) | ||
174 | { | ||
175 | del_timer_sync(&cn_test_timer); | ||
176 | cn_del_callback(&cn_test_id); | ||
177 | cn_test_id.val--; | ||
178 | cn_del_callback(&cn_test_id); | ||
179 | if (nls && nls->sk_socket) | ||
180 | sock_release(nls->sk_socket); | ||
181 | } | ||
182 | |||
183 | module_init(cn_test_init); | ||
184 | module_exit(cn_test_fini); | ||
185 | |||
186 | MODULE_LICENSE("GPL"); | ||
187 | MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); | ||
188 | MODULE_DESCRIPTION("Connector's test module"); | ||
diff --git a/samples/connector/ucon.c b/samples/connector/ucon.c new file mode 100644 index 000000000..fa17f8642 --- /dev/null +++ b/samples/connector/ucon.c | |||
@@ -0,0 +1,236 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
2 | /* | ||
3 | * ucon.c | ||
4 | * | ||
5 | * Copyright (c) 2004+ Evgeniy Polyakov <zbr@ioremap.net> | ||
6 | */ | ||
7 | |||
8 | #include <asm/types.h> | ||
9 | |||
10 | #include <sys/types.h> | ||
11 | #include <sys/socket.h> | ||
12 | #include <sys/poll.h> | ||
13 | |||
14 | #include <linux/netlink.h> | ||
15 | #include <linux/rtnetlink.h> | ||
16 | |||
17 | #include <arpa/inet.h> | ||
18 | |||
19 | #include <stdbool.h> | ||
20 | #include <stdio.h> | ||
21 | #include <stdlib.h> | ||
22 | #include <unistd.h> | ||
23 | #include <string.h> | ||
24 | #include <errno.h> | ||
25 | #include <time.h> | ||
26 | #include <getopt.h> | ||
27 | |||
28 | #include <linux/connector.h> | ||
29 | |||
30 | #define DEBUG | ||
31 | #define NETLINK_CONNECTOR 11 | ||
32 | |||
33 | /* Hopefully your userspace connector.h matches this kernel */ | ||
34 | #define CN_TEST_IDX CN_NETLINK_USERS + 3 | ||
35 | #define CN_TEST_VAL 0x456 | ||
36 | |||
37 | #ifdef DEBUG | ||
38 | #define ulog(f, a...) fprintf(stdout, f, ##a) | ||
39 | #else | ||
40 | #define ulog(f, a...) do {} while (0) | ||
41 | #endif | ||
42 | |||
43 | static int need_exit; | ||
44 | static __u32 seq; | ||
45 | |||
46 | static int netlink_send(int s, struct cn_msg *msg) | ||
47 | { | ||
48 | struct nlmsghdr *nlh; | ||
49 | unsigned int size; | ||
50 | int err; | ||
51 | char buf[128]; | ||
52 | struct cn_msg *m; | ||
53 | |||
54 | size = NLMSG_SPACE(sizeof(struct cn_msg) + msg->len); | ||
55 | |||
56 | nlh = (struct nlmsghdr *)buf; | ||
57 | nlh->nlmsg_seq = seq++; | ||
58 | nlh->nlmsg_pid = getpid(); | ||
59 | nlh->nlmsg_type = NLMSG_DONE; | ||
60 | nlh->nlmsg_len = size; | ||
61 | nlh->nlmsg_flags = 0; | ||
62 | |||
63 | m = NLMSG_DATA(nlh); | ||
64 | #if 0 | ||
65 | ulog("%s: [%08x.%08x] len=%u, seq=%u, ack=%u.\n", | ||
66 | __func__, msg->id.idx, msg->id.val, msg->len, msg->seq, msg->ack); | ||
67 | #endif | ||
68 | memcpy(m, msg, sizeof(*m) + msg->len); | ||
69 | |||
70 | err = send(s, nlh, size, 0); | ||
71 | if (err == -1) | ||
72 | ulog("Failed to send: %s [%d].\n", | ||
73 | strerror(errno), errno); | ||
74 | |||
75 | return err; | ||
76 | } | ||
77 | |||
78 | static void usage(void) | ||
79 | { | ||
80 | printf( | ||
81 | "Usage: ucon [options] [output file]\n" | ||
82 | "\n" | ||
83 | "\t-h\tthis help screen\n" | ||
84 | "\t-s\tsend buffers to the test module\n" | ||
85 | "\n" | ||
86 | "The default behavior of ucon is to subscribe to the test module\n" | ||
87 | "and wait for state messages. Any ones received are dumped to the\n" | ||
88 | "specified output file (or stdout). The test module is assumed to\n" | ||
89 | "have an id of {%u.%u}\n" | ||
90 | "\n" | ||
91 | "If you get no output, then verify the cn_test module id matches\n" | ||
92 | "the expected id above.\n" | ||
93 | , CN_TEST_IDX, CN_TEST_VAL | ||
94 | ); | ||
95 | } | ||
96 | |||
97 | int main(int argc, char *argv[]) | ||
98 | { | ||
99 | int s; | ||
100 | char buf[1024]; | ||
101 | int len; | ||
102 | struct nlmsghdr *reply; | ||
103 | struct sockaddr_nl l_local; | ||
104 | struct cn_msg *data; | ||
105 | FILE *out; | ||
106 | time_t tm; | ||
107 | struct pollfd pfd; | ||
108 | bool send_msgs = false; | ||
109 | |||
110 | while ((s = getopt(argc, argv, "hs")) != -1) { | ||
111 | switch (s) { | ||
112 | case 's': | ||
113 | send_msgs = true; | ||
114 | break; | ||
115 | |||
116 | case 'h': | ||
117 | usage(); | ||
118 | return 0; | ||
119 | |||
120 | default: | ||
121 | /* getopt() outputs an error for us */ | ||
122 | usage(); | ||
123 | return 1; | ||
124 | } | ||
125 | } | ||
126 | |||
127 | if (argc != optind) { | ||
128 | out = fopen(argv[optind], "a+"); | ||
129 | if (!out) { | ||
130 | ulog("Unable to open %s for writing: %s\n", | ||
131 | argv[optind], strerror(errno)); | ||
132 | out = stdout; | ||
133 | } | ||
134 | } else | ||
135 | out = stdout; | ||
136 | |||
137 | memset(buf, 0, sizeof(buf)); | ||
138 | |||
139 | s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR); | ||
140 | if (s == -1) { | ||
141 | perror("socket"); | ||
142 | return -1; | ||
143 | } | ||
144 | |||
145 | l_local.nl_family = AF_NETLINK; | ||
146 | l_local.nl_groups = -1; /* bitmask of requested groups */ | ||
147 | l_local.nl_pid = 0; | ||
148 | |||
149 | ulog("subscribing to %u.%u\n", CN_TEST_IDX, CN_TEST_VAL); | ||
150 | |||
151 | if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) { | ||
152 | perror("bind"); | ||
153 | close(s); | ||
154 | return -1; | ||
155 | } | ||
156 | |||
157 | #if 0 | ||
158 | { | ||
159 | int on = 0x57; /* Additional group number */ | ||
160 | setsockopt(s, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &on, sizeof(on)); | ||
161 | } | ||
162 | #endif | ||
163 | if (send_msgs) { | ||
164 | int i, j; | ||
165 | |||
166 | memset(buf, 0, sizeof(buf)); | ||
167 | |||
168 | data = (struct cn_msg *)buf; | ||
169 | |||
170 | data->id.idx = CN_TEST_IDX; | ||
171 | data->id.val = CN_TEST_VAL; | ||
172 | data->seq = seq++; | ||
173 | data->ack = 0; | ||
174 | data->len = 0; | ||
175 | |||
176 | for (j=0; j<10; ++j) { | ||
177 | for (i=0; i<1000; ++i) { | ||
178 | len = netlink_send(s, data); | ||
179 | } | ||
180 | |||
181 | ulog("%d messages have been sent to %08x.%08x.\n", i, data->id.idx, data->id.val); | ||
182 | } | ||
183 | |||
184 | return 0; | ||
185 | } | ||
186 | |||
187 | |||
188 | pfd.fd = s; | ||
189 | |||
190 | while (!need_exit) { | ||
191 | pfd.events = POLLIN; | ||
192 | pfd.revents = 0; | ||
193 | switch (poll(&pfd, 1, -1)) { | ||
194 | case 0: | ||
195 | need_exit = 1; | ||
196 | break; | ||
197 | case -1: | ||
198 | if (errno != EINTR) { | ||
199 | need_exit = 1; | ||
200 | break; | ||
201 | } | ||
202 | continue; | ||
203 | } | ||
204 | if (need_exit) | ||
205 | break; | ||
206 | |||
207 | memset(buf, 0, sizeof(buf)); | ||
208 | len = recv(s, buf, sizeof(buf), 0); | ||
209 | if (len == -1) { | ||
210 | perror("recv buf"); | ||
211 | close(s); | ||
212 | return -1; | ||
213 | } | ||
214 | reply = (struct nlmsghdr *)buf; | ||
215 | |||
216 | switch (reply->nlmsg_type) { | ||
217 | case NLMSG_ERROR: | ||
218 | fprintf(out, "Error message received.\n"); | ||
219 | fflush(out); | ||
220 | break; | ||
221 | case NLMSG_DONE: | ||
222 | data = (struct cn_msg *)NLMSG_DATA(reply); | ||
223 | |||
224 | time(&tm); | ||
225 | fprintf(out, "%.24s : [%x.%x] [%08u.%08u].\n", | ||
226 | ctime(&tm), data->id.idx, data->id.val, data->seq, data->ack); | ||
227 | fflush(out); | ||
228 | break; | ||
229 | default: | ||
230 | break; | ||
231 | } | ||
232 | } | ||
233 | |||
234 | close(s); | ||
235 | return 0; | ||
236 | } | ||
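Editor's note on the file above: netlink_send() packs a struct cn_msg (header plus payload) into a single netlink message, and the NLMSG_SPACE()/NLMSG_DATA() framing is the part that is easiest to get wrong. The sketch below is illustrative only and not part of this commit; the helper name frame_cn_msg() is made up.

/* Illustrative sketch only: the NLMSG framing used by netlink_send() above.
 * The helper name frame_cn_msg() is invented for this example. */
#include <string.h>
#include <unistd.h>
#include <linux/netlink.h>
#include <linux/connector.h>

static int frame_cn_msg(char *buf, size_t buflen, const struct cn_msg *msg)
{
	struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
	size_t size = NLMSG_SPACE(sizeof(*msg) + msg->len);

	if (size > buflen)
		return -1;			/* caller's buffer is too small */

	memset(buf, 0, size);
	nlh->nlmsg_len = size;			/* header plus padded payload */
	nlh->nlmsg_type = NLMSG_DONE;		/* connector sends plain data messages */
	nlh->nlmsg_flags = 0;
	nlh->nlmsg_pid = getpid();

	/* NLMSG_DATA() points just past the aligned netlink header */
	memcpy(NLMSG_DATA(nlh), msg, sizeof(*msg) + msg->len);

	return (int)size;
}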
diff --git a/samples/ftrace/Makefile b/samples/ftrace/Makefile new file mode 100644 index 000000000..4ce896e10 --- /dev/null +++ b/samples/ftrace/Makefile | |||
@@ -0,0 +1,8 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | |||
3 | obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct.o | ||
4 | obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-too.o | ||
5 | obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-modify.o | ||
6 | |||
7 | CFLAGS_sample-trace-array.o := -I$(src) | ||
8 | obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += sample-trace-array.o | ||
diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c new file mode 100644 index 000000000..d620f3da0 --- /dev/null +++ b/samples/ftrace/ftrace-direct-modify.c | |||
@@ -0,0 +1,97 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/kthread.h> | ||
4 | #include <linux/ftrace.h> | ||
5 | |||
6 | extern void my_direct_func1(void); | ||
7 | extern void my_direct_func2(void); | ||
8 | |||
9 | void my_direct_func1(void) | ||
10 | { | ||
11 | trace_printk("my direct func1\n"); | ||
12 | } | ||
13 | |||
14 | void my_direct_func2(void) | ||
15 | { | ||
16 | trace_printk("my direct func2\n"); | ||
17 | } | ||
18 | |||
19 | extern void my_tramp1(void *); | ||
20 | extern void my_tramp2(void *); | ||
21 | |||
22 | static unsigned long my_ip = (unsigned long)schedule; | ||
23 | |||
24 | asm ( | ||
25 | " .pushsection .text, \"ax\", @progbits\n" | ||
26 | " .type my_tramp1, @function\n" | ||
27 | " .globl my_tramp1\n" | ||
28 | " my_tramp1:" | ||
29 | " pushq %rbp\n" | ||
30 | " movq %rsp, %rbp\n" | ||
31 | " call my_direct_func1\n" | ||
32 | " leave\n" | ||
33 | " .size my_tramp1, .-my_tramp1\n" | ||
34 | ASM_RET | ||
35 | " .type my_tramp2, @function\n" | ||
36 | " .globl my_tramp2\n" | ||
37 | " my_tramp2:" | ||
38 | " pushq %rbp\n" | ||
39 | " movq %rsp, %rbp\n" | ||
40 | " call my_direct_func2\n" | ||
41 | " leave\n" | ||
42 | ASM_RET | ||
43 | " .size my_tramp2, .-my_tramp2\n" | ||
44 | " .popsection\n" | ||
45 | ); | ||
46 | |||
47 | static unsigned long my_tramp = (unsigned long)my_tramp1; | ||
48 | static unsigned long tramps[2] = { | ||
49 | (unsigned long)my_tramp1, | ||
50 | (unsigned long)my_tramp2, | ||
51 | }; | ||
52 | |||
53 | static int simple_thread(void *arg) | ||
54 | { | ||
55 | static int t; | ||
56 | int ret = 0; | ||
57 | |||
58 | while (!kthread_should_stop()) { | ||
59 | set_current_state(TASK_INTERRUPTIBLE); | ||
60 | schedule_timeout(2 * HZ); | ||
61 | |||
62 | if (ret) | ||
63 | continue; | ||
64 | t ^= 1; | ||
65 | ret = modify_ftrace_direct(my_ip, my_tramp, tramps[t]); | ||
66 | if (!ret) | ||
67 | my_tramp = tramps[t]; | ||
68 | WARN_ON_ONCE(ret); | ||
69 | } | ||
70 | |||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | static struct task_struct *simple_tsk; | ||
75 | |||
76 | static int __init ftrace_direct_init(void) | ||
77 | { | ||
78 | int ret; | ||
79 | |||
80 | ret = register_ftrace_direct(my_ip, my_tramp); | ||
81 | if (!ret) | ||
82 | simple_tsk = kthread_run(simple_thread, NULL, "event-sample-fn"); | ||
83 | return ret; | ||
84 | } | ||
85 | |||
86 | static void __exit ftrace_direct_exit(void) | ||
87 | { | ||
88 | kthread_stop(simple_tsk); | ||
89 | unregister_ftrace_direct(my_ip, my_tramp); | ||
90 | } | ||
91 | |||
92 | module_init(ftrace_direct_init); | ||
93 | module_exit(ftrace_direct_exit); | ||
94 | |||
95 | MODULE_AUTHOR("Steven Rostedt"); | ||
96 | MODULE_DESCRIPTION("Example use case of using modify_ftrace_direct()"); | ||
97 | MODULE_LICENSE("GPL"); | ||
diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c new file mode 100644 index 000000000..3927cb880 --- /dev/null +++ b/samples/ftrace/ftrace-direct-too.c | |||
@@ -0,0 +1,57 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | #include <linux/module.h> | ||
3 | |||
4 | #include <linux/mm.h> /* for handle_mm_fault() */ | ||
5 | #include <linux/ftrace.h> | ||
6 | |||
7 | extern void my_direct_func(struct vm_area_struct *vma, | ||
8 | unsigned long address, unsigned int flags); | ||
9 | |||
10 | void my_direct_func(struct vm_area_struct *vma, | ||
11 | unsigned long address, unsigned int flags) | ||
12 | { | ||
13 | trace_printk("handle mm fault vma=%p address=%lx flags=%x\n", | ||
14 | vma, address, flags); | ||
15 | } | ||
16 | |||
17 | extern void my_tramp(void *); | ||
18 | |||
19 | asm ( | ||
20 | " .pushsection .text, \"ax\", @progbits\n" | ||
21 | " .type my_tramp, @function\n" | ||
22 | " .globl my_tramp\n" | ||
23 | " my_tramp:" | ||
24 | " pushq %rbp\n" | ||
25 | " movq %rsp, %rbp\n" | ||
26 | " pushq %rdi\n" | ||
27 | " pushq %rsi\n" | ||
28 | " pushq %rdx\n" | ||
29 | " call my_direct_func\n" | ||
30 | " popq %rdx\n" | ||
31 | " popq %rsi\n" | ||
32 | " popq %rdi\n" | ||
33 | " leave\n" | ||
34 | ASM_RET | ||
35 | " .size my_tramp, .-my_tramp\n" | ||
36 | " .popsection\n" | ||
37 | ); | ||
38 | |||
39 | |||
40 | static int __init ftrace_direct_init(void) | ||
41 | { | ||
42 | return register_ftrace_direct((unsigned long)handle_mm_fault, | ||
43 | (unsigned long)my_tramp); | ||
44 | } | ||
45 | |||
46 | static void __exit ftrace_direct_exit(void) | ||
47 | { | ||
48 | unregister_ftrace_direct((unsigned long)handle_mm_fault, | ||
49 | (unsigned long)my_tramp); | ||
50 | } | ||
51 | |||
52 | module_init(ftrace_direct_init); | ||
53 | module_exit(ftrace_direct_exit); | ||
54 | |||
55 | MODULE_AUTHOR("Steven Rostedt"); | ||
56 | MODULE_DESCRIPTION("Another example use case of using register_ftrace_direct()"); | ||
57 | MODULE_LICENSE("GPL"); | ||
diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c new file mode 100644 index 000000000..1e901bb8d --- /dev/null +++ b/samples/ftrace/ftrace-direct.c | |||
@@ -0,0 +1,50 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | #include <linux/module.h> | ||
3 | |||
4 | #include <linux/sched.h> /* for wake_up_process() */ | ||
5 | #include <linux/ftrace.h> | ||
6 | |||
7 | extern void my_direct_func(struct task_struct *p); | ||
8 | |||
9 | void my_direct_func(struct task_struct *p) | ||
10 | { | ||
11 | trace_printk("waking up %s-%d\n", p->comm, p->pid); | ||
12 | } | ||
13 | |||
14 | extern void my_tramp(void *); | ||
15 | |||
16 | asm ( | ||
17 | " .pushsection .text, \"ax\", @progbits\n" | ||
18 | " .type my_tramp, @function\n" | ||
19 | " .globl my_tramp\n" | ||
20 | " my_tramp:" | ||
21 | " pushq %rbp\n" | ||
22 | " movq %rsp, %rbp\n" | ||
23 | " pushq %rdi\n" | ||
24 | " call my_direct_func\n" | ||
25 | " popq %rdi\n" | ||
26 | " leave\n" | ||
27 | ASM_RET | ||
28 | " .size my_tramp, .-my_tramp\n" | ||
29 | " .popsection\n" | ||
30 | ); | ||
31 | |||
32 | |||
33 | static int __init ftrace_direct_init(void) | ||
34 | { | ||
35 | return register_ftrace_direct((unsigned long)wake_up_process, | ||
36 | (unsigned long)my_tramp); | ||
37 | } | ||
38 | |||
39 | static void __exit ftrace_direct_exit(void) | ||
40 | { | ||
41 | unregister_ftrace_direct((unsigned long)wake_up_process, | ||
42 | (unsigned long)my_tramp); | ||
43 | } | ||
44 | |||
45 | module_init(ftrace_direct_init); | ||
46 | module_exit(ftrace_direct_exit); | ||
47 | |||
48 | MODULE_AUTHOR("Steven Rostedt"); | ||
49 | MODULE_DESCRIPTION("Example use case of using register_ftrace_direct()"); | ||
50 | MODULE_LICENSE("GPL"); | ||
diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-trace-array.c new file mode 100644 index 000000000..6aba02a31 --- /dev/null +++ b/samples/ftrace/sample-trace-array.c | |||
@@ -0,0 +1,143 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/kthread.h> | ||
4 | #include <linux/trace.h> | ||
5 | #include <linux/trace_events.h> | ||
6 | #include <linux/timer.h> | ||
7 | #include <linux/err.h> | ||
8 | #include <linux/jiffies.h> | ||
9 | #include <linux/workqueue.h> | ||
10 | |||
11 | /* | ||
12 | * Any file that uses tracepoints must include the header. | ||
13 | * But exactly one file must include the header after defining | ||
14 | * CREATE_TRACE_POINTS first. That generates the C code which | ||
15 | * creates the handles for the tracepoints. | ||
16 | */ | ||
17 | #define CREATE_TRACE_POINTS | ||
18 | #include "sample-trace-array.h" | ||
19 | |||
20 | struct trace_array *tr; | ||
21 | static void mytimer_handler(struct timer_list *unused); | ||
22 | static struct task_struct *simple_tsk; | ||
23 | |||
24 | static void trace_work_fn(struct work_struct *work) | ||
25 | { | ||
26 | /* | ||
27 | * Disable tracing for event "sample_event". | ||
28 | */ | ||
29 | trace_array_set_clr_event(tr, "sample-subsystem", "sample_event", | ||
30 | false); | ||
31 | } | ||
32 | static DECLARE_WORK(trace_work, trace_work_fn); | ||
33 | |||
34 | /* | ||
35 | * mytimer: Timer setup to disable tracing for event "sample_event". This | ||
36 | * timer exists only so that the sample module can demonstrate access to | ||
37 | * Ftrace instances from within the kernel. | ||
38 | */ | ||
39 | static DEFINE_TIMER(mytimer, mytimer_handler); | ||
40 | |||
41 | static void mytimer_handler(struct timer_list *unused) | ||
42 | { | ||
43 | schedule_work(&trace_work); | ||
44 | } | ||
45 | |||
46 | static void simple_thread_func(int count) | ||
47 | { | ||
48 | set_current_state(TASK_INTERRUPTIBLE); | ||
49 | schedule_timeout(HZ); | ||
50 | |||
51 | /* | ||
52 | * Printing count value using trace_array_printk() - trace_printk() | ||
53 | * equivalent for the instance buffers. | ||
54 | */ | ||
55 | trace_array_printk(tr, _THIS_IP_, "trace_array_printk: count=%d\n", | ||
56 | count); | ||
57 | /* | ||
58 | * Tracepoint for event "sample_event". This will print the | ||
59 | * current value of count and current jiffies. | ||
60 | */ | ||
61 | trace_sample_event(count, jiffies); | ||
62 | } | ||
63 | |||
64 | static int simple_thread(void *arg) | ||
65 | { | ||
66 | int count = 0; | ||
67 | unsigned long delay = msecs_to_jiffies(5000); | ||
68 | |||
69 | /* | ||
70 | * Enable tracing for "sample_event". | ||
71 | */ | ||
72 | trace_array_set_clr_event(tr, "sample-subsystem", "sample_event", true); | ||
73 | |||
74 | /* | ||
75 | * Add the timer - mytimer. It will disable tracing after | ||
76 | * 'delay' jiffies (five seconds here). | ||
77 | * | ||
78 | */ | ||
79 | add_timer(&mytimer); | ||
80 | mod_timer(&mytimer, jiffies+delay); | ||
81 | |||
82 | while (!kthread_should_stop()) | ||
83 | simple_thread_func(count++); | ||
84 | |||
85 | del_timer(&mytimer); | ||
86 | cancel_work_sync(&trace_work); | ||
87 | |||
88 | /* | ||
89 | * trace_array_put() decrements the reference counter associated with | ||
90 | * the trace array - "tr". We are done using the trace array, hence | ||
91 | * decrement the reference counter so that it can be destroyed using | ||
92 | * trace_array_destroy(). | ||
93 | */ | ||
94 | trace_array_put(tr); | ||
95 | |||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | static int __init sample_trace_array_init(void) | ||
100 | { | ||
101 | /* | ||
102 | * Return a pointer to the trace array with name "sample-instance" if it | ||
103 | * exists, else create a new trace array. | ||
104 | * | ||
105 | * NOTE: This function increments the reference counter | ||
106 | * associated with the trace array - "tr". | ||
107 | */ | ||
108 | tr = trace_array_get_by_name("sample-instance"); | ||
109 | |||
110 | if (!tr) | ||
111 | return -1; | ||
112 | /* | ||
113 | * Allocate the context-specific per-CPU buffers if they haven't already been allocated. | ||
114 | */ | ||
115 | trace_printk_init_buffers(); | ||
116 | |||
117 | simple_tsk = kthread_run(simple_thread, NULL, "sample-instance"); | ||
118 | if (IS_ERR(simple_tsk)) { | ||
119 | trace_array_put(tr); | ||
120 | trace_array_destroy(tr); | ||
121 | return -1; | ||
122 | } | ||
123 | |||
124 | return 0; | ||
125 | } | ||
126 | |||
127 | static void __exit sample_trace_array_exit(void) | ||
128 | { | ||
129 | kthread_stop(simple_tsk); | ||
130 | |||
131 | /* | ||
132 | * We are unloading our module and no longer require the trace array. | ||
133 | * Remove/destroy "tr" using trace_array_destroy() | ||
134 | */ | ||
135 | trace_array_destroy(tr); | ||
136 | } | ||
137 | |||
138 | module_init(sample_trace_array_init); | ||
139 | module_exit(sample_trace_array_exit); | ||
140 | |||
141 | MODULE_AUTHOR("Divya Indi"); | ||
142 | MODULE_DESCRIPTION("Sample module for kernel access to Ftrace instances"); | ||
143 | MODULE_LICENSE("GPL"); | ||
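Editor's note on the file above: the comments in sample_trace_array_init() and simple_thread() spell out the reference-counting rules for a named Ftrace instance. Condensed into the smallest possible module, and using nothing beyond the APIs already shown in the sample, the lifecycle looks like the sketch below; it is an illustration only, not part of this commit.

/* Sketch only: the get/put/destroy lifecycle of a named Ftrace instance,
 * reduced to a minimal module. Uses only APIs shown in the sample above. */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/trace.h>

static struct trace_array *my_tr;

static int __init lifecycle_init(void)
{
	/* Look up or create the instance; takes a reference on success. */
	my_tr = trace_array_get_by_name("sample-instance");
	if (!my_tr)
		return -ENOMEM;

	/* Make sure the trace_printk() per-CPU buffers exist. */
	trace_printk_init_buffers();

	trace_array_printk(my_tr, _THIS_IP_, "instance is ready\n");
	return 0;
}

static void __exit lifecycle_exit(void)
{
	trace_array_put(my_tr);		/* drop our reference ...             */
	trace_array_destroy(my_tr);	/* ... so the instance can be removed */
}

module_init(lifecycle_init);
module_exit(lifecycle_exit);
MODULE_LICENSE("GPL");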
diff --git a/samples/ftrace/sample-trace-array.h b/samples/ftrace/sample-trace-array.h new file mode 100644 index 000000000..6f8962428 --- /dev/null +++ b/samples/ftrace/sample-trace-array.h | |||
@@ -0,0 +1,84 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | |||
3 | /* | ||
4 | * If TRACE_SYSTEM is defined, that will be the directory created | ||
5 | * in the ftrace directory under /sys/kernel/tracing/events/<system> | ||
6 | * | ||
7 | * The define_trace.h below will also look for a file name of | ||
8 | * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here. | ||
9 | * In this case, it would look for sample-subsystem.h | ||
10 | * | ||
11 | * If the header name is different from the system name | ||
12 | * (as in this case), then you can override the header name that | ||
13 | * define_trace.h will look up by defining TRACE_INCLUDE_FILE | ||
14 | * | ||
15 | * This file is called sample-trace-array.h but we want the system | ||
16 | * to be called "sample-subsystem". Therefore we must define the name of this | ||
17 | * file: | ||
18 | * | ||
19 | * #define TRACE_INCLUDE_FILE sample-trace-array | ||
20 | * | ||
21 | * As we do in the bottom of this file. | ||
22 | * | ||
23 | * Notice that TRACE_SYSTEM should be defined outside of #if | ||
24 | * protection, just like TRACE_INCLUDE_FILE. | ||
25 | */ | ||
26 | #undef TRACE_SYSTEM | ||
27 | #define TRACE_SYSTEM sample-subsystem | ||
28 | |||
29 | /* | ||
30 | * TRACE_SYSTEM is expected to be a C valid variable (alpha-numeric | ||
31 | * and underscore), although it may start with numbers. If for some | ||
32 | * reason it is not, you need to add the following lines: | ||
33 | */ | ||
34 | #undef TRACE_SYSTEM_VAR | ||
35 | #define TRACE_SYSTEM_VAR sample_subsystem | ||
36 | |||
37 | /* | ||
38 | * But the above is only needed if TRACE_SYSTEM is not alpha-numeric | ||
39 | * and underscored. By default, TRACE_SYSTEM_VAR will be equal to | ||
40 | * TRACE_SYSTEM. As TRACE_SYSTEM_VAR must be alpha-numeric, if | ||
41 | * TRACE_SYSTEM is not, then TRACE_SYSTEM_VAR must be defined with | ||
42 | * only alpha-numeric and underscores. | ||
43 | * | ||
44 | * The TRACE_SYSTEM_VAR is only used internally and not visible to | ||
45 | * user space. | ||
46 | */ | ||
47 | |||
48 | /* | ||
49 | * Notice that this file is not protected like a normal header. | ||
50 | * We also must allow for rereading of this file. The | ||
51 | * | ||
52 | * || defined(TRACE_HEADER_MULTI_READ) | ||
53 | * | ||
54 | * serves this purpose. | ||
55 | */ | ||
56 | #if !defined(_SAMPLE_TRACE_ARRAY_H) || defined(TRACE_HEADER_MULTI_READ) | ||
57 | #define _SAMPLE_TRACE_ARRAY_H | ||
58 | |||
59 | #include <linux/tracepoint.h> | ||
60 | TRACE_EVENT(sample_event, | ||
61 | |||
62 | TP_PROTO(int count, unsigned long time), | ||
63 | |||
64 | TP_ARGS(count, time), | ||
65 | |||
66 | TP_STRUCT__entry( | ||
67 | __field(int, count) | ||
68 | __field(unsigned long, time) | ||
69 | ), | ||
70 | |||
71 | TP_fast_assign( | ||
72 | __entry->count = count; | ||
73 | __entry->time = time; | ||
74 | ), | ||
75 | |||
76 | TP_printk("count value=%d at jiffies=%lu", __entry->count, | ||
77 | __entry->time) | ||
78 | ); | ||
79 | #endif | ||
80 | |||
81 | #undef TRACE_INCLUDE_PATH | ||
82 | #define TRACE_INCLUDE_PATH . | ||
83 | #define TRACE_INCLUDE_FILE sample-trace-array | ||
84 | #include <trace/define_trace.h> | ||
diff --git a/samples/hck/Makefile b/samples/hck/Makefile new file mode 100644 index 000000000..1f24a99a4 --- /dev/null +++ b/samples/hck/Makefile | |||
@@ -0,0 +1,6 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | ccflags-y += -I$(src) | ||
3 | |||
4 | obj-$(CONFIG_SAMPLE_HCK_CALL) += call.o | ||
5 | obj-$(CONFIG_SAMPLE_HCK_REGISTER) += register.o | ||
6 | obj-$(CONFIG_SAMPLE_HCK_REGISTER_ONE) += register_one.o \ No newline at end of file | ||
diff --git a/samples/hck/call.c b/samples/hck/call.c new file mode 100644 index 000000000..870d5611c --- /dev/null +++ b/samples/hck/call.c | |||
@@ -0,0 +1,24 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Sample Call HCK | ||
4 | * | ||
5 | */ | ||
6 | #include <linux/module.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/hck/lite_hck_sample.h> | ||
9 | |||
10 | static int __init samplecallhck_init(void) | ||
11 | { | ||
12 | int val = 0; | ||
13 | |||
14 | pr_info("hck sample: call\n"); | ||
15 | |||
16 | CALL_HCK_LITE_HOOK(get_boot_config_lhck, &val); | ||
17 | pr_info("hck sample val changed: %d\n", val); | ||
18 | |||
19 | CALL_HCK_LITE_HOOK(set_boot_stat_lhck, val); | ||
20 | pr_info("hck sample val not changed: %d\n", val); | ||
21 | |||
22 | return 0; | ||
23 | } | ||
24 | late_initcall(samplecallhck_init); \ No newline at end of file | ||
diff --git a/samples/hck/register.c b/samples/hck/register.c new file mode 100644 index 000000000..407d05f74 --- /dev/null +++ b/samples/hck/register.c | |||
@@ -0,0 +1,48 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Sample HCK | ||
4 | * | ||
5 | */ | ||
6 | #include <linux/module.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/hck/lite_hck_sample.h> | ||
10 | |||
11 | static struct sample_hck_data data = { | ||
12 | .stat = 999, | ||
13 | .name = "sample test", | ||
14 | }; | ||
15 | |||
16 | void get_boot_config(int *info) | ||
17 | { | ||
18 | pr_info("hck sample: %s\n", __func__); | ||
19 | *info = 1; | ||
20 | } | ||
21 | |||
22 | void set_boot_stat(void *data, int info) | ||
23 | { | ||
24 | struct sample_hck_data *hdata = data; | ||
25 | |||
26 | pr_info("hck sample: %s\n", __func__); | ||
27 | info = 2; /* modifies the local copy only; the caller's value is unchanged */ | ||
28 | pr_info("hck data: stat = %d, name = %s\n", hdata->stat, hdata->name); | ||
29 | } | ||
30 | |||
31 | static int __init samplehck_init(void) | ||
32 | { | ||
33 | pr_info("hck sample register\n"); | ||
34 | |||
35 | REGISTER_HCK_LITE_HOOK(get_boot_config_lhck, get_boot_config); | ||
36 | REGISTER_HCK_LITE_DATA_HOOK(set_boot_stat_lhck, set_boot_stat, &data); | ||
37 | |||
38 | return 0; | ||
39 | } | ||
40 | |||
41 | static void __exit samplehck_exit(void) | ||
42 | { | ||
43 | } | ||
44 | |||
45 | module_init(samplehck_init); | ||
46 | module_exit(samplehck_exit); | ||
47 | MODULE_LICENSE("GPL v2"); | ||
48 | MODULE_AUTHOR("zhujiaxin <zhujiaxin@huawei.com>"); | ||
diff --git a/samples/hck/register_one.c b/samples/hck/register_one.c new file mode 100644 index 000000000..9ea2c0250 --- /dev/null +++ b/samples/hck/register_one.c | |||
@@ -0,0 +1,31 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Sample HCK | ||
4 | * | ||
5 | */ | ||
6 | #include <linux/module.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/hck/lite_hck_sample.h> | ||
9 | |||
10 | void get_boot_power_config(int* info) | ||
11 | { | ||
12 | pr_info("hck sample: intf-2 run\n"); | ||
13 | *info = 2; | ||
14 | } | ||
15 | |||
16 | static int __init samplehckone_init(void) | ||
17 | { | ||
18 | pr_info("hck sample register_one\n"); | ||
19 | REGISTER_HCK_LITE_HOOK(get_boot_config_lhck, get_boot_power_config); | ||
20 | |||
21 | return 0; | ||
22 | } | ||
23 | |||
24 | static void __exit samplehckone_exit(void) | ||
25 | { | ||
26 | } | ||
27 | |||
28 | module_init(samplehckone_init); | ||
29 | module_exit(samplehckone_exit); | ||
30 | MODULE_LICENSE("GPL v2"); | ||
31 | MODULE_AUTHOR("zhujiaxin <zhujiaxin@huawei.com>"); | ||
diff --git a/samples/hidraw/.gitignore b/samples/hidraw/.gitignore new file mode 100644 index 000000000..d7a6074eb --- /dev/null +++ b/samples/hidraw/.gitignore | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | hid-example | ||
diff --git a/samples/hidraw/Makefile b/samples/hidraw/Makefile new file mode 100644 index 000000000..594d989e5 --- /dev/null +++ b/samples/hidraw/Makefile | |||
@@ -0,0 +1,4 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | ||
2 | userprogs-always-y += hid-example | ||
3 | |||
4 | userccflags += -I usr/include | ||
diff --git a/samples/hidraw/hid-example.c b/samples/hidraw/hid-example.c new file mode 100644 index 000000000..37a0ffcb4 --- /dev/null +++ b/samples/hidraw/hid-example.c | |||
@@ -0,0 +1,182 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Hidraw Userspace Example | ||
4 | * | ||
5 | * Copyright (c) 2010 Alan Ott <alan@signal11.us> | ||
6 | * Copyright (c) 2010 Signal 11 Software | ||
7 | * | ||
8 | * The code may be used by anyone for any purpose, | ||
9 | * and can serve as a starting point for developing | ||
10 | * applications using hidraw. | ||
11 | */ | ||
12 | |||
13 | /* Linux */ | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/input.h> | ||
16 | #include <linux/hidraw.h> | ||
17 | |||
18 | /* | ||
19 | * Ugly hack to work around failing compilation on systems that don't | ||
20 | * yet ship the new version of hidraw.h to userspace. | ||
21 | */ | ||
22 | #ifndef HIDIOCSFEATURE | ||
23 | #warning Please have your distro update the userspace kernel headers | ||
24 | #define HIDIOCSFEATURE(len) _IOC(_IOC_WRITE|_IOC_READ, 'H', 0x06, len) | ||
25 | #define HIDIOCGFEATURE(len) _IOC(_IOC_WRITE|_IOC_READ, 'H', 0x07, len) | ||
26 | #endif | ||
27 | |||
28 | /* Unix */ | ||
29 | #include <sys/ioctl.h> | ||
30 | #include <sys/types.h> | ||
31 | #include <sys/stat.h> | ||
32 | #include <fcntl.h> | ||
33 | #include <unistd.h> | ||
34 | |||
35 | /* C */ | ||
36 | #include <stdio.h> | ||
37 | #include <string.h> | ||
38 | #include <stdlib.h> | ||
39 | #include <errno.h> | ||
40 | |||
41 | const char *bus_str(int bus); | ||
42 | |||
43 | int main(int argc, char **argv) | ||
44 | { | ||
45 | int fd; | ||
46 | int i, res, desc_size = 0; | ||
47 | char buf[256]; | ||
48 | struct hidraw_report_descriptor rpt_desc; | ||
49 | struct hidraw_devinfo info; | ||
50 | char *device = "/dev/hidraw0"; | ||
51 | |||
52 | if (argc > 1) | ||
53 | device = argv[1]; | ||
54 | |||
55 | /* Open the Device with non-blocking reads. In real life, | ||
56 | don't use a hard coded path; use libudev instead. */ | ||
57 | fd = open(device, O_RDWR|O_NONBLOCK); | ||
58 | |||
59 | if (fd < 0) { | ||
60 | perror("Unable to open device"); | ||
61 | return 1; | ||
62 | } | ||
63 | |||
64 | memset(&rpt_desc, 0x0, sizeof(rpt_desc)); | ||
65 | memset(&info, 0x0, sizeof(info)); | ||
66 | memset(buf, 0x0, sizeof(buf)); | ||
67 | |||
68 | /* Get Report Descriptor Size */ | ||
69 | res = ioctl(fd, HIDIOCGRDESCSIZE, &desc_size); | ||
70 | if (res < 0) | ||
71 | perror("HIDIOCGRDESCSIZE"); | ||
72 | else | ||
73 | printf("Report Descriptor Size: %d\n", desc_size); | ||
74 | |||
75 | /* Get Report Descriptor */ | ||
76 | rpt_desc.size = desc_size; | ||
77 | res = ioctl(fd, HIDIOCGRDESC, &rpt_desc); | ||
78 | if (res < 0) { | ||
79 | perror("HIDIOCGRDESC"); | ||
80 | } else { | ||
81 | printf("Report Descriptor:\n"); | ||
82 | for (i = 0; i < rpt_desc.size; i++) | ||
83 | printf("%hhx ", rpt_desc.value[i]); | ||
84 | puts("\n"); | ||
85 | } | ||
86 | |||
87 | /* Get Raw Name */ | ||
88 | res = ioctl(fd, HIDIOCGRAWNAME(256), buf); | ||
89 | if (res < 0) | ||
90 | perror("HIDIOCGRAWNAME"); | ||
91 | else | ||
92 | printf("Raw Name: %s\n", buf); | ||
93 | |||
94 | /* Get Physical Location */ | ||
95 | res = ioctl(fd, HIDIOCGRAWPHYS(256), buf); | ||
96 | if (res < 0) | ||
97 | perror("HIDIOCGRAWPHYS"); | ||
98 | else | ||
99 | printf("Raw Phys: %s\n", buf); | ||
100 | |||
101 | /* Get Raw Info */ | ||
102 | res = ioctl(fd, HIDIOCGRAWINFO, &info); | ||
103 | if (res < 0) { | ||
104 | perror("HIDIOCGRAWINFO"); | ||
105 | } else { | ||
106 | printf("Raw Info:\n"); | ||
107 | printf("\tbustype: %d (%s)\n", | ||
108 | info.bustype, bus_str(info.bustype)); | ||
109 | printf("\tvendor: 0x%04hx\n", info.vendor); | ||
110 | printf("\tproduct: 0x%04hx\n", info.product); | ||
111 | } | ||
112 | |||
113 | /* Set Feature */ | ||
114 | buf[0] = 0x9; /* Report Number */ | ||
115 | buf[1] = 0xff; | ||
116 | buf[2] = 0xff; | ||
117 | buf[3] = 0xff; | ||
118 | res = ioctl(fd, HIDIOCSFEATURE(4), buf); | ||
119 | if (res < 0) | ||
120 | perror("HIDIOCSFEATURE"); | ||
121 | else | ||
122 | printf("ioctl HIDIOCSFEATURE returned: %d\n", res); | ||
123 | |||
124 | /* Get Feature */ | ||
125 | buf[0] = 0x9; /* Report Number */ | ||
126 | res = ioctl(fd, HIDIOCGFEATURE(256), buf); | ||
127 | if (res < 0) { | ||
128 | perror("HIDIOCGFEATURE"); | ||
129 | } else { | ||
130 | printf("ioctl HIDIOCGFEATURE returned: %d\n", res); | ||
131 | printf("Report data (not containing the report number):\n\t"); | ||
132 | for (i = 0; i < res; i++) | ||
133 | printf("%hhx ", buf[i]); | ||
134 | puts("\n"); | ||
135 | } | ||
136 | |||
137 | /* Send a Report to the Device */ | ||
138 | buf[0] = 0x1; /* Report Number */ | ||
139 | buf[1] = 0x77; | ||
140 | res = write(fd, buf, 2); | ||
141 | if (res < 0) { | ||
142 | printf("Error: %d\n", errno); | ||
143 | perror("write"); | ||
144 | } else { | ||
145 | printf("write() wrote %d bytes\n", res); | ||
146 | } | ||
147 | |||
148 | /* Get a report from the device */ | ||
149 | res = read(fd, buf, 16); | ||
150 | if (res < 0) { | ||
151 | perror("read"); | ||
152 | } else { | ||
153 | printf("read() read %d bytes:\n\t", res); | ||
154 | for (i = 0; i < res; i++) | ||
155 | printf("%hhx ", buf[i]); | ||
156 | puts("\n"); | ||
157 | } | ||
158 | close(fd); | ||
159 | return 0; | ||
160 | } | ||
161 | |||
162 | const char * | ||
163 | bus_str(int bus) | ||
164 | { | ||
165 | switch (bus) { | ||
166 | case BUS_USB: | ||
167 | return "USB"; | ||
168 | break; | ||
169 | case BUS_HIL: | ||
170 | return "HIL"; | ||
171 | break; | ||
172 | case BUS_BLUETOOTH: | ||
173 | return "Bluetooth"; | ||
174 | break; | ||
175 | case BUS_VIRTUAL: | ||
176 | return "Virtual"; | ||
177 | break; | ||
178 | default: | ||
179 | return "Other"; | ||
180 | break; | ||
181 | } | ||
182 | } | ||
diff --git a/samples/hw_breakpoint/Makefile b/samples/hw_breakpoint/Makefile new file mode 100644 index 000000000..ef4b6fdd7 --- /dev/null +++ b/samples/hw_breakpoint/Makefile | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o | ||
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c new file mode 100644 index 000000000..418c46fe5 --- /dev/null +++ b/samples/hw_breakpoint/data_breakpoint.c | |||
@@ -0,0 +1,82 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
2 | /* | ||
3 | * data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address | ||
4 | * | ||
5 | * usage: insmod data_breakpoint.ko ksym=<ksym_name> | ||
6 | * | ||
7 | * This file is a kernel module that places a breakpoint over ksym_name kernel | ||
8 | * variable using Hardware Breakpoint register. The corresponding handler which | ||
9 | * prints a backtrace is invoked every time a write operation is performed on | ||
10 | * that variable. | ||
11 | * | ||
12 | * Copyright (C) IBM Corporation, 2009 | ||
13 | * | ||
14 | * Author: K.Prasad <prasad@linux.vnet.ibm.com> | ||
15 | */ | ||
16 | #include <linux/module.h> /* Needed by all modules */ | ||
17 | #include <linux/kernel.h> /* Needed for KERN_INFO */ | ||
18 | #include <linux/init.h> /* Needed for the macros */ | ||
19 | #include <linux/kallsyms.h> | ||
20 | |||
21 | #include <linux/perf_event.h> | ||
22 | #include <linux/hw_breakpoint.h> | ||
23 | |||
24 | struct perf_event * __percpu *sample_hbp; | ||
25 | |||
26 | static char ksym_name[KSYM_NAME_LEN] = "jiffies"; | ||
27 | module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO); | ||
28 | MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any" | ||
29 | " write operations on the kernel symbol"); | ||
30 | |||
31 | static void sample_hbp_handler(struct perf_event *bp, | ||
32 | struct perf_sample_data *data, | ||
33 | struct pt_regs *regs) | ||
34 | { | ||
35 | printk(KERN_INFO "%s value is changed\n", ksym_name); | ||
36 | dump_stack(); | ||
37 | printk(KERN_INFO "Dump stack from sample_hbp_handler\n"); | ||
38 | } | ||
39 | |||
40 | static int __init hw_break_module_init(void) | ||
41 | { | ||
42 | int ret; | ||
43 | struct perf_event_attr attr; | ||
44 | void *addr = __symbol_get(ksym_name); | ||
45 | |||
46 | if (!addr) | ||
47 | return -ENXIO; | ||
48 | |||
49 | hw_breakpoint_init(&attr); | ||
50 | attr.bp_addr = (unsigned long)addr; | ||
51 | attr.bp_len = HW_BREAKPOINT_LEN_4; | ||
52 | attr.bp_type = HW_BREAKPOINT_W; | ||
53 | |||
54 | sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL); | ||
55 | if (IS_ERR((void __force *)sample_hbp)) { | ||
56 | ret = PTR_ERR((void __force *)sample_hbp); | ||
57 | goto fail; | ||
58 | } | ||
59 | |||
60 | printk(KERN_INFO "HW Breakpoint for %s write installed\n", ksym_name); | ||
61 | |||
62 | return 0; | ||
63 | |||
64 | fail: | ||
65 | printk(KERN_INFO "Breakpoint registration failed\n"); | ||
66 | |||
67 | return ret; | ||
68 | } | ||
69 | |||
70 | static void __exit hw_break_module_exit(void) | ||
71 | { | ||
72 | unregister_wide_hw_breakpoint(sample_hbp); | ||
73 | symbol_put(ksym_name); | ||
74 | printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name); | ||
75 | } | ||
76 | |||
77 | module_init(hw_break_module_init); | ||
78 | module_exit(hw_break_module_exit); | ||
79 | |||
80 | MODULE_LICENSE("GPL"); | ||
81 | MODULE_AUTHOR("K.Prasad"); | ||
82 | MODULE_DESCRIPTION("ksym breakpoint"); | ||
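Editor's note on the file above: the module arms a write-only watchpoint (HW_BREAKPOINT_W). As a hedged variation that this commit does not provide, the same perf_event_attr setup can request a read/write watchpoint by changing only bp_type; the helper below is an illustrative sketch and its name is invented.

/* Hedged variation, not part of this commit: arm a read/write watchpoint
 * instead of the write-only one used by the sample. Only bp_type changes. */
#include <linux/err.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

static struct perf_event * __percpu *rw_hbp;

static int arm_rw_watchpoint(void *addr, perf_overflow_handler_t handler)
{
	struct perf_event_attr attr;

	hw_breakpoint_init(&attr);
	attr.bp_addr = (unsigned long)addr;
	attr.bp_len  = HW_BREAKPOINT_LEN_4;
	attr.bp_type = HW_BREAKPOINT_RW;	/* fire on reads and writes */

	rw_hbp = register_wide_hw_breakpoint(&attr, handler, NULL);
	if (IS_ERR((void __force *)rw_hbp))
		return PTR_ERR((void __force *)rw_hbp);

	return 0;
}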
diff --git a/samples/kdb/Makefile b/samples/kdb/Makefile new file mode 100644 index 000000000..947cb8522 --- /dev/null +++ b/samples/kdb/Makefile | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | obj-$(CONFIG_SAMPLE_KDB) += kdb_hello.o | ||
diff --git a/samples/kdb/kdb_hello.c b/samples/kdb/kdb_hello.c new file mode 100644 index 000000000..c1c2fa0f6 --- /dev/null +++ b/samples/kdb/kdb_hello.c | |||
@@ -0,0 +1,60 @@ | |||
1 | /* | ||
2 | * Created by: Jason Wessel <jason.wessel@windriver.com> | ||
3 | * | ||
4 | * Copyright (c) 2010 Wind River Systems, Inc. All Rights Reserved. | ||
5 | * | ||
6 | * This file is licensed under the terms of the GNU General Public | ||
7 | * License version 2. This program is licensed "as is" without any | ||
8 | * warranty of any kind, whether express or implied. | ||
9 | */ | ||
10 | |||
11 | #include <linux/module.h> | ||
12 | #include <linux/kdb.h> | ||
13 | |||
14 | /* | ||
15 | * All kdb shell command call backs receive argc and argv, where | ||
16 | * argv[0] is the command the end user typed | ||
17 | */ | ||
18 | static int kdb_hello_cmd(int argc, const char **argv) | ||
19 | { | ||
20 | if (argc > 1) | ||
21 | return KDB_ARGCOUNT; | ||
22 | |||
23 | if (argc) | ||
24 | kdb_printf("Hello %s.\n", argv[1]); | ||
25 | else | ||
26 | kdb_printf("Hello world!\n"); | ||
27 | |||
28 | return 0; | ||
29 | } | ||
30 | |||
31 | |||
32 | static int __init kdb_hello_cmd_init(void) | ||
33 | { | ||
34 | /* | ||
35 | * Registration of a dynamically added kdb command is done with | ||
36 | * kdb_register() with the arguments being: | ||
37 | * 1: The name of the shell command | ||
38 | * 2: The function that processes the command | ||
39 | * 3: Description of the usage of any arguments | ||
40 | * 4: Descriptive text when you run help | ||
41 | * 5: Number of characters to complete the command | ||
42 | * 0 == type the whole command | ||
43 | * 1 == match both "g" and "go" for example | ||
44 | */ | ||
45 | kdb_register("hello", kdb_hello_cmd, "[string]", | ||
46 | "Say Hello World or Hello [string]", 0); | ||
47 | return 0; | ||
48 | } | ||
49 | |||
50 | static void __exit kdb_hello_cmd_exit(void) | ||
51 | { | ||
52 | kdb_unregister("hello"); | ||
53 | } | ||
54 | |||
55 | module_init(kdb_hello_cmd_init); | ||
56 | module_exit(kdb_hello_cmd_exit); | ||
57 | |||
58 | MODULE_AUTHOR("WindRiver"); | ||
59 | MODULE_DESCRIPTION("KDB example to add a hello command"); | ||
60 | MODULE_LICENSE("GPL"); | ||
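Editor's note on the file above: the comment block in kdb_hello_cmd_init() describes kdb_register()'s fifth argument, the minimum number of characters needed to match the command. The sketch below illustrates passing 1 instead of 0; the command name "hello2" and the init function are invented, and the handler is the kdb_hello_cmd() defined in the sample.

/* Hedged illustration only: "hello2" and this init function are invented,
 * and the handler is the kdb_hello_cmd() from the sample above. */
#include <linux/init.h>
#include <linux/kdb.h>

static int __init kdb_hello_abbrev_init(void)
{
	/* Last argument = 1: the command may be abbreviated, so typing
	 * "h", "he", ... also matches, as the comment above describes. */
	return kdb_register("hello2", kdb_hello_cmd, "[string]",
			    "Same demo command, abbreviatable to 'h'", 1);
}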
diff --git a/samples/kfifo/Makefile b/samples/kfifo/Makefile new file mode 100644 index 000000000..0af5250ad --- /dev/null +++ b/samples/kfifo/Makefile | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | obj-$(CONFIG_SAMPLE_KFIFO) += bytestream-example.o dma-example.o inttype-example.o record-example.o | ||
diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c new file mode 100644 index 000000000..5a90aa527 --- /dev/null +++ b/samples/kfifo/bytestream-example.c | |||
@@ -0,0 +1,195 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Sample kfifo byte stream implementation | ||
4 | * | ||
5 | * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net> | ||
6 | */ | ||
7 | |||
8 | #include <linux/init.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/proc_fs.h> | ||
11 | #include <linux/mutex.h> | ||
12 | #include <linux/kfifo.h> | ||
13 | |||
14 | /* | ||
15 | * This module shows how to create a byte stream fifo. | ||
16 | */ | ||
17 | |||
18 | /* fifo size in elements (bytes) */ | ||
19 | #define FIFO_SIZE 32 | ||
20 | |||
21 | /* name of the proc entry */ | ||
22 | #define PROC_FIFO "bytestream-fifo" | ||
23 | |||
24 | /* lock for procfs read access */ | ||
25 | static DEFINE_MUTEX(read_lock); | ||
26 | |||
27 | /* lock for procfs write access */ | ||
28 | static DEFINE_MUTEX(write_lock); | ||
29 | |||
30 | /* | ||
31 | * define DYNAMIC in this example for a dynamically allocated fifo. | ||
32 | * | ||
33 | * Otherwise the fifo storage will be a part of the fifo structure. | ||
34 | */ | ||
35 | #if 0 | ||
36 | #define DYNAMIC | ||
37 | #endif | ||
38 | |||
39 | #ifdef DYNAMIC | ||
40 | static struct kfifo test; | ||
41 | #else | ||
42 | static DECLARE_KFIFO(test, unsigned char, FIFO_SIZE); | ||
43 | #endif | ||
44 | |||
45 | static const unsigned char expected_result[FIFO_SIZE] = { | ||
46 | 3, 4, 5, 6, 7, 8, 9, 0, | ||
47 | 1, 20, 21, 22, 23, 24, 25, 26, | ||
48 | 27, 28, 29, 30, 31, 32, 33, 34, | ||
49 | 35, 36, 37, 38, 39, 40, 41, 42, | ||
50 | }; | ||
51 | |||
52 | static int __init testfunc(void) | ||
53 | { | ||
54 | unsigned char buf[6]; | ||
55 | unsigned char i, j; | ||
56 | unsigned int ret; | ||
57 | |||
58 | printk(KERN_INFO "byte stream fifo test start\n"); | ||
59 | |||
60 | /* put string into the fifo */ | ||
61 | kfifo_in(&test, "hello", 5); | ||
62 | |||
63 | /* put values into the fifo */ | ||
64 | for (i = 0; i != 10; i++) | ||
65 | kfifo_put(&test, i); | ||
66 | |||
67 | /* show the number of used elements */ | ||
68 | printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test)); | ||
69 | |||
70 | /* get max of 5 bytes from the fifo */ | ||
71 | i = kfifo_out(&test, buf, 5); | ||
72 | printk(KERN_INFO "buf: %.*s\n", i, buf); | ||
73 | |||
74 | /* get max of 2 elements from the fifo */ | ||
75 | ret = kfifo_out(&test, buf, 2); | ||
76 | printk(KERN_INFO "ret: %d\n", ret); | ||
77 | /* and put it back to the end of the fifo */ | ||
78 | ret = kfifo_in(&test, buf, ret); | ||
79 | printk(KERN_INFO "ret: %d\n", ret); | ||
80 | |||
81 | /* skip first element of the fifo */ | ||
82 | printk(KERN_INFO "skip 1st element\n"); | ||
83 | kfifo_skip(&test); | ||
84 | |||
85 | /* put values into the fifo until is full */ | ||
86 | for (i = 20; kfifo_put(&test, i); i++) | ||
87 | ; | ||
88 | |||
89 | printk(KERN_INFO "queue len: %u\n", kfifo_len(&test)); | ||
90 | |||
91 | /* show the first value without removing from the fifo */ | ||
92 | if (kfifo_peek(&test, &i)) | ||
93 | printk(KERN_INFO "%d\n", i); | ||
94 | |||
95 | /* check the correctness of all values in the fifo */ | ||
96 | j = 0; | ||
97 | while (kfifo_get(&test, &i)) { | ||
98 | printk(KERN_INFO "item = %d\n", i); | ||
99 | if (i != expected_result[j++]) { | ||
100 | printk(KERN_WARNING "value mismatch: test failed\n"); | ||
101 | return -EIO; | ||
102 | } | ||
103 | } | ||
104 | if (j != ARRAY_SIZE(expected_result)) { | ||
105 | printk(KERN_WARNING "size mismatch: test failed\n"); | ||
106 | return -EIO; | ||
107 | } | ||
108 | printk(KERN_INFO "test passed\n"); | ||
109 | |||
110 | return 0; | ||
111 | } | ||
112 | |||
113 | static ssize_t fifo_write(struct file *file, const char __user *buf, | ||
114 | size_t count, loff_t *ppos) | ||
115 | { | ||
116 | int ret; | ||
117 | unsigned int copied; | ||
118 | |||
119 | if (mutex_lock_interruptible(&write_lock)) | ||
120 | return -ERESTARTSYS; | ||
121 | |||
122 | ret = kfifo_from_user(&test, buf, count, &copied); | ||
123 | |||
124 | mutex_unlock(&write_lock); | ||
125 | if (ret) | ||
126 | return ret; | ||
127 | |||
128 | return copied; | ||
129 | } | ||
130 | |||
131 | static ssize_t fifo_read(struct file *file, char __user *buf, | ||
132 | size_t count, loff_t *ppos) | ||
133 | { | ||
134 | int ret; | ||
135 | unsigned int copied; | ||
136 | |||
137 | if (mutex_lock_interruptible(&read_lock)) | ||
138 | return -ERESTARTSYS; | ||
139 | |||
140 | ret = kfifo_to_user(&test, buf, count, &copied); | ||
141 | |||
142 | mutex_unlock(&read_lock); | ||
143 | if (ret) | ||
144 | return ret; | ||
145 | |||
146 | return copied; | ||
147 | } | ||
148 | |||
149 | static const struct proc_ops fifo_proc_ops = { | ||
150 | .proc_read = fifo_read, | ||
151 | .proc_write = fifo_write, | ||
152 | .proc_lseek = noop_llseek, | ||
153 | }; | ||
154 | |||
155 | static int __init example_init(void) | ||
156 | { | ||
157 | #ifdef DYNAMIC | ||
158 | int ret; | ||
159 | |||
160 | ret = kfifo_alloc(&test, FIFO_SIZE, GFP_KERNEL); | ||
161 | if (ret) { | ||
162 | printk(KERN_ERR "error kfifo_alloc\n"); | ||
163 | return ret; | ||
164 | } | ||
165 | #else | ||
166 | INIT_KFIFO(test); | ||
167 | #endif | ||
168 | if (testfunc() < 0) { | ||
169 | #ifdef DYNAMIC | ||
170 | kfifo_free(&test); | ||
171 | #endif | ||
172 | return -EIO; | ||
173 | } | ||
174 | |||
175 | if (proc_create(PROC_FIFO, 0, NULL, &fifo_proc_ops) == NULL) { | ||
176 | #ifdef DYNAMIC | ||
177 | kfifo_free(&test); | ||
178 | #endif | ||
179 | return -ENOMEM; | ||
180 | } | ||
181 | return 0; | ||
182 | } | ||
183 | |||
184 | static void __exit example_exit(void) | ||
185 | { | ||
186 | remove_proc_entry(PROC_FIFO, NULL); | ||
187 | #ifdef DYNAMIC | ||
188 | kfifo_free(&test); | ||
189 | #endif | ||
190 | } | ||
191 | |||
192 | module_init(example_init); | ||
193 | module_exit(example_exit); | ||
194 | MODULE_LICENSE("GPL"); | ||
195 | MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>"); | ||
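Editor's note on the file above: fifo_read() and fifo_write() are reachable from userspace through /proc/bytestream-fifo once the module is loaded. The userspace sketch below is illustrative only; run it as root, since the sample creates the proc entry without write permission bits.

/* Userspace sketch (illustrative only): exercise the proc interface of the
 * bytestream sample. Assumes the module is already loaded. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	ssize_t n;
	int fd = open("/proc/bytestream-fifo", O_RDWR);

	if (fd < 0) {
		perror("open /proc/bytestream-fifo");
		return 1;
	}

	if (write(fd, "hello fifo", 10) < 0)	/* lands in fifo_write() */
		perror("write");

	n = read(fd, buf, sizeof(buf) - 1);	/* drains via fifo_read() */
	if (n > 0) {
		buf[n] = '\0';
		printf("read back: %s\n", buf);
	}

	close(fd);
	return 0;
}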
diff --git a/samples/kfifo/dma-example.c b/samples/kfifo/dma-example.c new file mode 100644 index 000000000..0cf27483c --- /dev/null +++ b/samples/kfifo/dma-example.c | |||
@@ -0,0 +1,141 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Sample fifo dma implementation | ||
4 | * | ||
5 | * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net> | ||
6 | */ | ||
7 | |||
8 | #include <linux/init.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/kfifo.h> | ||
11 | |||
12 | /* | ||
13 | * This module shows how to handle fifo dma operations. | ||
14 | */ | ||
15 | |||
16 | /* fifo size in elements (bytes) */ | ||
17 | #define FIFO_SIZE 32 | ||
18 | |||
19 | static struct kfifo fifo; | ||
20 | |||
21 | static int __init example_init(void) | ||
22 | { | ||
23 | int i; | ||
24 | unsigned int ret; | ||
25 | unsigned int nents; | ||
26 | struct scatterlist sg[10]; | ||
27 | |||
28 | printk(KERN_INFO "DMA fifo test start\n"); | ||
29 | |||
30 | if (kfifo_alloc(&fifo, FIFO_SIZE, GFP_KERNEL)) { | ||
31 | printk(KERN_WARNING "error kfifo_alloc\n"); | ||
32 | return -ENOMEM; | ||
33 | } | ||
34 | |||
35 | printk(KERN_INFO "queue size: %u\n", kfifo_size(&fifo)); | ||
36 | |||
37 | kfifo_in(&fifo, "test", 4); | ||
38 | |||
39 | for (i = 0; i != 9; i++) | ||
40 | kfifo_put(&fifo, i); | ||
41 | |||
42 | /* kick away first byte */ | ||
43 | kfifo_skip(&fifo); | ||
44 | |||
45 | printk(KERN_INFO "queue len: %u\n", kfifo_len(&fifo)); | ||
46 | |||
47 | /* | ||
48 | * Configure the kfifo buffer to receive data from DMA input. | ||
49 | * | ||
50 | * .--------------------------------------. | ||
51 | * | 0 | 1 | 2 | ... | 12 | 13 | ... | 31 | | ||
52 | * |---|------------------|---------------| | ||
53 | * \_/ \________________/ \_____________/ | ||
54 | * \ \ \ | ||
55 | * \ \_allocated data \ | ||
56 | * \_*free space* \_*free space* | ||
57 | * | ||
58 | * We need two different SG entries: one for the free space area at the | ||
59 | * end of the kfifo buffer (19 bytes) and another for the first free | ||
60 | * byte at the beginning, after the kfifo_skip(). | ||
61 | */ | ||
62 | sg_init_table(sg, ARRAY_SIZE(sg)); | ||
63 | nents = kfifo_dma_in_prepare(&fifo, sg, ARRAY_SIZE(sg), FIFO_SIZE); | ||
64 | printk(KERN_INFO "DMA sgl entries: %d\n", nents); | ||
65 | if (!nents) { | ||
66 | /* fifo is full and no sgl was created */ | ||
67 | printk(KERN_WARNING "error kfifo_dma_in_prepare\n"); | ||
68 | return -EIO; | ||
69 | } | ||
70 | |||
71 | /* receive data */ | ||
72 | printk(KERN_INFO "scatterlist for receive:\n"); | ||
73 | for (i = 0; i < nents; i++) { | ||
74 | printk(KERN_INFO | ||
75 | "sg[%d] -> " | ||
76 | "page %p offset 0x%.8x length 0x%.8x\n", | ||
77 | i, sg_page(&sg[i]), sg[i].offset, sg[i].length); | ||
78 | |||
79 | if (sg_is_last(&sg[i])) | ||
80 | break; | ||
81 | } | ||
82 | |||
83 | /* put your code here to set up and execute the DMA operation */ | ||
84 | /* ... */ | ||
85 | |||
86 | /* example: zero bytes received */ | ||
87 | ret = 0; | ||
88 | |||
89 | /* finish the dma operation and update the received data */ | ||
90 | kfifo_dma_in_finish(&fifo, ret); | ||
91 | |||
92 | /* Prepare to transmit data, example: 8 bytes */ | ||
93 | nents = kfifo_dma_out_prepare(&fifo, sg, ARRAY_SIZE(sg), 8); | ||
94 | printk(KERN_INFO "DMA sgl entries: %d\n", nents); | ||
95 | if (!nents) { | ||
96 | /* no data was available and no sgl was created */ | ||
97 | printk(KERN_WARNING "error kfifo_dma_out_prepare\n"); | ||
98 | return -EIO; | ||
99 | } | ||
100 | |||
101 | printk(KERN_INFO "scatterlist for transmit:\n"); | ||
102 | for (i = 0; i < nents; i++) { | ||
103 | printk(KERN_INFO | ||
104 | "sg[%d] -> " | ||
105 | "page %p offset 0x%.8x length 0x%.8x\n", | ||
106 | i, sg_page(&sg[i]), sg[i].offset, sg[i].length); | ||
107 | |||
108 | if (sg_is_last(&sg[i])) | ||
109 | break; | ||
110 | } | ||
111 | |||
112 | /* put your code here to set up and execute the DMA operation */ | ||
113 | /* ... */ | ||
114 | |||
115 | /* example: 5 bytes transmitted */ | ||
116 | ret = 5; | ||
117 | |||
118 | /* finish the dma operation and update the transmitted data */ | ||
119 | kfifo_dma_out_finish(&fifo, ret); | ||
120 | |||
121 | ret = kfifo_len(&fifo); | ||
122 | printk(KERN_INFO "queue len: %u\n", kfifo_len(&fifo)); | ||
123 | |||
124 | if (ret != 7) { | ||
125 | printk(KERN_WARNING "size mismatch: test failed\n"); | ||
126 | return -EIO; | ||
127 | } | ||
128 | printk(KERN_INFO "test passed\n"); | ||
129 | |||
130 | return 0; | ||
131 | } | ||
132 | |||
133 | static void __exit example_exit(void) | ||
134 | { | ||
135 | kfifo_free(&fifo); | ||
136 | } | ||
137 | |||
138 | module_init(example_init); | ||
139 | module_exit(example_exit); | ||
140 | MODULE_LICENSE("GPL"); | ||
141 | MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>"); | ||
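Editor's note on the file above: the two printk loops walk the prepared scatterlist by hand and stop at sg_is_last(). The same walk is commonly written with the for_each_sg() iterator from <linux/scatterlist.h>; the sketch below is a hedged equivalent of those loops, not part of this commit.

/* Hedged alternative to the manual loops above: walk the entries returned by
 * kfifo_dma_in_prepare()/kfifo_dma_out_prepare() with for_each_sg(). */
#include <linux/printk.h>
#include <linux/scatterlist.h>

static void dump_sgl(struct scatterlist *sgl, unsigned int nents)
{
	struct scatterlist *sg;
	unsigned int i;

	for_each_sg(sgl, sg, nents, i)
		pr_info("sg[%u] -> page %p offset 0x%.8x length 0x%.8x\n",
			i, sg_page(sg), sg->offset, sg->length);
}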
diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c new file mode 100644 index 000000000..e5403d8c9 --- /dev/null +++ b/samples/kfifo/inttype-example.c | |||
@@ -0,0 +1,186 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Sample kfifo int type implementation | ||
4 | * | ||
5 | * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net> | ||
6 | */ | ||
7 | |||
8 | #include <linux/init.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/proc_fs.h> | ||
11 | #include <linux/mutex.h> | ||
12 | #include <linux/kfifo.h> | ||
13 | |||
14 | /* | ||
15 | * This module shows how to create an int type fifo. | ||
16 | */ | ||
17 | |||
18 | /* fifo size in elements (ints) */ | ||
19 | #define FIFO_SIZE 32 | ||
20 | |||
21 | /* name of the proc entry */ | ||
22 | #define PROC_FIFO "int-fifo" | ||
23 | |||
24 | /* lock for procfs read access */ | ||
25 | static DEFINE_MUTEX(read_lock); | ||
26 | |||
27 | /* lock for procfs write access */ | ||
28 | static DEFINE_MUTEX(write_lock); | ||
29 | |||
30 | /* | ||
31 | * define DYNAMIC in this example for a dynamically allocated fifo. | ||
32 | * | ||
33 | * Otherwise the fifo storage will be a part of the fifo structure. | ||
34 | */ | ||
35 | #if 0 | ||
36 | #define DYNAMIC | ||
37 | #endif | ||
38 | |||
39 | #ifdef DYNAMIC | ||
40 | static DECLARE_KFIFO_PTR(test, int); | ||
41 | #else | ||
42 | static DEFINE_KFIFO(test, int, FIFO_SIZE); | ||
43 | #endif | ||
44 | |||
45 | static const int expected_result[FIFO_SIZE] = { | ||
46 | 3, 4, 5, 6, 7, 8, 9, 0, | ||
47 | 1, 20, 21, 22, 23, 24, 25, 26, | ||
48 | 27, 28, 29, 30, 31, 32, 33, 34, | ||
49 | 35, 36, 37, 38, 39, 40, 41, 42, | ||
50 | }; | ||
51 | |||
52 | static int __init testfunc(void) | ||
53 | { | ||
54 | int buf[6]; | ||
55 | int i, j; | ||
56 | unsigned int ret; | ||
57 | |||
58 | printk(KERN_INFO "int fifo test start\n"); | ||
59 | |||
60 | /* put values into the fifo */ | ||
61 | for (i = 0; i != 10; i++) | ||
62 | kfifo_put(&test, i); | ||
63 | |||
64 | /* show the number of used elements */ | ||
65 | printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test)); | ||
66 | |||
67 | /* get max of 2 elements from the fifo */ | ||
68 | ret = kfifo_out(&test, buf, 2); | ||
69 | printk(KERN_INFO "ret: %d\n", ret); | ||
70 | /* and put it back to the end of the fifo */ | ||
71 | ret = kfifo_in(&test, buf, ret); | ||
72 | printk(KERN_INFO "ret: %d\n", ret); | ||
73 | |||
74 | /* skip first element of the fifo */ | ||
75 | printk(KERN_INFO "skip 1st element\n"); | ||
76 | kfifo_skip(&test); | ||
77 | |||
78 | /* put values into the fifo until is full */ | ||
79 | for (i = 20; kfifo_put(&test, i); i++) | ||
80 | ; | ||
81 | |||
82 | printk(KERN_INFO "queue len: %u\n", kfifo_len(&test)); | ||
83 | |||
84 | /* show the first value without removing from the fifo */ | ||
85 | if (kfifo_peek(&test, &i)) | ||
86 | printk(KERN_INFO "%d\n", i); | ||
87 | |||
88 | /* check the correctness of all values in the fifo */ | ||
89 | j = 0; | ||
90 | while (kfifo_get(&test, &i)) { | ||
91 | printk(KERN_INFO "item = %d\n", i); | ||
92 | if (i != expected_result[j++]) { | ||
93 | printk(KERN_WARNING "value mismatch: test failed\n"); | ||
94 | return -EIO; | ||
95 | } | ||
96 | } | ||
97 | if (j != ARRAY_SIZE(expected_result)) { | ||
98 | printk(KERN_WARNING "size mismatch: test failed\n"); | ||
99 | return -EIO; | ||
100 | } | ||
101 | printk(KERN_INFO "test passed\n"); | ||
102 | |||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | static ssize_t fifo_write(struct file *file, const char __user *buf, | ||
107 | size_t count, loff_t *ppos) | ||
108 | { | ||
109 | int ret; | ||
110 | unsigned int copied; | ||
111 | |||
112 | if (mutex_lock_interruptible(&write_lock)) | ||
113 | return -ERESTARTSYS; | ||
114 | |||
115 | ret = kfifo_from_user(&test, buf, count, &copied); | ||
116 | |||
117 | mutex_unlock(&write_lock); | ||
118 | if (ret) | ||
119 | return ret; | ||
120 | |||
121 | return copied; | ||
122 | } | ||
123 | |||
124 | static ssize_t fifo_read(struct file *file, char __user *buf, | ||
125 | size_t count, loff_t *ppos) | ||
126 | { | ||
127 | int ret; | ||
128 | unsigned int copied; | ||
129 | |||
130 | if (mutex_lock_interruptible(&read_lock)) | ||
131 | return -ERESTARTSYS; | ||
132 | |||
133 | ret = kfifo_to_user(&test, buf, count, &copied); | ||
134 | |||
135 | mutex_unlock(&read_lock); | ||
136 | if (ret) | ||
137 | return ret; | ||
138 | |||
139 | return copied; | ||
140 | } | ||
141 | |||
142 | static const struct proc_ops fifo_proc_ops = { | ||
143 | .proc_read = fifo_read, | ||
144 | .proc_write = fifo_write, | ||
145 | .proc_lseek = noop_llseek, | ||
146 | }; | ||
147 | |||
148 | static int __init example_init(void) | ||
149 | { | ||
150 | #ifdef DYNAMIC | ||
151 | int ret; | ||
152 | |||
153 | ret = kfifo_alloc(&test, FIFO_SIZE, GFP_KERNEL); | ||
154 | if (ret) { | ||
155 | printk(KERN_ERR "error kfifo_alloc\n"); | ||
156 | return ret; | ||
157 | } | ||
158 | #endif | ||
159 | if (testfunc() < 0) { | ||
160 | #ifdef DYNAMIC | ||
161 | kfifo_free(&test); | ||
162 | #endif | ||
163 | return -EIO; | ||
164 | } | ||
165 | |||
166 | if (proc_create(PROC_FIFO, 0, NULL, &fifo_proc_ops) == NULL) { | ||
167 | #ifdef DYNAMIC | ||
168 | kfifo_free(&test); | ||
169 | #endif | ||
170 | return -ENOMEM; | ||
171 | } | ||
172 | return 0; | ||
173 | } | ||
174 | |||
175 | static void __exit example_exit(void) | ||
176 | { | ||
177 | remove_proc_entry(PROC_FIFO, NULL); | ||
178 | #ifdef DYNAMIC | ||
179 | kfifo_free(&test); | ||
180 | #endif | ||
181 | } | ||
182 | |||
183 | module_init(example_init); | ||
184 | module_exit(example_exit); | ||
185 | MODULE_LICENSE("GPL"); | ||
186 | MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>"); | ||
diff --git a/samples/kfifo/record-example.c b/samples/kfifo/record-example.c new file mode 100644 index 000000000..f64f3d62d --- /dev/null +++ b/samples/kfifo/record-example.c | |||
@@ -0,0 +1,202 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Sample dynamic sized record fifo implementation | ||
4 | * | ||
5 | * Copyright (C) 2010 Stefani Seibold <stefani@seibold.net> | ||
6 | */ | ||
7 | |||
8 | #include <linux/init.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/proc_fs.h> | ||
11 | #include <linux/mutex.h> | ||
12 | #include <linux/kfifo.h> | ||
13 | |||
14 | /* | ||
15 | * This module shows how to create a variable sized record fifo. | ||
16 | */ | ||
17 | |||
18 | /* fifo size in elements (bytes) */ | ||
19 | #define FIFO_SIZE 128 | ||
20 | |||
21 | /* name of the proc entry */ | ||
22 | #define PROC_FIFO "record-fifo" | ||
23 | |||
24 | /* lock for procfs read access */ | ||
25 | static DEFINE_MUTEX(read_lock); | ||
26 | |||
27 | /* lock for procfs write access */ | ||
28 | static DEFINE_MUTEX(write_lock); | ||
29 | |||
30 | /* | ||
31 | * define DYNAMIC in this example for a dynamically allocated fifo. | ||
32 | * | ||
33 | * Otherwise the fifo storage will be a part of the fifo structure. | ||
34 | */ | ||
35 | #if 0 | ||
36 | #define DYNAMIC | ||
37 | #endif | ||
38 | |||
39 | /* | ||
40 | * struct kfifo_rec_ptr_1 and STRUCT_KFIFO_REC_1 can handle records of a | ||
41 | * length between 0 and 255 bytes. | ||
42 | * | ||
43 | * struct kfifo_rec_ptr_2 and STRUCT_KFIFO_REC_2 can handle records of a | ||
44 | * length between 0 and 65535 bytes. | ||
45 | */ | ||
46 | |||
47 | #ifdef DYNAMIC | ||
48 | struct kfifo_rec_ptr_1 test; | ||
49 | |||
50 | #else | ||
51 | typedef STRUCT_KFIFO_REC_1(FIFO_SIZE) mytest; | ||
52 | |||
53 | static mytest test; | ||
54 | #endif | ||
55 | |||
56 | static const char *expected_result[] = { | ||
57 | "a", | ||
58 | "bb", | ||
59 | "ccc", | ||
60 | "dddd", | ||
61 | "eeeee", | ||
62 | "ffffff", | ||
63 | "ggggggg", | ||
64 | "hhhhhhhh", | ||
65 | "iiiiiiiii", | ||
66 | "jjjjjjjjjj", | ||
67 | }; | ||
68 | |||
69 | static int __init testfunc(void) | ||
70 | { | ||
71 | char buf[100]; | ||
72 | unsigned int i; | ||
73 | unsigned int ret; | ||
74 | struct { unsigned char buf[6]; } hello = { "hello" }; | ||
75 | |||
76 | printk(KERN_INFO "record fifo test start\n"); | ||
77 | |||
78 | kfifo_in(&test, &hello, sizeof(hello)); | ||
79 | |||
80 | /* show the size of the next record in the fifo */ | ||
81 | printk(KERN_INFO "fifo peek len: %u\n", kfifo_peek_len(&test)); | ||
82 | |||
83 | /* put in variable length data */ | ||
84 | for (i = 0; i < 10; i++) { | ||
85 | memset(buf, 'a' + i, i + 1); | ||
86 | kfifo_in(&test, buf, i + 1); | ||
87 | } | ||
88 | |||
89 | /* skip first element of the fifo */ | ||
90 | printk(KERN_INFO "skip 1st element\n"); | ||
91 | kfifo_skip(&test); | ||
92 | |||
93 | printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test)); | ||
94 | |||
95 | /* show the first record without removing from the fifo */ | ||
96 | ret = kfifo_out_peek(&test, buf, sizeof(buf)); | ||
97 | if (ret) | ||
98 | printk(KERN_INFO "%.*s\n", ret, buf); | ||
99 | |||
100 | /* check the correctness of all values in the fifo */ | ||
101 | i = 0; | ||
102 | while (!kfifo_is_empty(&test)) { | ||
103 | ret = kfifo_out(&test, buf, sizeof(buf)); | ||
104 | buf[ret] = '\0'; | ||
105 | printk(KERN_INFO "item = %.*s\n", ret, buf); | ||
106 | if (strcmp(buf, expected_result[i++])) { | ||
107 | printk(KERN_WARNING "value mismatch: test failed\n"); | ||
108 | return -EIO; | ||
109 | } | ||
110 | } | ||
111 | if (i != ARRAY_SIZE(expected_result)) { | ||
112 | printk(KERN_WARNING "size mismatch: test failed\n"); | ||
113 | return -EIO; | ||
114 | } | ||
115 | printk(KERN_INFO "test passed\n"); | ||
116 | |||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | static ssize_t fifo_write(struct file *file, const char __user *buf, | ||
121 | size_t count, loff_t *ppos) | ||
122 | { | ||
123 | int ret; | ||
124 | unsigned int copied; | ||
125 | |||
126 | if (mutex_lock_interruptible(&write_lock)) | ||
127 | return -ERESTARTSYS; | ||
128 | |||
129 | ret = kfifo_from_user(&test, buf, count, &copied); | ||
130 | |||
131 | mutex_unlock(&write_lock); | ||
132 | if (ret) | ||
133 | return ret; | ||
134 | |||
135 | return copied; | ||
136 | } | ||
137 | |||
138 | static ssize_t fifo_read(struct file *file, char __user *buf, | ||
139 | size_t count, loff_t *ppos) | ||
140 | { | ||
141 | int ret; | ||
142 | unsigned int copied; | ||
143 | |||
144 | if (mutex_lock_interruptible(&read_lock)) | ||
145 | return -ERESTARTSYS; | ||
146 | |||
147 | ret = kfifo_to_user(&test, buf, count, &copied); | ||
148 | |||
149 | mutex_unlock(&read_lock); | ||
150 | if (ret) | ||
151 | return ret; | ||
152 | |||
153 | return copied; | ||
154 | } | ||
155 | |||
156 | static const struct proc_ops fifo_proc_ops = { | ||
157 | .proc_read = fifo_read, | ||
158 | .proc_write = fifo_write, | ||
159 | .proc_lseek = noop_llseek, | ||
160 | }; | ||
161 | |||
162 | static int __init example_init(void) | ||
163 | { | ||
164 | #ifdef DYNAMIC | ||
165 | int ret; | ||
166 | |||
167 | ret = kfifo_alloc(&test, FIFO_SIZE, GFP_KERNEL); | ||
168 | if (ret) { | ||
169 | printk(KERN_ERR "error kfifo_alloc\n"); | ||
170 | return ret; | ||
171 | } | ||
172 | #else | ||
173 | INIT_KFIFO(test); | ||
174 | #endif | ||
175 | if (testfunc() < 0) { | ||
176 | #ifdef DYNAMIC | ||
177 | kfifo_free(&test); | ||
178 | #endif | ||
179 | return -EIO; | ||
180 | } | ||
181 | |||
182 | if (proc_create(PROC_FIFO, 0, NULL, &fifo_proc_ops) == NULL) { | ||
183 | #ifdef DYNAMIC | ||
184 | kfifo_free(&test); | ||
185 | #endif | ||
186 | return -ENOMEM; | ||
187 | } | ||
188 | return 0; | ||
189 | } | ||
190 | |||
191 | static void __exit example_exit(void) | ||
192 | { | ||
193 | remove_proc_entry(PROC_FIFO, NULL); | ||
194 | #ifdef DYNAMIC | ||
195 | kfifo_free(&test); | ||
196 | #endif | ||
197 | } | ||
198 | |||
199 | module_init(example_init); | ||
200 | module_exit(example_exit); | ||
201 | MODULE_LICENSE("GPL"); | ||
202 | MODULE_AUTHOR("Stefani Seibold <stefani@seibold.net>"); | ||
diff --git a/samples/kmemleak/Makefile b/samples/kmemleak/Makefile new file mode 100644 index 000000000..16b6132c5 --- /dev/null +++ b/samples/kmemleak/Makefile | |||
@@ -0,0 +1,3 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | |||
3 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | ||
diff --git a/samples/kmemleak/kmemleak-test.c b/samples/kmemleak/kmemleak-test.c new file mode 100644 index 000000000..7b476eb82 --- /dev/null +++ b/samples/kmemleak/kmemleak-test.c | |||
@@ -0,0 +1,99 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * samples/kmemleak/kmemleak-test.c | ||
4 | * | ||
5 | * Copyright (C) 2008 ARM Limited | ||
6 | * Written by Catalin Marinas <catalin.marinas@arm.com> | ||
7 | */ | ||
8 | |||
9 | #define pr_fmt(fmt) "kmemleak: " fmt | ||
10 | |||
11 | #include <linux/init.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/slab.h> | ||
15 | #include <linux/vmalloc.h> | ||
16 | #include <linux/list.h> | ||
17 | #include <linux/percpu.h> | ||
18 | #include <linux/fdtable.h> | ||
19 | |||
20 | #include <linux/kmemleak.h> | ||
21 | |||
22 | struct test_node { | ||
23 | long header[25]; | ||
24 | struct list_head list; | ||
25 | long footer[25]; | ||
26 | }; | ||
27 | |||
28 | static LIST_HEAD(test_list); | ||
29 | static DEFINE_PER_CPU(void *, kmemleak_test_pointer); | ||
30 | |||
31 | /* | ||
32 | * Some very simple testing. This function needs to be extended for | ||
33 | * proper testing. | ||
34 | */ | ||
35 | static int __init kmemleak_test_init(void) | ||
36 | { | ||
37 | struct test_node *elem; | ||
38 | int i; | ||
39 | |||
40 | pr_info("Kmemleak testing\n"); | ||
41 | |||
42 | /* make some orphan objects */ | ||
43 | pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); | ||
44 | pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); | ||
45 | pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); | ||
46 | pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); | ||
47 | pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); | ||
48 | pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); | ||
49 | pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); | ||
50 | pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); | ||
51 | #ifndef CONFIG_MODULES | ||
52 | pr_info("kmem_cache_alloc(files_cachep) = %p\n", | ||
53 | kmem_cache_alloc(files_cachep, GFP_KERNEL)); | ||
54 | pr_info("kmem_cache_alloc(files_cachep) = %p\n", | ||
55 | kmem_cache_alloc(files_cachep, GFP_KERNEL)); | ||
56 | #endif | ||
57 | pr_info("vmalloc(64) = %p\n", vmalloc(64)); | ||
58 | pr_info("vmalloc(64) = %p\n", vmalloc(64)); | ||
59 | pr_info("vmalloc(64) = %p\n", vmalloc(64)); | ||
60 | pr_info("vmalloc(64) = %p\n", vmalloc(64)); | ||
61 | pr_info("vmalloc(64) = %p\n", vmalloc(64)); | ||
62 | |||
63 | /* | ||
64 | * Add elements to a list. They should only appear as orphans | ||
65 | * after the module is removed. | ||
66 | */ | ||
67 | for (i = 0; i < 10; i++) { | ||
68 | elem = kzalloc(sizeof(*elem), GFP_KERNEL); | ||
69 | pr_info("kzalloc(sizeof(*elem)) = %p\n", elem); | ||
70 | if (!elem) | ||
71 | return -ENOMEM; | ||
72 | INIT_LIST_HEAD(&elem->list); | ||
73 | list_add_tail(&elem->list, &test_list); | ||
74 | } | ||
75 | |||
76 | for_each_possible_cpu(i) { | ||
77 | per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); | ||
78 | pr_info("kmalloc(129) = %p\n", | ||
79 | per_cpu(kmemleak_test_pointer, i)); | ||
80 | } | ||
81 | |||
82 | return 0; | ||
83 | } | ||
84 | module_init(kmemleak_test_init); | ||
85 | |||
86 | static void __exit kmemleak_test_exit(void) | ||
87 | { | ||
88 | struct test_node *elem, *tmp; | ||
89 | |||
90 | /* | ||
91 | * Remove the list elements without actually freeing the | ||
92 | * memory. | ||
93 | */ | ||
94 | list_for_each_entry_safe(elem, tmp, &test_list, list) | ||
95 | list_del(&elem->list); | ||
96 | } | ||
97 | module_exit(kmemleak_test_exit); | ||
98 | |||
99 | MODULE_LICENSE("GPL"); | ||
diff --git a/samples/kobject/Makefile b/samples/kobject/Makefile new file mode 100644 index 000000000..bb5d21997 --- /dev/null +++ b/samples/kobject/Makefile | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | obj-$(CONFIG_SAMPLE_KOBJECT) += kobject-example.o kset-example.o | ||
diff --git a/samples/kobject/kobject-example.c b/samples/kobject/kobject-example.c new file mode 100644 index 000000000..9e383fdba --- /dev/null +++ b/samples/kobject/kobject-example.c | |||
@@ -0,0 +1,144 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Sample kobject implementation | ||
4 | * | ||
5 | * Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com> | ||
6 | * Copyright (C) 2007 Novell Inc. | ||
7 | */ | ||
8 | #include <linux/kobject.h> | ||
9 | #include <linux/string.h> | ||
10 | #include <linux/sysfs.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/init.h> | ||
13 | |||
14 | /* | ||
15 | * This module shows how to create a simple subdirectory in sysfs called | ||
16 | * /sys/kernel/kobject_example. In that directory, 3 files are created: | ||
17 | * "foo", "baz", and "bar". An integer written to any of these files can | ||
18 | * later be read back from it. | ||
19 | */ | ||
20 | |||
21 | static int foo; | ||
22 | static int baz; | ||
23 | static int bar; | ||
24 | |||
25 | /* | ||
26 | * The "foo" file where a static variable is read from and written to. | ||
27 | */ | ||
28 | static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr, | ||
29 | char *buf) | ||
30 | { | ||
31 | return sprintf(buf, "%d\n", foo); | ||
32 | } | ||
33 | |||
34 | static ssize_t foo_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
35 | const char *buf, size_t count) | ||
36 | { | ||
37 | int ret; | ||
38 | |||
39 | ret = kstrtoint(buf, 10, &foo); | ||
40 | if (ret < 0) | ||
41 | return ret; | ||
42 | |||
43 | return count; | ||
44 | } | ||
45 | |||
46 | /* Sysfs attributes cannot be world-writable. */ | ||
47 | static struct kobj_attribute foo_attribute = | ||
48 | __ATTR(foo, 0664, foo_show, foo_store); | ||
49 | |||
50 | /* | ||
51 | * More complex function where we determine which variable is being accessed by | ||
52 | * looking at the attribute for the "baz" and "bar" files. | ||
53 | */ | ||
54 | static ssize_t b_show(struct kobject *kobj, struct kobj_attribute *attr, | ||
55 | char *buf) | ||
56 | { | ||
57 | int var; | ||
58 | |||
59 | if (strcmp(attr->attr.name, "baz") == 0) | ||
60 | var = baz; | ||
61 | else | ||
62 | var = bar; | ||
63 | return sprintf(buf, "%d\n", var); | ||
64 | } | ||
65 | |||
66 | static ssize_t b_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
67 | const char *buf, size_t count) | ||
68 | { | ||
69 | int var, ret; | ||
70 | |||
71 | ret = kstrtoint(buf, 10, &var); | ||
72 | if (ret < 0) | ||
73 | return ret; | ||
74 | |||
75 | if (strcmp(attr->attr.name, "baz") == 0) | ||
76 | baz = var; | ||
77 | else | ||
78 | bar = var; | ||
79 | return count; | ||
80 | } | ||
81 | |||
82 | static struct kobj_attribute baz_attribute = | ||
83 | __ATTR(baz, 0664, b_show, b_store); | ||
84 | static struct kobj_attribute bar_attribute = | ||
85 | __ATTR(bar, 0664, b_show, b_store); | ||
86 | |||
87 | |||
88 | /* | ||
89 | * Create a group of attributes so that we can create and destroy them all | ||
90 | * at once. | ||
91 | */ | ||
92 | static struct attribute *attrs[] = { | ||
93 | &foo_attribute.attr, | ||
94 | &baz_attribute.attr, | ||
95 | &bar_attribute.attr, | ||
96 | NULL, /* need to NULL terminate the list of attributes */ | ||
97 | }; | ||
98 | |||
99 | /* | ||
100 | * An unnamed attribute group will put all of the attributes directly in | ||
101 | * the kobject directory. If we specify a name, a subdirectory will be | ||
102 | * created for the attributes with the directory being the name of the | ||
103 | * attribute group. | ||
104 | */ | ||
105 | static struct attribute_group attr_group = { | ||
106 | .attrs = attrs, | ||
107 | }; | ||
108 | |||
109 | static struct kobject *example_kobj; | ||
110 | |||
111 | static int __init example_init(void) | ||
112 | { | ||
113 | int retval; | ||
114 | |||
115 | /* | ||
116 | * Create a simple kobject with the name of "kobject_example", | ||
117 | * located under /sys/kernel/ | ||
118 | * | ||
119 | * As this is a simple directory, no uevent will be sent to | ||
120 | * userspace. That is why this function should not be used for | ||
121 | * any type of dynamic kobjects, where the name and number are | ||
122 | * not known ahead of time. | ||
123 | */ | ||
124 | example_kobj = kobject_create_and_add("kobject_example", kernel_kobj); | ||
125 | if (!example_kobj) | ||
126 | return -ENOMEM; | ||
127 | |||
128 | /* Create the files associated with this kobject */ | ||
129 | retval = sysfs_create_group(example_kobj, &attr_group); | ||
130 | if (retval) | ||
131 | kobject_put(example_kobj); | ||
132 | |||
133 | return retval; | ||
134 | } | ||
135 | |||
136 | static void __exit example_exit(void) | ||
137 | { | ||
138 | kobject_put(example_kobj); | ||
139 | } | ||
140 | |||
141 | module_init(example_init); | ||
142 | module_exit(example_exit); | ||
143 | MODULE_LICENSE("GPL v2"); | ||
144 | MODULE_AUTHOR("Greg Kroah-Hartman <greg@kroah.com>"); | ||
diff --git a/samples/kobject/kset-example.c b/samples/kobject/kset-example.c new file mode 100644 index 000000000..c8010f126 --- /dev/null +++ b/samples/kobject/kset-example.c | |||
@@ -0,0 +1,288 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Sample kset and ktype implementation | ||
4 | * | ||
5 | * Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com> | ||
6 | * Copyright (C) 2007 Novell Inc. | ||
7 | */ | ||
8 | #include <linux/kobject.h> | ||
9 | #include <linux/string.h> | ||
10 | #include <linux/sysfs.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/init.h> | ||
14 | |||
15 | /* | ||
16 | * This module shows how to create a kset in sysfs called | ||
17 | * /sys/kernel/kset_example. | ||
18 | * Then three kobjects are created and assigned to this kset: "foo", "baz", | ||
19 | * and "bar". In those kobjects, attributes of the same name are also | ||
20 | * created, and an integer written to any of these files can later be | ||
21 | * read back from it. | ||
22 | */ | ||
23 | |||
24 | |||
25 | /* | ||
26 | * This is our "object" that we will create a few of and register them with | ||
27 | * sysfs. | ||
28 | */ | ||
29 | struct foo_obj { | ||
30 | struct kobject kobj; | ||
31 | int foo; | ||
32 | int baz; | ||
33 | int bar; | ||
34 | }; | ||
35 | #define to_foo_obj(x) container_of(x, struct foo_obj, kobj) | ||
36 | |||
37 | /* a custom attribute that works just for a struct foo_obj. */ | ||
38 | struct foo_attribute { | ||
39 | struct attribute attr; | ||
40 | ssize_t (*show)(struct foo_obj *foo, struct foo_attribute *attr, char *buf); | ||
41 | ssize_t (*store)(struct foo_obj *foo, struct foo_attribute *attr, const char *buf, size_t count); | ||
42 | }; | ||
43 | #define to_foo_attr(x) container_of(x, struct foo_attribute, attr) | ||
44 | |||
45 | /* | ||
46 | * The default show function that must be passed to sysfs. This will be | ||
47 | * called by sysfs whenever a show function is called by the user on a | ||
48 | * sysfs file associated with the kobjects we have registered. We need to | ||
49 | * transpose back from a "default" kobject to our custom struct foo_obj and | ||
50 | * then call the show function for that specific object. | ||
51 | */ | ||
52 | static ssize_t foo_attr_show(struct kobject *kobj, | ||
53 | struct attribute *attr, | ||
54 | char *buf) | ||
55 | { | ||
56 | struct foo_attribute *attribute; | ||
57 | struct foo_obj *foo; | ||
58 | |||
59 | attribute = to_foo_attr(attr); | ||
60 | foo = to_foo_obj(kobj); | ||
61 | |||
62 | if (!attribute->show) | ||
63 | return -EIO; | ||
64 | |||
65 | return attribute->show(foo, attribute, buf); | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * Just like the default show function above, but this one is for when the | ||
70 | * sysfs "store" is requested (when a value is written to a file.) | ||
71 | */ | ||
72 | static ssize_t foo_attr_store(struct kobject *kobj, | ||
73 | struct attribute *attr, | ||
74 | const char *buf, size_t len) | ||
75 | { | ||
76 | struct foo_attribute *attribute; | ||
77 | struct foo_obj *foo; | ||
78 | |||
79 | attribute = to_foo_attr(attr); | ||
80 | foo = to_foo_obj(kobj); | ||
81 | |||
82 | if (!attribute->store) | ||
83 | return -EIO; | ||
84 | |||
85 | return attribute->store(foo, attribute, buf, len); | ||
86 | } | ||
87 | |||
88 | /* Our custom sysfs_ops that we will associate with our ktype later on */ | ||
89 | static const struct sysfs_ops foo_sysfs_ops = { | ||
90 | .show = foo_attr_show, | ||
91 | .store = foo_attr_store, | ||
92 | }; | ||
93 | |||
94 | /* | ||
95 | * The release function for our object. The kernel REQUIRES every kobject | ||
96 | * to provide one. We free the memory held in our object here. | ||
97 | * | ||
98 | * NEVER try to get away with just a "blank" release function to try to be | ||
99 | * smarter than the kernel. Turns out, no one ever is... | ||
100 | */ | ||
101 | static void foo_release(struct kobject *kobj) | ||
102 | { | ||
103 | struct foo_obj *foo; | ||
104 | |||
105 | foo = to_foo_obj(kobj); | ||
106 | kfree(foo); | ||
107 | } | ||
108 | |||
109 | /* | ||
110 | * The "foo" file where the .foo variable is read from and written to. | ||
111 | */ | ||
112 | static ssize_t foo_show(struct foo_obj *foo_obj, struct foo_attribute *attr, | ||
113 | char *buf) | ||
114 | { | ||
115 | return sprintf(buf, "%d\n", foo_obj->foo); | ||
116 | } | ||
117 | |||
118 | static ssize_t foo_store(struct foo_obj *foo_obj, struct foo_attribute *attr, | ||
119 | const char *buf, size_t count) | ||
120 | { | ||
121 | int ret; | ||
122 | |||
123 | ret = kstrtoint(buf, 10, &foo_obj->foo); | ||
124 | if (ret < 0) | ||
125 | return ret; | ||
126 | |||
127 | return count; | ||
128 | } | ||
129 | |||
130 | /* Sysfs attributes cannot be world-writable. */ | ||
131 | static struct foo_attribute foo_attribute = | ||
132 | __ATTR(foo, 0664, foo_show, foo_store); | ||
133 | |||
134 | /* | ||
135 | * More complex function where we determine which variable is being accessed by | ||
136 | * looking at the attribute for the "baz" and "bar" files. | ||
137 | */ | ||
138 | static ssize_t b_show(struct foo_obj *foo_obj, struct foo_attribute *attr, | ||
139 | char *buf) | ||
140 | { | ||
141 | int var; | ||
142 | |||
143 | if (strcmp(attr->attr.name, "baz") == 0) | ||
144 | var = foo_obj->baz; | ||
145 | else | ||
146 | var = foo_obj->bar; | ||
147 | return sprintf(buf, "%d\n", var); | ||
148 | } | ||
149 | |||
150 | static ssize_t b_store(struct foo_obj *foo_obj, struct foo_attribute *attr, | ||
151 | const char *buf, size_t count) | ||
152 | { | ||
153 | int var, ret; | ||
154 | |||
155 | ret = kstrtoint(buf, 10, &var); | ||
156 | if (ret < 0) | ||
157 | return ret; | ||
158 | |||
159 | if (strcmp(attr->attr.name, "baz") == 0) | ||
160 | foo_obj->baz = var; | ||
161 | else | ||
162 | foo_obj->bar = var; | ||
163 | return count; | ||
164 | } | ||
165 | |||
166 | static struct foo_attribute baz_attribute = | ||
167 | __ATTR(baz, 0664, b_show, b_store); | ||
168 | static struct foo_attribute bar_attribute = | ||
169 | __ATTR(bar, 0664, b_show, b_store); | ||
170 | |||
171 | /* | ||
172 | * Create a group of attributes so that we can create and destroy them all | ||
173 | * at once. | ||
174 | */ | ||
175 | static struct attribute *foo_default_attrs[] = { | ||
176 | &foo_attribute.attr, | ||
177 | &baz_attribute.attr, | ||
178 | &bar_attribute.attr, | ||
179 | NULL, /* need to NULL terminate the list of attributes */ | ||
180 | }; | ||
181 | ATTRIBUTE_GROUPS(foo_default); | ||
182 | |||
183 | /* | ||
184 | * Our own ktype for our kobjects. Here we specify our sysfs ops, the | ||
185 | * release function, and the set of default attributes we want created | ||
186 | * whenever a kobject of this type is registered with the kernel. | ||
187 | */ | ||
188 | static struct kobj_type foo_ktype = { | ||
189 | .sysfs_ops = &foo_sysfs_ops, | ||
190 | .release = foo_release, | ||
191 | .default_groups = foo_default_groups, | ||
192 | }; | ||
193 | |||
194 | static struct kset *example_kset; | ||
195 | static struct foo_obj *foo_obj; | ||
196 | static struct foo_obj *bar_obj; | ||
197 | static struct foo_obj *baz_obj; | ||
198 | |||
199 | static struct foo_obj *create_foo_obj(const char *name) | ||
200 | { | ||
201 | struct foo_obj *foo; | ||
202 | int retval; | ||
203 | |||
204 | /* allocate the memory for the whole object */ | ||
205 | foo = kzalloc(sizeof(*foo), GFP_KERNEL); | ||
206 | if (!foo) | ||
207 | return NULL; | ||
208 | |||
209 | /* | ||
210 | * As we have a kset for this kobject, we need to set it before calling | ||
211 | * the kobject core. | ||
212 | */ | ||
213 | foo->kobj.kset = example_kset; | ||
214 | |||
215 | /* | ||
216 | * Initialize and add the kobject to the kernel. All the default files | ||
217 | * will be created here. As we have already specified a kset for this | ||
218 | * kobject, we don't have to set a parent for the kobject, the kobject | ||
219 | * will be placed beneath that kset automatically. | ||
220 | */ | ||
221 | retval = kobject_init_and_add(&foo->kobj, &foo_ktype, NULL, "%s", name); | ||
222 | if (retval) { | ||
223 | kobject_put(&foo->kobj); | ||
224 | return NULL; | ||
225 | } | ||
226 | |||
227 | /* | ||
228 | * We are always responsible for sending the uevent that the kobject | ||
229 | * was added to the system. | ||
230 | */ | ||
231 | kobject_uevent(&foo->kobj, KOBJ_ADD); | ||
232 | |||
233 | return foo; | ||
234 | } | ||
235 | |||
236 | static void destroy_foo_obj(struct foo_obj *foo) | ||
237 | { | ||
238 | kobject_put(&foo->kobj); | ||
239 | } | ||
240 | |||
241 | static int __init example_init(void) | ||
242 | { | ||
243 | /* | ||
244 | * Create a kset with the name of "kset_example", | ||
245 | * located under /sys/kernel/ | ||
246 | */ | ||
247 | example_kset = kset_create_and_add("kset_example", NULL, kernel_kobj); | ||
248 | if (!example_kset) | ||
249 | return -ENOMEM; | ||
250 | |||
251 | /* | ||
252 | * Create three objects and register them with our kset | ||
253 | */ | ||
254 | foo_obj = create_foo_obj("foo"); | ||
255 | if (!foo_obj) | ||
256 | goto foo_error; | ||
257 | |||
258 | bar_obj = create_foo_obj("bar"); | ||
259 | if (!bar_obj) | ||
260 | goto bar_error; | ||
261 | |||
262 | baz_obj = create_foo_obj("baz"); | ||
263 | if (!baz_obj) | ||
264 | goto baz_error; | ||
265 | |||
266 | return 0; | ||
267 | |||
268 | baz_error: | ||
269 | destroy_foo_obj(bar_obj); | ||
270 | bar_error: | ||
271 | destroy_foo_obj(foo_obj); | ||
272 | foo_error: | ||
273 | kset_unregister(example_kset); | ||
274 | return -EINVAL; | ||
275 | } | ||
276 | |||
277 | static void __exit example_exit(void) | ||
278 | { | ||
279 | destroy_foo_obj(baz_obj); | ||
280 | destroy_foo_obj(bar_obj); | ||
281 | destroy_foo_obj(foo_obj); | ||
282 | kset_unregister(example_kset); | ||
283 | } | ||
284 | |||
285 | module_init(example_init); | ||
286 | module_exit(example_exit); | ||
287 | MODULE_LICENSE("GPL v2"); | ||
288 | MODULE_AUTHOR("Greg Kroah-Hartman <greg@kroah.com>"); | ||
diff --git a/samples/kprobes/Makefile b/samples/kprobes/Makefile new file mode 100644 index 000000000..e77459271 --- /dev/null +++ b/samples/kprobes/Makefile | |||
@@ -0,0 +1,6 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | # builds the kprobes example kernel modules; | ||
3 | # then to use one (as root): insmod <module_name.ko> | ||
4 | |||
5 | obj-$(CONFIG_SAMPLE_KPROBES) += kprobe_example.o | ||
6 | obj-$(CONFIG_SAMPLE_KRETPROBES) += kretprobe_example.o | ||
diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c new file mode 100644 index 000000000..365905cb2 --- /dev/null +++ b/samples/kprobes/kprobe_example.c | |||
@@ -0,0 +1,120 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * NOTE: This example works on x86, powerpc, mips, arm64 and s390. | ||
4 | * Here's a sample kernel module showing the use of kprobes to dump a | ||
5 | * stack trace and selected registers when kernel_clone() is called. | ||
6 | * | ||
7 | * For more information on theory of operation of kprobes, see | ||
8 | * Documentation/trace/kprobes.rst | ||
9 | * | ||
10 | * You will see the trace data in /var/log/messages and on the console | ||
11 | * whenever kernel_clone() is invoked to create a new process. | ||
12 | */ | ||
13 | |||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/kprobes.h> | ||
17 | |||
18 | #define MAX_SYMBOL_LEN 64 | ||
19 | static char symbol[MAX_SYMBOL_LEN] = "kernel_clone"; | ||
20 | module_param_string(symbol, symbol, sizeof(symbol), 0644); | ||
21 | |||
22 | /* For each probe you need to allocate a kprobe structure */ | ||
23 | static struct kprobe kp = { | ||
24 | .symbol_name = symbol, | ||
25 | }; | ||
26 | |||
27 | /* kprobe pre_handler: called just before the probed instruction is executed */ | ||
28 | static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs) | ||
29 | { | ||
30 | #ifdef CONFIG_X86 | ||
31 | pr_info("<%s> pre_handler: p->addr = 0x%p, ip = %lx, flags = 0x%lx\n", | ||
32 | p->symbol_name, p->addr, regs->ip, regs->flags); | ||
33 | #endif | ||
34 | #ifdef CONFIG_PPC | ||
35 | pr_info("<%s> pre_handler: p->addr = 0x%p, nip = 0x%lx, msr = 0x%lx\n", | ||
36 | p->symbol_name, p->addr, regs->nip, regs->msr); | ||
37 | #endif | ||
38 | #ifdef CONFIG_MIPS | ||
39 | pr_info("<%s> pre_handler: p->addr = 0x%p, epc = 0x%lx, status = 0x%lx\n", | ||
40 | p->symbol_name, p->addr, regs->cp0_epc, regs->cp0_status); | ||
41 | #endif | ||
42 | #ifdef CONFIG_ARM64 | ||
43 | pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx," | ||
44 | " pstate = 0x%lx\n", | ||
45 | p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate); | ||
46 | #endif | ||
47 | #ifdef CONFIG_S390 | ||
48 | pr_info("<%s> pre_handler: p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n", | ||
49 | p->symbol_name, p->addr, regs->psw.addr, regs->flags); | ||
50 | #endif | ||
51 | |||
52 | /* A dump_stack() here will give a stack backtrace */ | ||
53 | return 0; | ||
54 | } | ||
55 | |||
56 | /* kprobe post_handler: called after the probed instruction is executed */ | ||
57 | static void __kprobes handler_post(struct kprobe *p, struct pt_regs *regs, | ||
58 | unsigned long flags) | ||
59 | { | ||
60 | #ifdef CONFIG_X86 | ||
61 | pr_info("<%s> post_handler: p->addr = 0x%p, flags = 0x%lx\n", | ||
62 | p->symbol_name, p->addr, regs->flags); | ||
63 | #endif | ||
64 | #ifdef CONFIG_PPC | ||
65 | pr_info("<%s> post_handler: p->addr = 0x%p, msr = 0x%lx\n", | ||
66 | p->symbol_name, p->addr, regs->msr); | ||
67 | #endif | ||
68 | #ifdef CONFIG_MIPS | ||
69 | pr_info("<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n", | ||
70 | p->symbol_name, p->addr, regs->cp0_status); | ||
71 | #endif | ||
72 | #ifdef CONFIG_ARM64 | ||
73 | pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n", | ||
74 | p->symbol_name, p->addr, (long)regs->pstate); | ||
75 | #endif | ||
76 | #ifdef CONFIG_S390 | ||
77 | pr_info("<%s> pre_handler: p->addr, 0x%p, flags = 0x%lx\n", | ||
78 | p->symbol_name, p->addr, regs->flags); | ||
79 | #endif | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * fault_handler: this is called if an exception is generated for any | ||
84 | * instruction within the pre- or post-handler, or when Kprobes | ||
85 | * single-steps the probed instruction. | ||
86 | */ | ||
87 | static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr) | ||
88 | { | ||
89 | pr_info("fault_handler: p->addr = 0x%p, trap #%dn", p->addr, trapnr); | ||
90 | /* Return 0 because we don't handle the fault. */ | ||
91 | return 0; | ||
92 | } | ||
93 | /* NOKPROBE_SYMBOL() is also available */ | ||
94 | NOKPROBE_SYMBOL(handler_fault); | ||
95 | |||
96 | static int __init kprobe_init(void) | ||
97 | { | ||
98 | int ret; | ||
99 | kp.pre_handler = handler_pre; | ||
100 | kp.post_handler = handler_post; | ||
101 | kp.fault_handler = handler_fault; | ||
102 | |||
103 | ret = register_kprobe(&kp); | ||
104 | if (ret < 0) { | ||
105 | pr_err("register_kprobe failed, returned %d\n", ret); | ||
106 | return ret; | ||
107 | } | ||
108 | pr_info("Planted kprobe at %p\n", kp.addr); | ||
109 | return 0; | ||
110 | } | ||
111 | |||
112 | static void __exit kprobe_exit(void) | ||
113 | { | ||
114 | unregister_kprobe(&kp); | ||
115 | pr_info("kprobe at %p unregistered\n", kp.addr); | ||
116 | } | ||
117 | |||
118 | module_init(kprobe_init) | ||
119 | module_exit(kprobe_exit) | ||
120 | MODULE_LICENSE("GPL"); | ||
diff --git a/samples/kprobes/kretprobe_example.c b/samples/kprobes/kretprobe_example.c new file mode 100644 index 000000000..228321ecb --- /dev/null +++ b/samples/kprobes/kretprobe_example.c | |||
@@ -0,0 +1,108 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * kretprobe_example.c | ||
4 | * | ||
5 | * Here's a sample kernel module showing the use of return probes to | ||
6 | * report the return value and total time taken for probed function | ||
7 | * to run. | ||
8 | * | ||
9 | * usage: insmod kretprobe_example.ko func=<func_name> | ||
10 | * | ||
11 | * If no func_name is specified, kernel_clone is instrumented | ||
12 | * | ||
13 | * For more information on theory of operation of kretprobes, see | ||
14 | * Documentation/trace/kprobes.rst | ||
15 | * | ||
16 | * Build and insert the kernel module as done in the kprobe example. | ||
17 | * You will see the trace data in /var/log/messages and on the console | ||
18 | * whenever the probed function returns. (Some messages may be suppressed | ||
19 | * if syslogd is configured to eliminate duplicate messages.) | ||
20 | */ | ||
21 | |||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/kprobes.h> | ||
25 | #include <linux/ktime.h> | ||
26 | #include <linux/limits.h> | ||
27 | #include <linux/sched.h> | ||
28 | |||
29 | static char func_name[NAME_MAX] = "kernel_clone"; | ||
30 | module_param_string(func, func_name, NAME_MAX, S_IRUGO); | ||
31 | MODULE_PARM_DESC(func, "Function to kretprobe; this module will report the" | ||
32 | " function's execution time"); | ||
33 | |||
34 | /* per-instance private data */ | ||
35 | struct my_data { | ||
36 | ktime_t entry_stamp; | ||
37 | }; | ||
38 | |||
39 | /* Here we use the entry_handler to timestamp function entry */ | ||
40 | static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) | ||
41 | { | ||
42 | struct my_data *data; | ||
43 | |||
44 | if (!current->mm) | ||
45 | return 1; /* Skip kernel threads */ | ||
46 | |||
47 | data = (struct my_data *)ri->data; | ||
48 | data->entry_stamp = ktime_get(); | ||
49 | return 0; | ||
50 | } | ||
51 | NOKPROBE_SYMBOL(entry_handler); | ||
52 | |||
53 | /* | ||
54 | * Return-probe handler: Log the return value and duration. Duration may turn | ||
55 | * out to be zero consistently, depending upon the granularity of time | ||
56 | * accounting on the platform. | ||
57 | */ | ||
58 | static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) | ||
59 | { | ||
60 | unsigned long retval = regs_return_value(regs); | ||
61 | struct my_data *data = (struct my_data *)ri->data; | ||
62 | s64 delta; | ||
63 | ktime_t now; | ||
64 | |||
65 | now = ktime_get(); | ||
66 | delta = ktime_to_ns(ktime_sub(now, data->entry_stamp)); | ||
67 | pr_info("%s returned %lu and took %lld ns to execute\n", | ||
68 | func_name, retval, (long long)delta); | ||
69 | return 0; | ||
70 | } | ||
71 | NOKPROBE_SYMBOL(ret_handler); | ||
72 | |||
73 | static struct kretprobe my_kretprobe = { | ||
74 | .handler = ret_handler, | ||
75 | .entry_handler = entry_handler, | ||
76 | .data_size = sizeof(struct my_data), | ||
77 | /* Probe up to 20 instances concurrently. */ | ||
78 | .maxactive = 20, | ||
79 | }; | ||
80 | |||
81 | static int __init kretprobe_init(void) | ||
82 | { | ||
83 | int ret; | ||
84 | |||
85 | my_kretprobe.kp.symbol_name = func_name; | ||
86 | ret = register_kretprobe(&my_kretprobe); | ||
87 | if (ret < 0) { | ||
88 | pr_err("register_kretprobe failed, returned %d\n", ret); | ||
89 | return ret; | ||
90 | } | ||
91 | pr_info("Planted return probe at %s: %p\n", | ||
92 | my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr); | ||
93 | return 0; | ||
94 | } | ||
95 | |||
96 | static void __exit kretprobe_exit(void) | ||
97 | { | ||
98 | unregister_kretprobe(&my_kretprobe); | ||
99 | pr_info("kretprobe at %p unregistered\n", my_kretprobe.kp.addr); | ||
100 | |||
101 | /* nmissed > 0 suggests that maxactive was set too low. */ | ||
102 | pr_info("Missed probing %d instances of %s\n", | ||
103 | my_kretprobe.nmissed, my_kretprobe.kp.symbol_name); | ||
104 | } | ||
105 | |||
106 | module_init(kretprobe_init) | ||
107 | module_exit(kretprobe_exit) | ||
108 | MODULE_LICENSE("GPL"); | ||
diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile new file mode 100644 index 000000000..9f853eeb6 --- /dev/null +++ b/samples/livepatch/Makefile | |||
@@ -0,0 +1,8 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o | ||
3 | obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-mod.o | ||
4 | obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix1.o | ||
5 | obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix2.o | ||
6 | obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-demo.o | ||
7 | obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-mod.o | ||
8 | obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-busymod.o | ||
diff --git a/samples/livepatch/livepatch-callbacks-busymod.c b/samples/livepatch/livepatch-callbacks-busymod.c new file mode 100644 index 000000000..378e2d402 --- /dev/null +++ b/samples/livepatch/livepatch-callbacks-busymod.c | |||
@@ -0,0 +1,60 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
2 | /* | ||
3 | * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> | ||
4 | */ | ||
5 | |||
6 | /* | ||
7 | * livepatch-callbacks-busymod.c - (un)patching callbacks demo support module | ||
8 | * | ||
9 | * | ||
10 | * Purpose | ||
11 | * ------- | ||
12 | * | ||
13 | * Simple module to demonstrate livepatch (un)patching callbacks. | ||
14 | * | ||
15 | * | ||
16 | * Usage | ||
17 | * ----- | ||
18 | * | ||
19 | * This module is not intended to be standalone. See the "Usage" | ||
20 | * section of livepatch-callbacks-mod.c. | ||
21 | */ | ||
22 | |||
23 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
24 | |||
25 | #include <linux/module.h> | ||
26 | #include <linux/kernel.h> | ||
27 | #include <linux/workqueue.h> | ||
28 | #include <linux/delay.h> | ||
29 | |||
30 | static int sleep_secs; | ||
31 | module_param(sleep_secs, int, 0644); | ||
32 | MODULE_PARM_DESC(sleep_secs, "sleep_secs (default=0)"); | ||
33 | |||
34 | static void busymod_work_func(struct work_struct *work); | ||
35 | static DECLARE_DELAYED_WORK(work, busymod_work_func); | ||
36 | |||
37 | static void busymod_work_func(struct work_struct *work) | ||
38 | { | ||
39 | pr_info("%s, sleeping %d seconds ...\n", __func__, sleep_secs); | ||
40 | msleep(sleep_secs * 1000); | ||
41 | pr_info("%s exit\n", __func__); | ||
42 | } | ||
43 | |||
44 | static int livepatch_callbacks_mod_init(void) | ||
45 | { | ||
46 | pr_info("%s\n", __func__); | ||
47 | schedule_delayed_work(&work, | ||
48 | msecs_to_jiffies(1000 * 0)); | ||
49 | return 0; | ||
50 | } | ||
51 | |||
52 | static void livepatch_callbacks_mod_exit(void) | ||
53 | { | ||
54 | cancel_delayed_work_sync(&work); | ||
55 | pr_info("%s\n", __func__); | ||
56 | } | ||
57 | |||
58 | module_init(livepatch_callbacks_mod_init); | ||
59 | module_exit(livepatch_callbacks_mod_exit); | ||
60 | MODULE_LICENSE("GPL"); | ||
diff --git a/samples/livepatch/livepatch-callbacks-demo.c b/samples/livepatch/livepatch-callbacks-demo.c new file mode 100644 index 000000000..11c3f4357 --- /dev/null +++ b/samples/livepatch/livepatch-callbacks-demo.c | |||
@@ -0,0 +1,196 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
2 | /* | ||
3 | * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> | ||
4 | */ | ||
5 | |||
6 | /* | ||
7 | * livepatch-callbacks-demo.c - (un)patching callbacks livepatch demo | ||
8 | * | ||
9 | * | ||
10 | * Purpose | ||
11 | * ------- | ||
12 | * | ||
13 | * Demonstration of registering livepatch (un)patching callbacks. | ||
14 | * | ||
15 | * | ||
16 | * Usage | ||
17 | * ----- | ||
18 | * | ||
19 | * Step 1 - load the simple module | ||
20 | * | ||
21 | * insmod samples/livepatch/livepatch-callbacks-mod.ko | ||
22 | * | ||
23 | * | ||
24 | * Step 2 - load the demonstration livepatch (with callbacks) | ||
25 | * | ||
26 | * insmod samples/livepatch/livepatch-callbacks-demo.ko | ||
27 | * | ||
28 | * | ||
29 | * Step 3 - cleanup | ||
30 | * | ||
31 | * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled | ||
32 | * rmmod livepatch_callbacks_demo | ||
33 | * rmmod livepatch_callbacks_mod | ||
34 | * | ||
35 | * Watch dmesg output to see livepatch enablement, callback execution | ||
36 | * and patching operations for both vmlinux and module targets. | ||
37 | * | ||
38 | * NOTE: swap the insmod order of livepatch-callbacks-mod.ko and | ||
39 | * livepatch-callbacks-demo.ko to observe what happens when a | ||
40 | * target module is loaded after a livepatch with callbacks. | ||
41 | * | ||
42 | * NOTE: 'pre_patch_ret' is a module parameter that sets the pre-patch | ||
43 | * callback return status. Try setting up a non-zero status | ||
44 | * such as -19 (-ENODEV): | ||
45 | * | ||
46 | * # Load demo livepatch, vmlinux is patched | ||
47 | * insmod samples/livepatch/livepatch-callbacks-demo.ko | ||
48 | * | ||
49 | * # Setup next pre-patch callback to return -ENODEV | ||
50 | * echo -19 > /sys/module/livepatch_callbacks_demo/parameters/pre_patch_ret | ||
51 | * | ||
52 | * # Module loader refuses to load the target module | ||
53 | * insmod samples/livepatch/livepatch-callbacks-mod.ko | ||
54 | * insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-mod.ko: No such device | ||
55 | * | ||
56 | * NOTE: There is a second target module, | ||
57 | * livepatch-callbacks-busymod.ko, available for experimenting | ||
58 | * with livepatch (un)patch callbacks. This module contains | ||
59 | * a 'sleep_secs' parameter that parks the module on one of the | ||
60 | * functions that the livepatch demo module wants to patch. | ||
61 | * Modifying this value and tweaking the order of module loads can | ||
62 | * effectively demonstrate stalled patch transitions: | ||
63 | * | ||
64 | * # Load a target module, let it park on 'busymod_work_func' for | ||
65 | * # thirty seconds | ||
66 | * insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=30 | ||
67 | * | ||
68 | * # Meanwhile load the livepatch | ||
69 | * insmod samples/livepatch/livepatch-callbacks-demo.ko | ||
70 | * | ||
71 | * # ... then load and unload another target module while the | ||
72 | * # transition is in progress | ||
73 | * insmod samples/livepatch/livepatch-callbacks-mod.ko | ||
74 | * rmmod samples/livepatch/livepatch-callbacks-mod.ko | ||
75 | * | ||
76 | * # Finally cleanup | ||
77 | * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled | ||
78 | * rmmod samples/livepatch/livepatch-callbacks-demo.ko | ||
79 | */ | ||
80 | |||
81 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
82 | |||
83 | #include <linux/module.h> | ||
84 | #include <linux/kernel.h> | ||
85 | #include <linux/livepatch.h> | ||
86 | |||
87 | static int pre_patch_ret; | ||
88 | module_param(pre_patch_ret, int, 0644); | ||
89 | MODULE_PARM_DESC(pre_patch_ret, "pre_patch_ret (default=0)"); | ||
90 | |||
91 | static const char *const module_state[] = { | ||
92 | [MODULE_STATE_LIVE] = "[MODULE_STATE_LIVE] Normal state", | ||
93 | [MODULE_STATE_COMING] = "[MODULE_STATE_COMING] Fully formed, running module_init", | ||
94 | [MODULE_STATE_GOING] = "[MODULE_STATE_GOING] Going away", | ||
95 | [MODULE_STATE_UNFORMED] = "[MODULE_STATE_UNFORMED] Still setting it up", | ||
96 | }; | ||
97 | |||
98 | static void callback_info(const char *callback, struct klp_object *obj) | ||
99 | { | ||
100 | if (obj->mod) | ||
101 | pr_info("%s: %s -> %s\n", callback, obj->mod->name, | ||
102 | module_state[obj->mod->state]); | ||
103 | else | ||
104 | pr_info("%s: vmlinux\n", callback); | ||
105 | } | ||
106 | |||
107 | /* Executed on object patching (ie, patch enablement) */ | ||
108 | static int pre_patch_callback(struct klp_object *obj) | ||
109 | { | ||
110 | callback_info(__func__, obj); | ||
111 | return pre_patch_ret; | ||
112 | } | ||
113 | |||
114 | /* Executed after object patching (ie, patch enablement) */ | ||
115 | static void post_patch_callback(struct klp_object *obj) | ||
116 | { | ||
117 | callback_info(__func__, obj); | ||
118 | } | ||
119 | |||
120 | /* Executed on object unpatching (ie, patch disablement) */ | ||
121 | static void pre_unpatch_callback(struct klp_object *obj) | ||
122 | { | ||
123 | callback_info(__func__, obj); | ||
124 | } | ||
125 | |||
126 | /* Executed on object unpatching (ie, patch disablement) */ | ||
127 | static void post_unpatch_callback(struct klp_object *obj) | ||
128 | { | ||
129 | callback_info(__func__, obj); | ||
130 | } | ||
131 | |||
132 | static void patched_work_func(struct work_struct *work) | ||
133 | { | ||
134 | pr_info("%s\n", __func__); | ||
135 | } | ||
136 | |||
137 | static struct klp_func no_funcs[] = { | ||
138 | { } | ||
139 | }; | ||
140 | |||
141 | static struct klp_func busymod_funcs[] = { | ||
142 | { | ||
143 | .old_name = "busymod_work_func", | ||
144 | .new_func = patched_work_func, | ||
145 | }, { } | ||
146 | }; | ||
147 | |||
148 | static struct klp_object objs[] = { | ||
149 | { | ||
150 | .name = NULL, /* vmlinux */ | ||
151 | .funcs = no_funcs, | ||
152 | .callbacks = { | ||
153 | .pre_patch = pre_patch_callback, | ||
154 | .post_patch = post_patch_callback, | ||
155 | .pre_unpatch = pre_unpatch_callback, | ||
156 | .post_unpatch = post_unpatch_callback, | ||
157 | }, | ||
158 | }, { | ||
159 | .name = "livepatch_callbacks_mod", | ||
160 | .funcs = no_funcs, | ||
161 | .callbacks = { | ||
162 | .pre_patch = pre_patch_callback, | ||
163 | .post_patch = post_patch_callback, | ||
164 | .pre_unpatch = pre_unpatch_callback, | ||
165 | .post_unpatch = post_unpatch_callback, | ||
166 | }, | ||
167 | }, { | ||
168 | .name = "livepatch_callbacks_busymod", | ||
169 | .funcs = busymod_funcs, | ||
170 | .callbacks = { | ||
171 | .pre_patch = pre_patch_callback, | ||
172 | .post_patch = post_patch_callback, | ||
173 | .pre_unpatch = pre_unpatch_callback, | ||
174 | .post_unpatch = post_unpatch_callback, | ||
175 | }, | ||
176 | }, { } | ||
177 | }; | ||
178 | |||
179 | static struct klp_patch patch = { | ||
180 | .mod = THIS_MODULE, | ||
181 | .objs = objs, | ||
182 | }; | ||
183 | |||
184 | static int livepatch_callbacks_demo_init(void) | ||
185 | { | ||
186 | return klp_enable_patch(&patch); | ||
187 | } | ||
188 | |||
189 | static void livepatch_callbacks_demo_exit(void) | ||
190 | { | ||
191 | } | ||
192 | |||
193 | module_init(livepatch_callbacks_demo_init); | ||
194 | module_exit(livepatch_callbacks_demo_exit); | ||
195 | MODULE_LICENSE("GPL"); | ||
196 | MODULE_INFO(livepatch, "Y"); | ||
diff --git a/samples/livepatch/livepatch-callbacks-mod.c b/samples/livepatch/livepatch-callbacks-mod.c new file mode 100644 index 000000000..2a074f422 --- /dev/null +++ b/samples/livepatch/livepatch-callbacks-mod.c | |||
@@ -0,0 +1,41 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
2 | /* | ||
3 | * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> | ||
4 | */ | ||
5 | |||
6 | /* | ||
7 | * livepatch-callbacks-mod.c - (un)patching callbacks demo support module | ||
8 | * | ||
9 | * | ||
10 | * Purpose | ||
11 | * ------- | ||
12 | * | ||
13 | * Simple module to demonstrate livepatch (un)patching callbacks. | ||
14 | * | ||
15 | * | ||
16 | * Usage | ||
17 | * ----- | ||
18 | * | ||
19 | * This module is not intended to be standalone. See the "Usage" | ||
20 | * section of livepatch-callbacks-demo.c. | ||
21 | */ | ||
22 | |||
23 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
24 | |||
25 | #include <linux/module.h> | ||
26 | #include <linux/kernel.h> | ||
27 | |||
28 | static int livepatch_callbacks_mod_init(void) | ||
29 | { | ||
30 | pr_info("%s\n", __func__); | ||
31 | return 0; | ||
32 | } | ||
33 | |||
34 | static void livepatch_callbacks_mod_exit(void) | ||
35 | { | ||
36 | pr_info("%s\n", __func__); | ||
37 | } | ||
38 | |||
39 | module_init(livepatch_callbacks_mod_init); | ||
40 | module_exit(livepatch_callbacks_mod_exit); | ||
41 | MODULE_LICENSE("GPL"); | ||
diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c new file mode 100644 index 000000000..cd76d7ebe --- /dev/null +++ b/samples/livepatch/livepatch-sample.c | |||
@@ -0,0 +1,70 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
2 | /* | ||
3 | * livepatch-sample.c - Kernel Live Patching Sample Module | ||
4 | * | ||
5 | * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> | ||
6 | */ | ||
7 | |||
8 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
9 | |||
10 | #include <linux/module.h> | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/livepatch.h> | ||
13 | |||
14 | /* | ||
15 | * This (dumb) live patch overrides the function that prints the | ||
16 | * kernel boot cmdline when /proc/cmdline is read. | ||
17 | * | ||
18 | * Example: | ||
19 | * | ||
20 | * $ cat /proc/cmdline | ||
21 | * <your cmdline> | ||
22 | * | ||
23 | * $ insmod livepatch-sample.ko | ||
24 | * $ cat /proc/cmdline | ||
25 | * this has been live patched | ||
26 | * | ||
27 | * $ echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled | ||
28 | * $ cat /proc/cmdline | ||
29 | * <your cmdline> | ||
30 | */ | ||
31 | |||
32 | #include <linux/seq_file.h> | ||
33 | static int livepatch_cmdline_proc_show(struct seq_file *m, void *v) | ||
34 | { | ||
35 | seq_printf(m, "%s\n", "this has been live patched"); | ||
36 | return 0; | ||
37 | } | ||
38 | |||
39 | static struct klp_func funcs[] = { | ||
40 | { | ||
41 | .old_name = "cmdline_proc_show", | ||
42 | .new_func = livepatch_cmdline_proc_show, | ||
43 | }, { } | ||
44 | }; | ||
45 | |||
46 | static struct klp_object objs[] = { | ||
47 | { | ||
48 | /* name being NULL means vmlinux */ | ||
49 | .funcs = funcs, | ||
50 | }, { } | ||
51 | }; | ||
52 | |||
53 | static struct klp_patch patch = { | ||
54 | .mod = THIS_MODULE, | ||
55 | .objs = objs, | ||
56 | }; | ||
57 | |||
58 | static int livepatch_init(void) | ||
59 | { | ||
60 | return klp_enable_patch(&patch); | ||
61 | } | ||
62 | |||
63 | static void livepatch_exit(void) | ||
64 | { | ||
65 | } | ||
66 | |||
67 | module_init(livepatch_init); | ||
68 | module_exit(livepatch_exit); | ||
69 | MODULE_LICENSE("GPL"); | ||
70 | MODULE_INFO(livepatch, "Y"); | ||
diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c new file mode 100644 index 000000000..918ce17b4 --- /dev/null +++ b/samples/livepatch/livepatch-shadow-fix1.c | |||
@@ -0,0 +1,173 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
2 | /* | ||
3 | * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> | ||
4 | */ | ||
5 | |||
6 | /* | ||
7 | * livepatch-shadow-fix1.c - Shadow variables, livepatch demo | ||
8 | * | ||
9 | * Purpose | ||
10 | * ------- | ||
11 | * | ||
12 | * Fixes the memory leak introduced in livepatch-shadow-mod through the | ||
13 | * use of a shadow variable. This fix demonstrates the "extending" of | ||
14 | * short-lived data structures by patching their allocation and release | ||
15 | * functions. | ||
16 | * | ||
17 | * | ||
18 | * Usage | ||
19 | * ----- | ||
20 | * | ||
21 | * This module is not intended to be standalone. See the "Usage" | ||
22 | * section of livepatch-shadow-mod.c. | ||
23 | */ | ||
24 | |||
25 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
26 | |||
27 | #include <linux/module.h> | ||
28 | #include <linux/kernel.h> | ||
29 | #include <linux/livepatch.h> | ||
30 | #include <linux/slab.h> | ||
31 | |||
32 | /* Shadow variable enums */ | ||
33 | #define SV_LEAK 1 | ||
34 | |||
35 | /* Allocate new dummies every second */ | ||
36 | #define ALLOC_PERIOD 1 | ||
37 | /* Check for expired dummies after a few new ones have been allocated */ | ||
38 | #define CLEANUP_PERIOD (3 * ALLOC_PERIOD) | ||
39 | /* Dummies expire after a few cleanup instances */ | ||
40 | #define EXPIRE_PERIOD (4 * CLEANUP_PERIOD) | ||
41 | |||
42 | struct dummy { | ||
43 | struct list_head list; | ||
44 | unsigned long jiffies_expire; | ||
45 | }; | ||
46 | |||
47 | /* | ||
48 | * The constructor makes more sense together with klp_shadow_get_or_alloc(). | ||
49 | * In this example, it would be safe to assign the pointer also to the shadow | ||
50 | * variable returned by klp_shadow_alloc(). But we wanted to show the more | ||
51 | * complicated use of the API. | ||
52 | */ | ||
53 | static int shadow_leak_ctor(void *obj, void *shadow_data, void *ctor_data) | ||
54 | { | ||
55 | int **shadow_leak = shadow_data; | ||
56 | int **leak = ctor_data; | ||
57 | |||
58 | if (!ctor_data) | ||
59 | return -EINVAL; | ||
60 | |||
61 | *shadow_leak = *leak; | ||
62 | return 0; | ||
63 | } | ||
64 | |||
65 | static struct dummy *livepatch_fix1_dummy_alloc(void) | ||
66 | { | ||
67 | struct dummy *d; | ||
68 | int *leak; | ||
69 | int **shadow_leak; | ||
70 | |||
71 | d = kzalloc(sizeof(*d), GFP_KERNEL); | ||
72 | if (!d) | ||
73 | return NULL; | ||
74 | |||
75 | d->jiffies_expire = jiffies + | ||
76 | msecs_to_jiffies(1000 * EXPIRE_PERIOD); | ||
77 | |||
78 | /* | ||
79 | * Patch: save the extra memory location into a SV_LEAK shadow | ||
80 | * variable. A patched dummy_free routine can later fetch this | ||
81 | * pointer to handle resource release. | ||
82 | */ | ||
83 | leak = kzalloc(sizeof(*leak), GFP_KERNEL); | ||
84 | if (!leak) | ||
85 | goto err_leak; | ||
86 | |||
87 | shadow_leak = klp_shadow_alloc(d, SV_LEAK, sizeof(leak), GFP_KERNEL, | ||
88 | shadow_leak_ctor, &leak); | ||
89 | if (!shadow_leak) { | ||
90 | pr_err("%s: failed to allocate shadow variable for the leaking pointer: dummy @ %p, leak @ %p\n", | ||
91 | __func__, d, leak); | ||
92 | goto err_shadow; | ||
93 | } | ||
94 | |||
95 | pr_info("%s: dummy @ %p, expires @ %lx\n", | ||
96 | __func__, d, d->jiffies_expire); | ||
97 | |||
98 | return d; | ||
99 | |||
100 | err_shadow: | ||
101 | kfree(leak); | ||
102 | err_leak: | ||
103 | kfree(d); | ||
104 | return NULL; | ||
105 | } | ||
106 | |||
107 | static void livepatch_fix1_dummy_leak_dtor(void *obj, void *shadow_data) | ||
108 | { | ||
109 | void *d = obj; | ||
110 | int **shadow_leak = shadow_data; | ||
111 | |||
112 | kfree(*shadow_leak); | ||
113 | pr_info("%s: dummy @ %p, prevented leak @ %p\n", | ||
114 | __func__, d, *shadow_leak); | ||
115 | } | ||
116 | |||
117 | static void livepatch_fix1_dummy_free(struct dummy *d) | ||
118 | { | ||
119 | int **shadow_leak; | ||
120 | |||
121 | /* | ||
122 | * Patch: fetch the saved SV_LEAK shadow variable, detach and | ||
123 | * free it. Note: handle cases where this shadow variable does | ||
124 | * not exist (ie, dummy structures allocated before this livepatch | ||
125 | * was loaded.) | ||
126 | */ | ||
127 | shadow_leak = klp_shadow_get(d, SV_LEAK); | ||
128 | if (shadow_leak) | ||
129 | klp_shadow_free(d, SV_LEAK, livepatch_fix1_dummy_leak_dtor); | ||
130 | else | ||
131 | pr_info("%s: dummy @ %p leaked!\n", __func__, d); | ||
132 | |||
133 | kfree(d); | ||
134 | } | ||
135 | |||
136 | static struct klp_func funcs[] = { | ||
137 | { | ||
138 | .old_name = "dummy_alloc", | ||
139 | .new_func = livepatch_fix1_dummy_alloc, | ||
140 | }, | ||
141 | { | ||
142 | .old_name = "dummy_free", | ||
143 | .new_func = livepatch_fix1_dummy_free, | ||
144 | }, { } | ||
145 | }; | ||
146 | |||
147 | static struct klp_object objs[] = { | ||
148 | { | ||
149 | .name = "livepatch_shadow_mod", | ||
150 | .funcs = funcs, | ||
151 | }, { } | ||
152 | }; | ||
153 | |||
154 | static struct klp_patch patch = { | ||
155 | .mod = THIS_MODULE, | ||
156 | .objs = objs, | ||
157 | }; | ||
158 | |||
159 | static int livepatch_shadow_fix1_init(void) | ||
160 | { | ||
161 | return klp_enable_patch(&patch); | ||
162 | } | ||
163 | |||
164 | static void livepatch_shadow_fix1_exit(void) | ||
165 | { | ||
166 | /* Cleanup any existing SV_LEAK shadow variables */ | ||
167 | klp_shadow_free_all(SV_LEAK, livepatch_fix1_dummy_leak_dtor); | ||
168 | } | ||
169 | |||
170 | module_init(livepatch_shadow_fix1_init); | ||
171 | module_exit(livepatch_shadow_fix1_exit); | ||
172 | MODULE_LICENSE("GPL"); | ||
173 | MODULE_INFO(livepatch, "Y"); | ||
diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c new file mode 100644 index 000000000..29fe5cd42 --- /dev/null +++ b/samples/livepatch/livepatch-shadow-fix2.c | |||
@@ -0,0 +1,132 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
2 | /* | ||
3 | * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> | ||
4 | */ | ||
5 | |||
6 | /* | ||
7 | * livepatch-shadow-fix2.c - Shadow variables, livepatch demo | ||
8 | * | ||
9 | * Purpose | ||
10 | * ------- | ||
11 | * | ||
12 | * Adds functionality to livepatch-shadow-mod's in-flight data | ||
13 | * structures through a shadow variable. The livepatch patches a | ||
14 | * routine that periodically inspects data structures, incrementing a | ||
15 | * per-data-structure counter, creating the counter if needed. | ||
16 | * | ||
17 | * | ||
18 | * Usage | ||
19 | * ----- | ||
20 | * | ||
21 | * This module is not intended to be standalone. See the "Usage" | ||
22 | * section of livepatch-shadow-mod.c. | ||
23 | */ | ||
24 | |||
25 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
26 | |||
27 | #include <linux/module.h> | ||
28 | #include <linux/kernel.h> | ||
29 | #include <linux/livepatch.h> | ||
30 | #include <linux/slab.h> | ||
31 | |||
32 | /* Shadow variable enums */ | ||
33 | #define SV_LEAK 1 | ||
34 | #define SV_COUNTER 2 | ||
35 | |||
36 | struct dummy { | ||
37 | struct list_head list; | ||
38 | unsigned long jiffies_expire; | ||
39 | }; | ||
40 | |||
41 | static bool livepatch_fix2_dummy_check(struct dummy *d, unsigned long jiffies) | ||
42 | { | ||
43 | int *shadow_count; | ||
44 | |||
45 | /* | ||
46 | * Patch: handle in-flight dummy structures, if they do not | ||
47 | * already have a SV_COUNTER shadow variable, then attach a | ||
48 | * new one. | ||
49 | */ | ||
50 | shadow_count = klp_shadow_get_or_alloc(d, SV_COUNTER, | ||
51 | sizeof(*shadow_count), GFP_NOWAIT, | ||
52 | NULL, NULL); | ||
53 | if (shadow_count) | ||
54 | *shadow_count += 1; | ||
55 | |||
56 | return time_after(jiffies, d->jiffies_expire); | ||
57 | } | ||
58 | |||
59 | static void livepatch_fix2_dummy_leak_dtor(void *obj, void *shadow_data) | ||
60 | { | ||
61 | void *d = obj; | ||
62 | int **shadow_leak = shadow_data; | ||
63 | |||
64 | kfree(*shadow_leak); | ||
65 | pr_info("%s: dummy @ %p, prevented leak @ %p\n", | ||
66 | __func__, d, *shadow_leak); | ||
67 | } | ||
68 | |||
69 | static void livepatch_fix2_dummy_free(struct dummy *d) | ||
70 | { | ||
71 | int **shadow_leak; | ||
72 | int *shadow_count; | ||
73 | |||
74 | /* Patch: copy the memory leak patch from the fix1 module. */ | ||
75 | shadow_leak = klp_shadow_get(d, SV_LEAK); | ||
76 | if (shadow_leak) | ||
77 | klp_shadow_free(d, SV_LEAK, livepatch_fix2_dummy_leak_dtor); | ||
78 | else | ||
79 | pr_info("%s: dummy @ %p leaked!\n", __func__, d); | ||
80 | |||
81 | /* | ||
82 | * Patch: fetch the SV_COUNTER shadow variable and display | ||
83 | * the final count. Detach the shadow variable. | ||
84 | */ | ||
85 | shadow_count = klp_shadow_get(d, SV_COUNTER); | ||
86 | if (shadow_count) { | ||
87 | pr_info("%s: dummy @ %p, check counter = %d\n", | ||
88 | __func__, d, *shadow_count); | ||
89 | klp_shadow_free(d, SV_COUNTER, NULL); | ||
90 | } | ||
91 | |||
92 | kfree(d); | ||
93 | } | ||
94 | |||
95 | static struct klp_func funcs[] = { | ||
96 | { | ||
97 | .old_name = "dummy_check", | ||
98 | .new_func = livepatch_fix2_dummy_check, | ||
99 | }, | ||
100 | { | ||
101 | .old_name = "dummy_free", | ||
102 | .new_func = livepatch_fix2_dummy_free, | ||
103 | }, { } | ||
104 | }; | ||
105 | |||
106 | static struct klp_object objs[] = { | ||
107 | { | ||
108 | .name = "livepatch_shadow_mod", | ||
109 | .funcs = funcs, | ||
110 | }, { } | ||
111 | }; | ||
112 | |||
113 | static struct klp_patch patch = { | ||
114 | .mod = THIS_MODULE, | ||
115 | .objs = objs, | ||
116 | }; | ||
117 | |||
118 | static int livepatch_shadow_fix2_init(void) | ||
119 | { | ||
120 | return klp_enable_patch(&patch); | ||
121 | } | ||
122 | |||
123 | static void livepatch_shadow_fix2_exit(void) | ||
124 | { | ||
125 | /* Cleanup any existing SV_COUNTER shadow variables */ | ||
126 | klp_shadow_free_all(SV_COUNTER, NULL); | ||
127 | } | ||
128 | |||
129 | module_init(livepatch_shadow_fix2_init); | ||
130 | module_exit(livepatch_shadow_fix2_exit); | ||
131 | MODULE_LICENSE("GPL"); | ||
132 | MODULE_INFO(livepatch, "Y"); | ||
diff --git a/samples/livepatch/livepatch-shadow-mod.c b/samples/livepatch/livepatch-shadow-mod.c new file mode 100644 index 000000000..7e753b0d2 --- /dev/null +++ b/samples/livepatch/livepatch-shadow-mod.c | |||
@@ -0,0 +1,217 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
2 | /* | ||
3 | * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> | ||
4 | */ | ||
5 | |||
6 | /* | ||
7 | * livepatch-shadow-mod.c - Shadow variables, buggy module demo | ||
8 | * | ||
9 | * Purpose | ||
10 | * ------- | ||
11 | * | ||
12 | * As a demonstration of livepatch shadow variable API, this module | ||
13 | * introduces memory leak behavior that livepatch modules | ||
14 | * livepatch-shadow-fix1.ko and livepatch-shadow-fix2.ko correct and | ||
15 | * enhance. | ||
16 | * | ||
17 | * WARNING - even though the livepatch-shadow-fix modules patch the | ||
18 | * memory leak, please load these modules at your own risk -- some | ||
19 | * amount of memory may be leaked before the bug is patched. | ||
20 | * | ||
21 | * | ||
22 | * Usage | ||
23 | * ----- | ||
24 | * | ||
25 | * Step 1 - Load the buggy demonstration module: | ||
26 | * | ||
27 | * insmod samples/livepatch/livepatch-shadow-mod.ko | ||
28 | * | ||
29 | * Watch dmesg output for a few moments to see new dummies being allocated | ||
30 | * and a periodic cleanup check. (Note: a small amount of memory is | ||
31 | * being leaked.) | ||
32 | * | ||
33 | * | ||
34 | * Step 2 - Load livepatch fix1: | ||
35 | * | ||
36 | * insmod samples/livepatch/livepatch-shadow-fix1.ko | ||
37 | * | ||
38 | * Continue watching dmesg and note that now livepatch_fix1_dummy_free() | ||
39 | * and livepatch_fix1_dummy_alloc() are logging messages about leaked | ||
40 | * memory and, eventually, about prevented leaks. | ||
41 | * | ||
42 | * | ||
43 | * Step 3 - Load livepatch fix2 (on top of fix1): | ||
44 | * | ||
45 | * insmod samples/livepatch/livepatch-shadow-fix2.ko | ||
46 | * | ||
47 | * This module extends functionality through shadow variables, as a new | ||
48 | * "check" counter is added to the dummy structure. Periodic dmesg | ||
49 | * messages will log these as dummies are cleaned up. | ||
50 | * | ||
51 | * | ||
52 | * Step 4 - Cleanup | ||
53 | * | ||
54 | * Unwind the demonstration by disabling the livepatch fix modules, then | ||
55 | * removing them and the demo module: | ||
56 | * | ||
57 | * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix2/enabled | ||
58 | * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix1/enabled | ||
59 | * rmmod livepatch-shadow-fix2 | ||
60 | * rmmod livepatch-shadow-fix1 | ||
61 | * rmmod livepatch-shadow-mod | ||
62 | */ | ||
63 | |||
64 | |||
65 | #include <linux/kernel.h> | ||
66 | #include <linux/module.h> | ||
67 | #include <linux/sched.h> | ||
68 | #include <linux/slab.h> | ||
69 | #include <linux/stat.h> | ||
70 | #include <linux/workqueue.h> | ||
71 | |||
72 | MODULE_LICENSE("GPL"); | ||
73 | MODULE_AUTHOR("Joe Lawrence <joe.lawrence@redhat.com>"); | ||
74 | MODULE_DESCRIPTION("Buggy module for shadow variable demo"); | ||
75 | |||
76 | /* Allocate new dummies every second */ | ||
77 | #define ALLOC_PERIOD 1 | ||
78 | /* Check for expired dummies after a few new ones have been allocated */ | ||
79 | #define CLEANUP_PERIOD (3 * ALLOC_PERIOD) | ||
80 | /* Dummies expire after a few cleanup instances */ | ||
81 | #define EXPIRE_PERIOD (4 * CLEANUP_PERIOD) | ||
82 | |||
83 | /* | ||
84 | * Keep a list of all the dummies so we can clean up any residual ones | ||
85 | * on module exit | ||
86 | */ | ||
87 | static LIST_HEAD(dummy_list); | ||
88 | static DEFINE_MUTEX(dummy_list_mutex); | ||
89 | |||
90 | struct dummy { | ||
91 | struct list_head list; | ||
92 | unsigned long jiffies_expire; | ||
93 | }; | ||
94 | |||
95 | static __used noinline struct dummy *dummy_alloc(void) | ||
96 | { | ||
97 | struct dummy *d; | ||
98 | int *leak; | ||
99 | |||
100 | d = kzalloc(sizeof(*d), GFP_KERNEL); | ||
101 | if (!d) | ||
102 | return NULL; | ||
103 | |||
104 | d->jiffies_expire = jiffies + | ||
105 | msecs_to_jiffies(1000 * EXPIRE_PERIOD); | ||
106 | |||
107 | /* Oops, forgot to save leak! */ | ||
108 | leak = kzalloc(sizeof(*leak), GFP_KERNEL); | ||
109 | if (!leak) { | ||
110 | kfree(d); | ||
111 | return NULL; | ||
112 | } | ||
113 | |||
114 | pr_info("%s: dummy @ %p, expires @ %lx\n", | ||
115 | __func__, d, d->jiffies_expire); | ||
116 | |||
117 | return d; | ||
118 | } | ||
119 | |||
120 | static __used noinline void dummy_free(struct dummy *d) | ||
121 | { | ||
122 | pr_info("%s: dummy @ %p, expired = %lx\n", | ||
123 | __func__, d, d->jiffies_expire); | ||
124 | |||
125 | kfree(d); | ||
126 | } | ||
127 | |||
128 | static __used noinline bool dummy_check(struct dummy *d, | ||
129 | unsigned long jiffies) | ||
130 | { | ||
131 | return time_after(jiffies, d->jiffies_expire); | ||
132 | } | ||
133 | |||
134 | /* | ||
135 | * alloc_work_func: allocates new dummy structures, allocates additional | ||
136 | * memory, aptly named "leak", but doesn't keep | ||
137 | * permanent record of it. | ||
138 | */ | ||
139 | |||
140 | static void alloc_work_func(struct work_struct *work); | ||
141 | static DECLARE_DELAYED_WORK(alloc_dwork, alloc_work_func); | ||
142 | |||
143 | static void alloc_work_func(struct work_struct *work) | ||
144 | { | ||
145 | struct dummy *d; | ||
146 | |||
147 | d = dummy_alloc(); | ||
148 | if (!d) | ||
149 | return; | ||
150 | |||
151 | mutex_lock(&dummy_list_mutex); | ||
152 | list_add(&d->list, &dummy_list); | ||
153 | mutex_unlock(&dummy_list_mutex); | ||
154 | |||
155 | schedule_delayed_work(&alloc_dwork, | ||
156 | msecs_to_jiffies(1000 * ALLOC_PERIOD)); | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * cleanup_work_func: frees dummy structures. Without knowledge of | ||
161 | * "leak", it leaks the additional memory that | ||
162 | * alloc_work_func created. | ||
163 | */ | ||
164 | |||
165 | static void cleanup_work_func(struct work_struct *work); | ||
166 | static DECLARE_DELAYED_WORK(cleanup_dwork, cleanup_work_func); | ||
167 | |||
168 | static void cleanup_work_func(struct work_struct *work) | ||
169 | { | ||
170 | struct dummy *d, *tmp; | ||
171 | unsigned long j; | ||
172 | |||
173 | j = jiffies; | ||
174 | pr_info("%s: jiffies = %lx\n", __func__, j); | ||
175 | |||
176 | mutex_lock(&dummy_list_mutex); | ||
177 | list_for_each_entry_safe(d, tmp, &dummy_list, list) { | ||
178 | |||
179 | /* Kick out and free any expired dummies */ | ||
180 | if (dummy_check(d, j)) { | ||
181 | list_del(&d->list); | ||
182 | dummy_free(d); | ||
183 | } | ||
184 | } | ||
185 | mutex_unlock(&dummy_list_mutex); | ||
186 | |||
187 | schedule_delayed_work(&cleanup_dwork, | ||
188 | msecs_to_jiffies(1000 * CLEANUP_PERIOD)); | ||
189 | } | ||
190 | |||
191 | static int livepatch_shadow_mod_init(void) | ||
192 | { | ||
193 | schedule_delayed_work(&alloc_dwork, | ||
194 | msecs_to_jiffies(1000 * ALLOC_PERIOD)); | ||
195 | schedule_delayed_work(&cleanup_dwork, | ||
196 | msecs_to_jiffies(1000 * CLEANUP_PERIOD)); | ||
197 | |||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static void livepatch_shadow_mod_exit(void) | ||
202 | { | ||
203 | struct dummy *d, *tmp; | ||
204 | |||
205 | /* Wait for any dummies at work */ | ||
206 | cancel_delayed_work_sync(&alloc_dwork); | ||
207 | cancel_delayed_work_sync(&cleanup_dwork); | ||
208 | |||
209 | /* Cleanup residual dummies */ | ||
210 | list_for_each_entry_safe(d, tmp, &dummy_list, list) { | ||
211 | list_del(&d->list); | ||
212 | dummy_free(d); | ||
213 | } | ||
214 | } | ||
215 | |||
216 | module_init(livepatch_shadow_mod_init); | ||
217 | module_exit(livepatch_shadow_mod_exit); | ||
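dummy_alloc() above deliberately drops the "leak" pointer, and livepatch_fix2_dummy_free() earlier in this commit expects an SV_LEAK shadow variable that the livepatch-shadow-fix1 module (not shown here) attaches. A hedged sketch of how a patched allocation path can stash such an otherwise-lost pointer with klp_shadow_alloc() follows; treat the SV_LEAK id and the GFP flags as assumptions carried over from the fix modules rather than a verbatim copy of fix1.

	/* Sketch: inside a patched dummy_alloc(), right after "leak" is allocated,
	 * attach the pointer to its dummy as shadow data so it can be found later. */
		int **shadow_leak;

		shadow_leak = klp_shadow_alloc(d, SV_LEAK, sizeof(leak), GFP_KERNEL,
					       NULL, NULL);
		if (shadow_leak)
			*shadow_leak = leak;
		else
			pr_err("%s: failed to attach SV_LEAK shadow variable\n", __func__);

With the pointer recorded, a patched free path can retrieve it with klp_shadow_get(d, SV_LEAK), kfree() the memory, and release the shadow variable with klp_shadow_free() - which is exactly the cleanup that livepatch_fix2_dummy_free() performs above.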
diff --git a/samples/mei/.gitignore b/samples/mei/.gitignore new file mode 100644 index 000000000..db5e802f0 --- /dev/null +++ b/samples/mei/.gitignore | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | mei-amt-version | ||
diff --git a/samples/mei/Makefile b/samples/mei/Makefile new file mode 100644 index 000000000..c54b8a0ab --- /dev/null +++ b/samples/mei/Makefile | |||
@@ -0,0 +1,5 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | ||
2 | # Copyright (c) 2012-2019, Intel Corporation. All rights reserved. | ||
3 | userprogs-always-y += mei-amt-version | ||
4 | |||
5 | userccflags += -I usr/include | ||
diff --git a/samples/mei/mei-amt-version.c b/samples/mei/mei-amt-version.c new file mode 100644 index 000000000..ad3e56042 --- /dev/null +++ b/samples/mei/mei-amt-version.c | |||
@@ -0,0 +1,479 @@ | |||
1 | /****************************************************************************** | ||
2 | * Intel Management Engine Interface (Intel MEI) Linux driver | ||
3 | * Intel MEI Interface Header | ||
4 | * | ||
5 | * This file is provided under a dual BSD/GPLv2 license. When using or | ||
6 | * redistributing this file, you may do so under either license. | ||
7 | * | ||
8 | * GPL LICENSE SUMMARY | ||
9 | * | ||
10 | * Copyright(c) 2012 Intel Corporation. All rights reserved. | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of version 2 of the GNU General Public License as | ||
14 | * published by the Free Software Foundation. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, but | ||
17 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
19 | * General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program; if not, write to the Free Software | ||
23 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, | ||
24 | * USA | ||
25 | * | ||
26 | * The full GNU General Public License is included in this distribution | ||
27 | * in the file called LICENSE.GPL. | ||
28 | * | ||
29 | * Contact Information: | ||
30 | * Intel Corporation. | ||
31 | * linux-mei@linux.intel.com | ||
32 | * http://www.intel.com | ||
33 | * | ||
34 | * BSD LICENSE | ||
35 | * | ||
36 | * Copyright(c) 2003 - 2012 Intel Corporation. All rights reserved. | ||
37 | * All rights reserved. | ||
38 | * | ||
39 | * Redistribution and use in source and binary forms, with or without | ||
40 | * modification, are permitted provided that the following conditions | ||
41 | * are met: | ||
42 | * | ||
43 | * * Redistributions of source code must retain the above copyright | ||
44 | * notice, this list of conditions and the following disclaimer. | ||
45 | * * Redistributions in binary form must reproduce the above copyright | ||
46 | * notice, this list of conditions and the following disclaimer in | ||
47 | * the documentation and/or other materials provided with the | ||
48 | * distribution. | ||
49 | * * Neither the name Intel Corporation nor the names of its | ||
50 | * contributors may be used to endorse or promote products derived | ||
51 | * from this software without specific prior written permission. | ||
52 | * | ||
53 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
54 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
55 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
56 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
57 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
58 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
59 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
60 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
61 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
62 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
63 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
64 | * | ||
65 | *****************************************************************************/ | ||
66 | |||
67 | #include <stdio.h> | ||
68 | #include <stdlib.h> | ||
69 | #include <string.h> | ||
70 | #include <fcntl.h> | ||
71 | #include <sys/ioctl.h> | ||
72 | #include <unistd.h> | ||
73 | #include <errno.h> | ||
74 | #include <stdint.h> | ||
75 | #include <stdbool.h> | ||
76 | #include <bits/wordsize.h> | ||
77 | #include <linux/mei.h> | ||
78 | |||
79 | /***************************************************************************** | ||
80 | * Intel Management Engine Interface | ||
81 | *****************************************************************************/ | ||
82 | |||
83 | #define mei_msg(_me, fmt, ARGS...) do { \ | ||
84 | if (_me->verbose) \ | ||
85 | fprintf(stderr, fmt, ##ARGS); \ | ||
86 | } while (0) | ||
87 | |||
88 | #define mei_err(_me, fmt, ARGS...) do { \ | ||
89 | fprintf(stderr, "Error: " fmt, ##ARGS); \ | ||
90 | } while (0) | ||
91 | |||
92 | struct mei { | ||
93 | uuid_le guid; | ||
94 | bool initialized; | ||
95 | bool verbose; | ||
96 | unsigned int buf_size; | ||
97 | unsigned char prot_ver; | ||
98 | int fd; | ||
99 | }; | ||
100 | |||
101 | static void mei_deinit(struct mei *cl) | ||
102 | { | ||
103 | if (cl->fd != -1) | ||
104 | close(cl->fd); | ||
105 | cl->fd = -1; | ||
106 | cl->buf_size = 0; | ||
107 | cl->prot_ver = 0; | ||
108 | cl->initialized = false; | ||
109 | } | ||
110 | |||
111 | static bool mei_init(struct mei *me, const uuid_le *guid, | ||
112 | unsigned char req_protocol_version, bool verbose) | ||
113 | { | ||
114 | int result; | ||
115 | struct mei_client *cl; | ||
116 | struct mei_connect_client_data data; | ||
117 | |||
118 | me->verbose = verbose; | ||
119 | |||
120 | me->fd = open("/dev/mei0", O_RDWR); | ||
121 | if (me->fd == -1) { | ||
122 | mei_err(me, "Cannot establish a handle to the Intel MEI driver\n"); | ||
123 | goto err; | ||
124 | } | ||
125 | memcpy(&me->guid, guid, sizeof(*guid)); | ||
126 | memset(&data, 0, sizeof(data)); | ||
127 | me->initialized = true; | ||
128 | |||
129 | memcpy(&data.in_client_uuid, &me->guid, sizeof(me->guid)); | ||
130 | result = ioctl(me->fd, IOCTL_MEI_CONNECT_CLIENT, &data); | ||
131 | if (result) { | ||
132 | mei_err(me, "IOCTL_MEI_CONNECT_CLIENT receive message. err=%d\n", result); | ||
133 | goto err; | ||
134 | } | ||
135 | cl = &data.out_client_properties; | ||
136 | mei_msg(me, "max_message_length %d\n", cl->max_msg_length); | ||
137 | mei_msg(me, "protocol_version %d\n", cl->protocol_version); | ||
138 | |||
139 | if ((req_protocol_version > 0) && | ||
140 | (cl->protocol_version != req_protocol_version)) { | ||
141 | mei_err(me, "Intel MEI protocol version not supported\n"); | ||
142 | goto err; | ||
143 | } | ||
144 | |||
145 | me->buf_size = cl->max_msg_length; | ||
146 | me->prot_ver = cl->protocol_version; | ||
147 | |||
148 | return true; | ||
149 | err: | ||
150 | mei_deinit(me); | ||
151 | return false; | ||
152 | } | ||
153 | |||
154 | static ssize_t mei_recv_msg(struct mei *me, unsigned char *buffer, | ||
155 | ssize_t len, unsigned long timeout) | ||
156 | { | ||
157 | ssize_t rc; | ||
158 | |||
159 | mei_msg(me, "call read length = %zd\n", len); | ||
160 | |||
161 | rc = read(me->fd, buffer, len); | ||
162 | if (rc < 0) { | ||
163 | mei_err(me, "read failed with status %zd %s\n", | ||
164 | rc, strerror(errno)); | ||
165 | mei_deinit(me); | ||
166 | } else { | ||
167 | mei_msg(me, "read succeeded with result %zd\n", rc); | ||
168 | } | ||
169 | return rc; | ||
170 | } | ||
171 | |||
172 | static ssize_t mei_send_msg(struct mei *me, const unsigned char *buffer, | ||
173 | ssize_t len, unsigned long timeout) | ||
174 | { | ||
175 | struct timeval tv; | ||
176 | ssize_t written; | ||
177 | ssize_t rc; | ||
178 | fd_set set; | ||
179 | |||
180 | tv.tv_sec = timeout / 1000; | ||
181 | tv.tv_usec = (timeout % 1000) * 1000; /* milliseconds to microseconds */ | ||
182 | |||
183 | mei_msg(me, "call write length = %zd\n", len); | ||
184 | |||
185 | written = write(me->fd, buffer, len); | ||
186 | if (written < 0) { | ||
187 | rc = -errno; | ||
188 | mei_err(me, "write failed with status %zd %s\n", | ||
189 | written, strerror(errno)); | ||
190 | goto out; | ||
191 | } | ||
192 | |||
193 | FD_ZERO(&set); | ||
194 | FD_SET(me->fd, &set); | ||
195 | rc = select(me->fd + 1, &set, NULL, NULL, &tv); | ||
196 | if (rc > 0 && FD_ISSET(me->fd, &set)) { | ||
197 | mei_msg(me, "write success\n"); | ||
198 | } else if (rc == 0) { | ||
199 | mei_err(me, "write failed on timeout with status\n"); | ||
200 | goto out; | ||
201 | } else { /* rc < 0 */ | ||
202 | mei_err(me, "write failed on select with status %zd\n", rc); | ||
203 | goto out; | ||
204 | } | ||
205 | |||
206 | rc = written; | ||
207 | out: | ||
208 | if (rc < 0) | ||
209 | mei_deinit(me); | ||
210 | |||
211 | return rc; | ||
212 | } | ||
213 | |||
214 | /*************************************************************************** | ||
215 | * Intel Advanced Management Technology ME Client | ||
216 | ***************************************************************************/ | ||
217 | |||
218 | #define AMT_MAJOR_VERSION 1 | ||
219 | #define AMT_MINOR_VERSION 1 | ||
220 | |||
221 | #define AMT_STATUS_SUCCESS 0x0 | ||
222 | #define AMT_STATUS_INTERNAL_ERROR 0x1 | ||
223 | #define AMT_STATUS_NOT_READY 0x2 | ||
224 | #define AMT_STATUS_INVALID_AMT_MODE 0x3 | ||
225 | #define AMT_STATUS_INVALID_MESSAGE_LENGTH 0x4 | ||
226 | |||
227 | #define AMT_STATUS_HOST_IF_EMPTY_RESPONSE 0x4000 | ||
228 | #define AMT_STATUS_SDK_RESOURCES 0x1004 | ||
229 | |||
230 | |||
231 | #define AMT_BIOS_VERSION_LEN 65 | ||
232 | #define AMT_VERSIONS_NUMBER 50 | ||
233 | #define AMT_UNICODE_STRING_LEN 20 | ||
234 | |||
235 | struct amt_unicode_string { | ||
236 | uint16_t length; | ||
237 | char string[AMT_UNICODE_STRING_LEN]; | ||
238 | } __attribute__((packed)); | ||
239 | |||
240 | struct amt_version_type { | ||
241 | struct amt_unicode_string description; | ||
242 | struct amt_unicode_string version; | ||
243 | } __attribute__((packed)); | ||
244 | |||
245 | struct amt_version { | ||
246 | uint8_t major; | ||
247 | uint8_t minor; | ||
248 | } __attribute__((packed)); | ||
249 | |||
250 | struct amt_code_versions { | ||
251 | uint8_t bios[AMT_BIOS_VERSION_LEN]; | ||
252 | uint32_t count; | ||
253 | struct amt_version_type versions[AMT_VERSIONS_NUMBER]; | ||
254 | } __attribute__((packed)); | ||
255 | |||
256 | /*************************************************************************** | ||
257 | * Intel Advanced Management Technology Host Interface | ||
258 | ***************************************************************************/ | ||
259 | |||
260 | struct amt_host_if_msg_header { | ||
261 | struct amt_version version; | ||
262 | uint16_t _reserved; | ||
263 | uint32_t command; | ||
264 | uint32_t length; | ||
265 | } __attribute__((packed)); | ||
266 | |||
267 | struct amt_host_if_resp_header { | ||
268 | struct amt_host_if_msg_header header; | ||
269 | uint32_t status; | ||
270 | unsigned char data[]; | ||
271 | } __attribute__((packed)); | ||
272 | |||
273 | const uuid_le MEI_IAMTHIF = UUID_LE(0x12f80028, 0xb4b7, 0x4b2d, \ | ||
274 | 0xac, 0xa8, 0x46, 0xe0, 0xff, 0x65, 0x81, 0x4c); | ||
275 | |||
276 | #define AMT_HOST_IF_CODE_VERSIONS_REQUEST 0x0400001A | ||
277 | #define AMT_HOST_IF_CODE_VERSIONS_RESPONSE 0x0480001A | ||
278 | |||
279 | const struct amt_host_if_msg_header CODE_VERSION_REQ = { | ||
280 | .version = {AMT_MAJOR_VERSION, AMT_MINOR_VERSION}, | ||
281 | ._reserved = 0, | ||
282 | .command = AMT_HOST_IF_CODE_VERSIONS_REQUEST, | ||
283 | .length = 0 | ||
284 | }; | ||
285 | |||
286 | |||
287 | struct amt_host_if { | ||
288 | struct mei mei_cl; | ||
289 | unsigned long send_timeout; | ||
290 | bool initialized; | ||
291 | }; | ||
292 | |||
293 | |||
294 | static bool amt_host_if_init(struct amt_host_if *acmd, | ||
295 | unsigned long send_timeout, bool verbose) | ||
296 | { | ||
297 | acmd->send_timeout = (send_timeout) ? send_timeout : 20000; | ||
298 | acmd->initialized = mei_init(&acmd->mei_cl, &MEI_IAMTHIF, 0, verbose); | ||
299 | return acmd->initialized; | ||
300 | } | ||
301 | |||
302 | static void amt_host_if_deinit(struct amt_host_if *acmd) | ||
303 | { | ||
304 | mei_deinit(&acmd->mei_cl); | ||
305 | acmd->initialized = false; | ||
306 | } | ||
307 | |||
308 | static uint32_t amt_verify_code_versions(const struct amt_host_if_resp_header *resp) | ||
309 | { | ||
310 | uint32_t status = AMT_STATUS_SUCCESS; | ||
311 | struct amt_code_versions *code_ver; | ||
312 | size_t code_ver_len; | ||
313 | uint32_t ver_type_cnt; | ||
314 | uint32_t len; | ||
315 | uint32_t i; | ||
316 | |||
317 | code_ver = (struct amt_code_versions *)resp->data; | ||
318 | /* length - sizeof(status) */ | ||
319 | code_ver_len = resp->header.length - sizeof(uint32_t); | ||
320 | ver_type_cnt = code_ver_len - | ||
321 | sizeof(code_ver->bios) - | ||
322 | sizeof(code_ver->count); | ||
323 | if (code_ver->count != ver_type_cnt / sizeof(struct amt_version_type)) { | ||
324 | status = AMT_STATUS_INTERNAL_ERROR; | ||
325 | goto out; | ||
326 | } | ||
327 | |||
328 | for (i = 0; i < code_ver->count; i++) { | ||
329 | len = code_ver->versions[i].description.length; | ||
330 | |||
331 | if (len > AMT_UNICODE_STRING_LEN) { | ||
332 | status = AMT_STATUS_INTERNAL_ERROR; | ||
333 | goto out; | ||
334 | } | ||
335 | |||
336 | len = code_ver->versions[i].version.length; | ||
337 | if (code_ver->versions[i].version.string[len] != '\0' || | ||
338 | len != strlen(code_ver->versions[i].version.string)) { | ||
339 | status = AMT_STATUS_INTERNAL_ERROR; | ||
340 | goto out; | ||
341 | } | ||
342 | } | ||
343 | out: | ||
344 | return status; | ||
345 | } | ||
346 | |||
347 | static uint32_t amt_verify_response_header(uint32_t command, | ||
348 | const struct amt_host_if_msg_header *resp_hdr, | ||
349 | uint32_t response_size) | ||
350 | { | ||
351 | if (response_size < sizeof(struct amt_host_if_resp_header)) { | ||
352 | return AMT_STATUS_INTERNAL_ERROR; | ||
353 | } else if (response_size != (resp_hdr->length + | ||
354 | sizeof(struct amt_host_if_msg_header))) { | ||
355 | return AMT_STATUS_INTERNAL_ERROR; | ||
356 | } else if (resp_hdr->command != command) { | ||
357 | return AMT_STATUS_INTERNAL_ERROR; | ||
358 | } else if (resp_hdr->_reserved != 0) { | ||
359 | return AMT_STATUS_INTERNAL_ERROR; | ||
360 | } else if (resp_hdr->version.major != AMT_MAJOR_VERSION || | ||
361 | resp_hdr->version.minor < AMT_MINOR_VERSION) { | ||
362 | return AMT_STATUS_INTERNAL_ERROR; | ||
363 | } | ||
364 | return AMT_STATUS_SUCCESS; | ||
365 | } | ||
366 | |||
367 | static uint32_t amt_host_if_call(struct amt_host_if *acmd, | ||
368 | const unsigned char *command, ssize_t command_sz, | ||
369 | uint8_t **read_buf, uint32_t rcmd, | ||
370 | unsigned int expected_sz) | ||
371 | { | ||
372 | uint32_t in_buf_sz; | ||
373 | ssize_t out_buf_sz; | ||
374 | ssize_t written; | ||
375 | uint32_t status; | ||
376 | struct amt_host_if_resp_header *msg_hdr; | ||
377 | |||
378 | in_buf_sz = acmd->mei_cl.buf_size; | ||
379 | *read_buf = (uint8_t *)malloc(sizeof(uint8_t) * in_buf_sz); | ||
380 | if (*read_buf == NULL) | ||
381 | return AMT_STATUS_SDK_RESOURCES; | ||
382 | memset(*read_buf, 0, in_buf_sz); | ||
383 | msg_hdr = (struct amt_host_if_resp_header *)*read_buf; | ||
384 | |||
385 | written = mei_send_msg(&acmd->mei_cl, | ||
386 | command, command_sz, acmd->send_timeout); | ||
387 | if (written != command_sz) | ||
388 | return AMT_STATUS_INTERNAL_ERROR; | ||
389 | |||
390 | out_buf_sz = mei_recv_msg(&acmd->mei_cl, *read_buf, in_buf_sz, 2000); | ||
391 | if (out_buf_sz <= 0) | ||
392 | return AMT_STATUS_HOST_IF_EMPTY_RESPONSE; | ||
393 | |||
394 | status = msg_hdr->status; | ||
395 | if (status != AMT_STATUS_SUCCESS) | ||
396 | return status; | ||
397 | |||
398 | status = amt_verify_response_header(rcmd, | ||
399 | &msg_hdr->header, out_buf_sz); | ||
400 | if (status != AMT_STATUS_SUCCESS) | ||
401 | return status; | ||
402 | |||
403 | if (expected_sz && expected_sz != out_buf_sz) | ||
404 | return AMT_STATUS_INTERNAL_ERROR; | ||
405 | |||
406 | return AMT_STATUS_SUCCESS; | ||
407 | } | ||
408 | |||
409 | |||
410 | static uint32_t amt_get_code_versions(struct amt_host_if *cmd, | ||
411 | struct amt_code_versions *versions) | ||
412 | { | ||
413 | struct amt_host_if_resp_header *response = NULL; | ||
414 | uint32_t status; | ||
415 | |||
416 | status = amt_host_if_call(cmd, | ||
417 | (const unsigned char *)&CODE_VERSION_REQ, | ||
418 | sizeof(CODE_VERSION_REQ), | ||
419 | (uint8_t **)&response, | ||
420 | AMT_HOST_IF_CODE_VERSIONS_RESPONSE, 0); | ||
421 | |||
422 | if (status != AMT_STATUS_SUCCESS) | ||
423 | goto out; | ||
424 | |||
425 | status = amt_verify_code_versions(response); | ||
426 | if (status != AMT_STATUS_SUCCESS) | ||
427 | goto out; | ||
428 | |||
429 | memcpy(versions, response->data, sizeof(struct amt_code_versions)); | ||
430 | out: | ||
431 | if (response != NULL) | ||
432 | free(response); | ||
433 | |||
434 | return status; | ||
435 | } | ||
436 | |||
437 | /************************** end of amt_host_if_command ***********************/ | ||
438 | int main(int argc, char **argv) | ||
439 | { | ||
440 | struct amt_code_versions ver; | ||
441 | struct amt_host_if acmd; | ||
442 | unsigned int i; | ||
443 | uint32_t status; | ||
444 | int ret; | ||
445 | bool verbose; | ||
446 | |||
447 | verbose = (argc > 1 && strcmp(argv[1], "-v") == 0); | ||
448 | |||
449 | if (!amt_host_if_init(&acmd, 5000, verbose)) { | ||
450 | ret = 1; | ||
451 | goto out; | ||
452 | } | ||
453 | |||
454 | status = amt_get_code_versions(&acmd, &ver); | ||
455 | |||
456 | amt_host_if_deinit(&acmd); | ||
457 | |||
458 | switch (status) { | ||
459 | case AMT_STATUS_HOST_IF_EMPTY_RESPONSE: | ||
460 | printf("Intel AMT: DISABLED\n"); | ||
461 | ret = 0; | ||
462 | break; | ||
463 | case AMT_STATUS_SUCCESS: | ||
464 | printf("Intel AMT: ENABLED\n"); | ||
465 | for (i = 0; i < ver.count; i++) { | ||
466 | printf("%s:\t%s\n", ver.versions[i].description.string, | ||
467 | ver.versions[i].version.string); | ||
468 | } | ||
469 | ret = 0; | ||
470 | break; | ||
471 | default: | ||
472 | printf("An error has occurred\n"); | ||
473 | ret = 1; | ||
474 | break; | ||
475 | } | ||
476 | |||
477 | out: | ||
478 | return ret; | ||
479 | } | ||
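Note that mei_recv_msg() above accepts a timeout argument but never consults it, so the read() can block indefinitely if the firmware client never answers. A possible guard, sketched here as a drop-in helper for this file and under the assumption that select() on the /dev/mei0 descriptor behaves for reads as it already does on the write path, would mirror the pattern used in mei_send_msg():

	static ssize_t mei_recv_msg_timeout(struct mei *me, unsigned char *buffer,
					    ssize_t len, unsigned long timeout)
	{
		struct timeval tv;
		fd_set set;
		ssize_t rc;

		tv.tv_sec = timeout / 1000;
		tv.tv_usec = (timeout % 1000) * 1000;	/* milliseconds to microseconds */

		FD_ZERO(&set);
		FD_SET(me->fd, &set);

		/* Wait until data is available or the timeout expires. */
		rc = select(me->fd + 1, &set, NULL, NULL, &tv);
		if (rc <= 0) {
			mei_err(me, "read timed out or select failed\n");
			mei_deinit(me);
			return rc < 0 ? -errno : -1;
		}

		rc = read(me->fd, buffer, len);
		if (rc < 0) {
			mei_err(me, "read failed with status %zd %s\n",
				rc, strerror(errno));
			mei_deinit(me);
		}
		return rc;
	}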
diff --git a/samples/nitro_enclaves/.gitignore b/samples/nitro_enclaves/.gitignore new file mode 100644 index 000000000..827934129 --- /dev/null +++ b/samples/nitro_enclaves/.gitignore | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | ||
2 | ne_ioctl_sample | ||
diff --git a/samples/nitro_enclaves/Makefile b/samples/nitro_enclaves/Makefile new file mode 100644 index 000000000..a3ec78fef --- /dev/null +++ b/samples/nitro_enclaves/Makefile | |||
@@ -0,0 +1,16 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | ||
2 | # | ||
3 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
4 | |||
5 | # Enclave lifetime management support for Nitro Enclaves (NE) - ioctl sample | ||
6 | # usage. | ||
7 | |||
8 | .PHONY: all clean | ||
9 | |||
10 | CFLAGS += -Wall | ||
11 | |||
12 | all: | ||
13 | $(CC) $(CFLAGS) -o ne_ioctl_sample ne_ioctl_sample.c -lpthread | ||
14 | |||
15 | clean: | ||
16 | rm -f ne_ioctl_sample | ||
diff --git a/samples/nitro_enclaves/ne_ioctl_sample.c b/samples/nitro_enclaves/ne_ioctl_sample.c new file mode 100644 index 000000000..480b76314 --- /dev/null +++ b/samples/nitro_enclaves/ne_ioctl_sample.c | |||
@@ -0,0 +1,883 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
4 | */ | ||
5 | |||
6 | /** | ||
7 | * DOC: Sample flow of using the ioctl interface provided by the Nitro Enclaves (NE) | ||
8 | * kernel driver. | ||
9 | * | ||
10 | * Usage | ||
11 | * ----- | ||
12 | * | ||
13 | * Load the nitro_enclaves module, setting also the enclave CPU pool. The | ||
14 | * enclave CPUs need to be full cores from the same NUMA node. CPU 0 and its | ||
15 | * siblings have to remain available for the primary / parent VM, so they | ||
16 | * cannot be included in the enclave CPU pool. | ||
17 | * | ||
18 | * See the cpu list section from the kernel documentation. | ||
19 | * https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists | ||
20 | * | ||
21 | * insmod drivers/virt/nitro_enclaves/nitro_enclaves.ko | ||
22 | * lsmod | ||
23 | * | ||
24 | * The CPU pool can be set at runtime, after the kernel module is loaded. | ||
25 | * | ||
26 | * echo <cpu-list> > /sys/module/nitro_enclaves/parameters/ne_cpus | ||
27 | * | ||
28 | * NUMA and CPU siblings information can be found using: | ||
29 | * | ||
30 | * lscpu | ||
31 | * /proc/cpuinfo | ||
32 | * | ||
33 | * Check the online / offline CPU list. The CPUs from the pool should be | ||
34 | * offlined. | ||
35 | * | ||
36 | * lscpu | ||
37 | * | ||
38 | * Check dmesg for any warnings / errors through the NE driver lifetime / usage. | ||
39 | * The NE logs contain the "nitro_enclaves" or "pci 0000:00:02.0" pattern. | ||
40 | * | ||
41 | * dmesg | ||
42 | * | ||
43 | * Setup hugetlbfs huge pages. The memory needs to be from the same NUMA node as | ||
44 | * the enclave CPUs. | ||
45 | * | ||
46 | * https://www.kernel.org/doc/html/latest/admin-guide/mm/hugetlbpage.html | ||
47 | * | ||
48 | * By default, the allocation of hugetlb pages is distributed across all possible | ||
49 | * NUMA nodes. Use the following configuration files to set the number of huge | ||
50 | * pages from a NUMA node: | ||
51 | * | ||
52 | * /sys/devices/system/node/node<X>/hugepages/hugepages-2048kB/nr_hugepages | ||
53 | * /sys/devices/system/node/node<X>/hugepages/hugepages-1048576kB/nr_hugepages | ||
54 | * | ||
55 | * or, if not on a system with multiple NUMA nodes, you can also set the number | ||
56 | * of 2 MiB / 1 GiB huge pages using | ||
57 | * | ||
58 | * /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages | ||
59 | * /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages | ||
60 | * | ||
61 | * In this example 256 hugepages of 2 MiB are used. | ||
62 | * | ||
63 | * Build and run the NE sample. | ||
64 | * | ||
65 | * make -C samples/nitro_enclaves clean | ||
66 | * make -C samples/nitro_enclaves | ||
67 | * ./samples/nitro_enclaves/ne_ioctl_sample <path_to_enclave_image> | ||
68 | * | ||
69 | * Unload the nitro_enclaves module. | ||
70 | * | ||
71 | * rmmod nitro_enclaves | ||
72 | * lsmod | ||
73 | */ | ||
74 | |||
75 | #include <stdio.h> | ||
76 | #include <stdlib.h> | ||
77 | #include <errno.h> | ||
78 | #include <fcntl.h> | ||
79 | #include <limits.h> | ||
80 | #include <poll.h> | ||
81 | #include <pthread.h> | ||
82 | #include <string.h> | ||
83 | #include <sys/eventfd.h> | ||
84 | #include <sys/ioctl.h> | ||
85 | #include <sys/mman.h> | ||
86 | #include <sys/socket.h> | ||
87 | #include <sys/stat.h> | ||
88 | #include <sys/types.h> | ||
89 | #include <unistd.h> | ||
90 | |||
91 | #include <linux/mman.h> | ||
92 | #include <linux/nitro_enclaves.h> | ||
93 | #include <linux/vm_sockets.h> | ||
94 | |||
95 | /** | ||
96 | * NE_DEV_NAME - Nitro Enclaves (NE) misc device that provides the ioctl interface. | ||
97 | */ | ||
98 | #define NE_DEV_NAME "/dev/nitro_enclaves" | ||
99 | |||
100 | /** | ||
101 | * NE_POLL_WAIT_TIME - Timeout in seconds for each poll event. | ||
102 | */ | ||
103 | #define NE_POLL_WAIT_TIME (60) | ||
104 | /** | ||
105 | * NE_POLL_WAIT_TIME_MS - Timeout in milliseconds for each poll event. | ||
106 | */ | ||
107 | #define NE_POLL_WAIT_TIME_MS (NE_POLL_WAIT_TIME * 1000) | ||
108 | |||
109 | /** | ||
110 | * NE_SLEEP_TIME - Amount of time in seconds for the process to keep the enclave alive. | ||
111 | */ | ||
112 | #define NE_SLEEP_TIME (300) | ||
113 | |||
114 | /** | ||
115 | * NE_DEFAULT_NR_VCPUS - Default number of vCPUs set for an enclave. | ||
116 | */ | ||
117 | #define NE_DEFAULT_NR_VCPUS (2) | ||
118 | |||
119 | /** | ||
120 | * NE_MIN_MEM_REGION_SIZE - Minimum size of a memory region - 2 MiB. | ||
121 | */ | ||
122 | #define NE_MIN_MEM_REGION_SIZE (2 * 1024 * 1024) | ||
123 | |||
124 | /** | ||
125 | * NE_DEFAULT_NR_MEM_REGIONS - Default number of memory regions of 2 MiB set for | ||
126 | * an enclave. | ||
127 | */ | ||
128 | #define NE_DEFAULT_NR_MEM_REGIONS (256) | ||
129 | |||
130 | /** | ||
131 | * NE_IMAGE_LOAD_HEARTBEAT_CID - Vsock CID for enclave image loading heartbeat logic. | ||
132 | */ | ||
133 | #define NE_IMAGE_LOAD_HEARTBEAT_CID (3) | ||
134 | /** | ||
135 | * NE_IMAGE_LOAD_HEARTBEAT_PORT - Vsock port for enclave image loading heartbeat logic. | ||
136 | */ | ||
137 | #define NE_IMAGE_LOAD_HEARTBEAT_PORT (9000) | ||
138 | /** | ||
139 | * NE_IMAGE_LOAD_HEARTBEAT_VALUE - Heartbeat value for enclave image loading. | ||
140 | */ | ||
141 | #define NE_IMAGE_LOAD_HEARTBEAT_VALUE (0xb7) | ||
142 | |||
143 | /** | ||
144 | * struct ne_user_mem_region - User space memory region set for an enclave. | ||
145 | * @userspace_addr: Address of the user space memory region. | ||
146 | * @memory_size: Size of the user space memory region. | ||
147 | */ | ||
148 | struct ne_user_mem_region { | ||
149 | void *userspace_addr; | ||
150 | size_t memory_size; | ||
151 | }; | ||
152 | |||
153 | /** | ||
154 | * ne_create_vm() - Create a slot for the enclave VM. | ||
155 | * @ne_dev_fd: The file descriptor of the NE misc device. | ||
156 | * @slot_uid: The generated slot uid for the enclave. | ||
157 | * @enclave_fd : The generated file descriptor for the enclave. | ||
158 | * | ||
159 | * Context: Process context. | ||
160 | * Return: | ||
161 | * * 0 on success. | ||
162 | * * Negative return value on failure. | ||
163 | */ | ||
164 | static int ne_create_vm(int ne_dev_fd, unsigned long *slot_uid, int *enclave_fd) | ||
165 | { | ||
166 | int rc = -EINVAL; | ||
167 | *enclave_fd = ioctl(ne_dev_fd, NE_CREATE_VM, slot_uid); | ||
168 | |||
169 | if (*enclave_fd < 0) { | ||
170 | rc = *enclave_fd; | ||
171 | switch (errno) { | ||
172 | case NE_ERR_NO_CPUS_AVAIL_IN_POOL: { | ||
173 | printf("Error in create VM, no CPUs available in the NE CPU pool\n"); | ||
174 | |||
175 | break; | ||
176 | } | ||
177 | |||
178 | default: | ||
179 | printf("Error in create VM [%m]\n"); | ||
180 | } | ||
181 | |||
182 | return rc; | ||
183 | } | ||
184 | |||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | |||
189 | /** | ||
190 | * ne_poll_enclave_fd() - Thread function for polling the enclave fd. | ||
191 | * @data: Argument provided for the polling function. | ||
192 | * | ||
193 | * Context: Process context. | ||
194 | * Return: | ||
195 | * * NULL on success / failure. | ||
196 | */ | ||
197 | void *ne_poll_enclave_fd(void *data) | ||
198 | { | ||
199 | int enclave_fd = *(int *)data; | ||
200 | struct pollfd fds[1] = {}; | ||
201 | int i = 0; | ||
202 | int rc = -EINVAL; | ||
203 | |||
204 | printf("Running from poll thread, enclave fd %d\n", enclave_fd); | ||
205 | |||
206 | fds[0].fd = enclave_fd; | ||
207 | fds[0].events = POLLIN | POLLERR | POLLHUP; | ||
208 | |||
209 | /* Keep on polling until the current process is terminated. */ | ||
210 | while (1) { | ||
211 | printf("[iter %d] Polling ...\n", i); | ||
212 | |||
213 | rc = poll(fds, 1, NE_POLL_WAIT_TIME_MS); | ||
214 | if (rc < 0) { | ||
215 | printf("Error in poll [%m]\n"); | ||
216 | |||
217 | return NULL; | ||
218 | } | ||
219 | |||
220 | i++; | ||
221 | |||
222 | if (!rc) { | ||
223 | printf("Poll: %d seconds elapsed\n", | ||
224 | i * NE_POLL_WAIT_TIME); | ||
225 | |||
226 | continue; | ||
227 | } | ||
228 | |||
229 | printf("Poll received value 0x%x\n", fds[0].revents); | ||
230 | |||
231 | if (fds[0].revents & POLLHUP) { | ||
232 | printf("Received POLLHUP\n"); | ||
233 | |||
234 | return NULL; | ||
235 | } | ||
236 | |||
237 | if (fds[0].revents & POLLNVAL) { | ||
238 | printf("Received POLLNVAL\n"); | ||
239 | |||
240 | return NULL; | ||
241 | } | ||
242 | } | ||
243 | |||
244 | return NULL; | ||
245 | } | ||
246 | |||
247 | /** | ||
248 | * ne_alloc_user_mem_region() - Allocate a user space memory region for an enclave. | ||
249 | * @ne_user_mem_region: User space memory region allocated using hugetlbfs. | ||
250 | * | ||
251 | * Context: Process context. | ||
252 | * Return: | ||
253 | * * 0 on success. | ||
254 | * * Negative return value on failure. | ||
255 | */ | ||
256 | static int ne_alloc_user_mem_region(struct ne_user_mem_region *ne_user_mem_region) | ||
257 | { | ||
258 | /** | ||
259 | * Check available hugetlb encodings for different huge page sizes in | ||
260 | * include/uapi/linux/mman.h. | ||
261 | */ | ||
262 | ne_user_mem_region->userspace_addr = mmap(NULL, ne_user_mem_region->memory_size, | ||
263 | PROT_READ | PROT_WRITE, | ||
264 | MAP_PRIVATE | MAP_ANONYMOUS | | ||
265 | MAP_HUGETLB | MAP_HUGE_2MB, -1, 0); | ||
266 | if (ne_user_mem_region->userspace_addr == MAP_FAILED) { | ||
267 | printf("Error in mmap memory [%m]\n"); | ||
268 | |||
269 | return -1; | ||
270 | } | ||
271 | |||
272 | return 0; | ||
273 | } | ||
274 | |||
275 | /** | ||
276 | * ne_load_enclave_image() - Place the enclave image in the enclave memory. | ||
277 | * @enclave_fd : The file descriptor associated with the enclave. | ||
278 | * @ne_user_mem_regions: User space memory regions allocated for the enclave. | ||
279 | * @enclave_image_path : The file path of the enclave image. | ||
280 | * | ||
281 | * Context: Process context. | ||
282 | * Return: | ||
283 | * * 0 on success. | ||
284 | * * Negative return value on failure. | ||
285 | */ | ||
286 | static int ne_load_enclave_image(int enclave_fd, struct ne_user_mem_region ne_user_mem_regions[], | ||
287 | char *enclave_image_path) | ||
288 | { | ||
289 | unsigned char *enclave_image = NULL; | ||
290 | int enclave_image_fd = -1; | ||
291 | size_t enclave_image_size = 0; | ||
292 | size_t enclave_memory_size = 0; | ||
293 | unsigned long i = 0; | ||
294 | size_t image_written_bytes = 0; | ||
295 | struct ne_image_load_info image_load_info = { | ||
296 | .flags = NE_EIF_IMAGE, | ||
297 | }; | ||
298 | struct stat image_stat_buf = {}; | ||
299 | int rc = -EINVAL; | ||
300 | size_t temp_image_offset = 0; | ||
301 | |||
302 | for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) | ||
303 | enclave_memory_size += ne_user_mem_regions[i].memory_size; | ||
304 | |||
305 | rc = stat(enclave_image_path, &image_stat_buf); | ||
306 | if (rc < 0) { | ||
307 | printf("Error in get image stat info [%m]\n"); | ||
308 | |||
309 | return rc; | ||
310 | } | ||
311 | |||
312 | enclave_image_size = image_stat_buf.st_size; | ||
313 | |||
314 | if (enclave_memory_size < enclave_image_size) { | ||
315 | printf("The enclave memory is smaller than the enclave image size\n"); | ||
316 | |||
317 | return -ENOMEM; | ||
318 | } | ||
319 | |||
320 | rc = ioctl(enclave_fd, NE_GET_IMAGE_LOAD_INFO, &image_load_info); | ||
321 | if (rc < 0) { | ||
322 | switch (errno) { | ||
323 | case NE_ERR_NOT_IN_INIT_STATE: { | ||
324 | printf("Error in get image load info, enclave not in init state\n"); | ||
325 | |||
326 | break; | ||
327 | } | ||
328 | |||
329 | case NE_ERR_INVALID_FLAG_VALUE: { | ||
330 | printf("Error in get image load info, provided invalid flag\n"); | ||
331 | |||
332 | break; | ||
333 | } | ||
334 | |||
335 | default: | ||
336 | printf("Error in get image load info [%m]\n"); | ||
337 | } | ||
338 | |||
339 | return rc; | ||
340 | } | ||
341 | |||
342 | printf("Enclave image offset in enclave memory is %lld\n", | ||
343 | image_load_info.memory_offset); | ||
344 | |||
345 | enclave_image_fd = open(enclave_image_path, O_RDONLY); | ||
346 | if (enclave_image_fd < 0) { | ||
347 | printf("Error in open enclave image file [%m]\n"); | ||
348 | |||
349 | return enclave_image_fd; | ||
350 | } | ||
351 | |||
352 | enclave_image = mmap(NULL, enclave_image_size, PROT_READ, | ||
353 | MAP_PRIVATE, enclave_image_fd, 0); | ||
354 | if (enclave_image == MAP_FAILED) { | ||
355 | printf("Error in mmap enclave image [%m]\n"); | ||
356 | |||
357 | return -1; | ||
358 | } | ||
359 | |||
360 | temp_image_offset = image_load_info.memory_offset; | ||
361 | |||
362 | for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) { | ||
363 | size_t bytes_to_write = 0; | ||
364 | size_t memory_offset = 0; | ||
365 | size_t memory_size = ne_user_mem_regions[i].memory_size; | ||
366 | size_t remaining_bytes = 0; | ||
367 | void *userspace_addr = ne_user_mem_regions[i].userspace_addr; | ||
368 | |||
369 | if (temp_image_offset >= memory_size) { | ||
370 | temp_image_offset -= memory_size; | ||
371 | |||
372 | continue; | ||
373 | } else if (temp_image_offset != 0) { | ||
374 | memory_offset = temp_image_offset; | ||
375 | memory_size -= temp_image_offset; | ||
376 | temp_image_offset = 0; | ||
377 | } | ||
378 | |||
379 | remaining_bytes = enclave_image_size - image_written_bytes; | ||
380 | bytes_to_write = memory_size < remaining_bytes ? | ||
381 | memory_size : remaining_bytes; | ||
382 | |||
383 | memcpy(userspace_addr + memory_offset, | ||
384 | enclave_image + image_written_bytes, bytes_to_write); | ||
385 | |||
386 | image_written_bytes += bytes_to_write; | ||
387 | |||
388 | if (image_written_bytes == enclave_image_size) | ||
389 | break; | ||
390 | } | ||
391 | |||
392 | munmap(enclave_image, enclave_image_size); | ||
393 | |||
394 | close(enclave_image_fd); | ||
395 | |||
396 | return 0; | ||
397 | } | ||
398 | |||
399 | /** | ||
400 | * ne_set_user_mem_region() - Set a user space memory region for the given enclave. | ||
401 | * @enclave_fd : The file descriptor associated with the enclave. | ||
402 | * @ne_user_mem_region : User space memory region to be set for the enclave. | ||
403 | * | ||
404 | * Context: Process context. | ||
405 | * Return: | ||
406 | * * 0 on success. | ||
407 | * * Negative return value on failure. | ||
408 | */ | ||
409 | static int ne_set_user_mem_region(int enclave_fd, struct ne_user_mem_region ne_user_mem_region) | ||
410 | { | ||
411 | struct ne_user_memory_region mem_region = { | ||
412 | .flags = NE_DEFAULT_MEMORY_REGION, | ||
413 | .memory_size = ne_user_mem_region.memory_size, | ||
414 | .userspace_addr = (__u64)ne_user_mem_region.userspace_addr, | ||
415 | }; | ||
416 | int rc = -EINVAL; | ||
417 | |||
418 | rc = ioctl(enclave_fd, NE_SET_USER_MEMORY_REGION, &mem_region); | ||
419 | if (rc < 0) { | ||
420 | switch (errno) { | ||
421 | case NE_ERR_NOT_IN_INIT_STATE: { | ||
422 | printf("Error in set user memory region, enclave not in init state\n"); | ||
423 | |||
424 | break; | ||
425 | } | ||
426 | |||
427 | case NE_ERR_INVALID_MEM_REGION_SIZE: { | ||
428 | printf("Error in set user memory region, mem size not multiple of 2 MiB\n"); | ||
429 | |||
430 | break; | ||
431 | } | ||
432 | |||
433 | case NE_ERR_INVALID_MEM_REGION_ADDR: { | ||
434 | printf("Error in set user memory region, invalid user space address\n"); | ||
435 | |||
436 | break; | ||
437 | } | ||
438 | |||
439 | case NE_ERR_UNALIGNED_MEM_REGION_ADDR: { | ||
440 | printf("Error in set user memory region, unaligned user space address\n"); | ||
441 | |||
442 | break; | ||
443 | } | ||
444 | |||
445 | case NE_ERR_MEM_REGION_ALREADY_USED: { | ||
446 | printf("Error in set user memory region, memory region already used\n"); | ||
447 | |||
448 | break; | ||
449 | } | ||
450 | |||
451 | case NE_ERR_MEM_NOT_HUGE_PAGE: { | ||
452 | printf("Error in set user memory region, not backed by huge pages\n"); | ||
453 | |||
454 | break; | ||
455 | } | ||
456 | |||
457 | case NE_ERR_MEM_DIFFERENT_NUMA_NODE: { | ||
458 | printf("Error in set user memory region, different NUMA node than CPUs\n"); | ||
459 | |||
460 | break; | ||
461 | } | ||
462 | |||
463 | case NE_ERR_MEM_MAX_REGIONS: { | ||
464 | printf("Error in set user memory region, max memory regions reached\n"); | ||
465 | |||
466 | break; | ||
467 | } | ||
468 | |||
469 | case NE_ERR_INVALID_PAGE_SIZE: { | ||
470 | printf("Error in set user memory region, has page not multiple of 2 MiB\n"); | ||
471 | |||
472 | break; | ||
473 | } | ||
474 | |||
475 | case NE_ERR_INVALID_FLAG_VALUE: { | ||
476 | printf("Error in set user memory region, provided invalid flag\n"); | ||
477 | |||
478 | break; | ||
479 | } | ||
480 | |||
481 | default: | ||
482 | printf("Error in set user memory region [%m]\n"); | ||
483 | } | ||
484 | |||
485 | return rc; | ||
486 | } | ||
487 | |||
488 | return 0; | ||
489 | } | ||
490 | |||
491 | /** | ||
492 | * ne_free_mem_regions() - Unmap all the user space memory regions that were set | ||
493 | * aside for the enclave. | ||
494 | * @ne_user_mem_regions: The user space memory regions associated with an enclave. | ||
495 | * | ||
496 | * Context: Process context. | ||
497 | */ | ||
498 | static void ne_free_mem_regions(struct ne_user_mem_region ne_user_mem_regions[]) | ||
499 | { | ||
500 | unsigned int i = 0; | ||
501 | |||
502 | for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) | ||
503 | munmap(ne_user_mem_regions[i].userspace_addr, | ||
504 | ne_user_mem_regions[i].memory_size); | ||
505 | } | ||
506 | |||
507 | /** | ||
508 | * ne_add_vcpu() - Add a vCPU to the given enclave. | ||
509 | * @enclave_fd : The file descriptor associated with the enclave. | ||
510 | * @vcpu_id: vCPU id to be set for the enclave, either provided or | ||
511 | * auto-generated (if provided vCPU id is 0). | ||
512 | * | ||
513 | * Context: Process context. | ||
514 | * Return: | ||
515 | * * 0 on success. | ||
516 | * * Negative return value on failure. | ||
517 | */ | ||
518 | static int ne_add_vcpu(int enclave_fd, unsigned int *vcpu_id) | ||
519 | { | ||
520 | int rc = -EINVAL; | ||
521 | |||
522 | rc = ioctl(enclave_fd, NE_ADD_VCPU, vcpu_id); | ||
523 | if (rc < 0) { | ||
524 | switch (errno) { | ||
525 | case NE_ERR_NO_CPUS_AVAIL_IN_POOL: { | ||
526 | printf("Error in add vcpu, no CPUs available in the NE CPU pool\n"); | ||
527 | |||
528 | break; | ||
529 | } | ||
530 | |||
531 | case NE_ERR_VCPU_ALREADY_USED: { | ||
532 | printf("Error in add vcpu, the provided vCPU is already used\n"); | ||
533 | |||
534 | break; | ||
535 | } | ||
536 | |||
537 | case NE_ERR_VCPU_NOT_IN_CPU_POOL: { | ||
538 | printf("Error in add vcpu, the provided vCPU is not in the NE CPU pool\n"); | ||
539 | |||
540 | break; | ||
541 | } | ||
542 | |||
543 | case NE_ERR_VCPU_INVALID_CPU_CORE: { | ||
544 | printf("Error in add vcpu, the core id of the provided vCPU is invalid\n"); | ||
545 | |||
546 | break; | ||
547 | } | ||
548 | |||
549 | case NE_ERR_NOT_IN_INIT_STATE: { | ||
550 | printf("Error in add vcpu, enclave not in init state\n"); | ||
551 | |||
552 | break; | ||
553 | } | ||
554 | |||
555 | case NE_ERR_INVALID_VCPU: { | ||
556 | printf("Error in add vcpu, the provided vCPU is out of avail CPUs range\n"); | ||
557 | |||
558 | break; | ||
559 | } | ||
560 | |||
561 | default: | ||
562 | printf("Error in add vcpu [%m]\n"); | ||
563 | |||
564 | } | ||
565 | return rc; | ||
566 | } | ||
567 | |||
568 | return 0; | ||
569 | } | ||
570 | |||
571 | /** | ||
572 | * ne_start_enclave() - Start the given enclave. | ||
573 | * @enclave_fd : The file descriptor associated with the enclave. | ||
574 | * @enclave_start_info : Enclave metadata used for starting e.g. vsock CID. | ||
575 | * | ||
576 | * Context: Process context. | ||
577 | * Return: | ||
578 | * * 0 on success. | ||
579 | * * Negative return value on failure. | ||
580 | */ | ||
581 | static int ne_start_enclave(int enclave_fd, struct ne_enclave_start_info *enclave_start_info) | ||
582 | { | ||
583 | int rc = -EINVAL; | ||
584 | |||
585 | rc = ioctl(enclave_fd, NE_START_ENCLAVE, enclave_start_info); | ||
586 | if (rc < 0) { | ||
587 | switch (errno) { | ||
588 | case NE_ERR_NOT_IN_INIT_STATE: { | ||
589 | printf("Error in start enclave, enclave not in init state\n"); | ||
590 | |||
591 | break; | ||
592 | } | ||
593 | |||
594 | case NE_ERR_NO_MEM_REGIONS_ADDED: { | ||
595 | printf("Error in start enclave, no memory regions have been added\n"); | ||
596 | |||
597 | break; | ||
598 | } | ||
599 | |||
600 | case NE_ERR_NO_VCPUS_ADDED: { | ||
601 | printf("Error in start enclave, no vCPUs have been added\n"); | ||
602 | |||
603 | break; | ||
604 | } | ||
605 | |||
606 | case NE_ERR_FULL_CORES_NOT_USED: { | ||
607 | printf("Error in start enclave, enclave has no full cores set\n"); | ||
608 | |||
609 | break; | ||
610 | } | ||
611 | |||
612 | case NE_ERR_ENCLAVE_MEM_MIN_SIZE: { | ||
613 | printf("Error in start enclave, enclave memory is less than min size\n"); | ||
614 | |||
615 | break; | ||
616 | } | ||
617 | |||
618 | case NE_ERR_INVALID_FLAG_VALUE: { | ||
619 | printf("Error in start enclave, provided invalid flag\n"); | ||
620 | |||
621 | break; | ||
622 | } | ||
623 | |||
624 | case NE_ERR_INVALID_ENCLAVE_CID: { | ||
625 | printf("Error in start enclave, provided invalid enclave CID\n"); | ||
626 | |||
627 | break; | ||
628 | } | ||
629 | |||
630 | default: | ||
631 | printf("Error in start enclave [%m]\n"); | ||
632 | } | ||
633 | |||
634 | return rc; | ||
635 | } | ||
636 | |||
637 | return 0; | ||
638 | } | ||
639 | |||
640 | /** | ||
641 | * ne_start_enclave_check_booted() - Start the enclave and wait for a heartbeat | ||
642 | * from it, on a newly created vsock channel, | ||
643 | * to check it has booted. | ||
644 | * @enclave_fd : The file descriptor associated with the enclave. | ||
645 | * | ||
646 | * Context: Process context. | ||
647 | * Return: | ||
648 | * * 0 on success. | ||
649 | * * Negative return value on failure. | ||
650 | */ | ||
651 | static int ne_start_enclave_check_booted(int enclave_fd) | ||
652 | { | ||
653 | struct sockaddr_vm client_vsock_addr = {}; | ||
654 | int client_vsock_fd = -1; | ||
655 | socklen_t client_vsock_len = sizeof(client_vsock_addr); | ||
656 | struct ne_enclave_start_info enclave_start_info = {}; | ||
657 | struct pollfd fds[1] = {}; | ||
658 | int rc = -EINVAL; | ||
659 | unsigned char recv_buf = 0; | ||
660 | struct sockaddr_vm server_vsock_addr = { | ||
661 | .svm_family = AF_VSOCK, | ||
662 | .svm_cid = NE_IMAGE_LOAD_HEARTBEAT_CID, | ||
663 | .svm_port = NE_IMAGE_LOAD_HEARTBEAT_PORT, | ||
664 | }; | ||
665 | int server_vsock_fd = -1; | ||
666 | |||
667 | server_vsock_fd = socket(AF_VSOCK, SOCK_STREAM, 0); | ||
668 | if (server_vsock_fd < 0) { | ||
669 | rc = server_vsock_fd; | ||
670 | |||
671 | printf("Error in socket [%m]\n"); | ||
672 | |||
673 | return rc; | ||
674 | } | ||
675 | |||
676 | rc = bind(server_vsock_fd, (struct sockaddr *)&server_vsock_addr, | ||
677 | sizeof(server_vsock_addr)); | ||
678 | if (rc < 0) { | ||
679 | printf("Error in bind [%m]\n"); | ||
680 | |||
681 | goto out; | ||
682 | } | ||
683 | |||
684 | rc = listen(server_vsock_fd, 1); | ||
685 | if (rc < 0) { | ||
686 | printf("Error in listen [%m]\n"); | ||
687 | |||
688 | goto out; | ||
689 | } | ||
690 | |||
691 | rc = ne_start_enclave(enclave_fd, &enclave_start_info); | ||
692 | if (rc < 0) | ||
693 | goto out; | ||
694 | |||
695 | printf("Enclave started, CID %llu\n", enclave_start_info.enclave_cid); | ||
696 | |||
697 | fds[0].fd = server_vsock_fd; | ||
698 | fds[0].events = POLLIN; | ||
699 | |||
700 | rc = poll(fds, 1, NE_POLL_WAIT_TIME_MS); | ||
701 | if (rc < 0) { | ||
702 | printf("Error in poll [%m]\n"); | ||
703 | |||
704 | goto out; | ||
705 | } | ||
706 | |||
707 | if (!rc) { | ||
708 | printf("Poll timeout, %d seconds elapsed\n", NE_POLL_WAIT_TIME); | ||
709 | |||
710 | rc = -ETIMEDOUT; | ||
711 | |||
712 | goto out; | ||
713 | } | ||
714 | |||
715 | if ((fds[0].revents & POLLIN) == 0) { | ||
716 | printf("Poll received value %d\n", fds[0].revents); | ||
717 | |||
718 | rc = -EINVAL; | ||
719 | |||
720 | goto out; | ||
721 | } | ||
722 | |||
723 | rc = accept(server_vsock_fd, (struct sockaddr *)&client_vsock_addr, | ||
724 | &client_vsock_len); | ||
725 | if (rc < 0) { | ||
726 | printf("Error in accept [%m]\n"); | ||
727 | |||
728 | goto out; | ||
729 | } | ||
730 | |||
731 | client_vsock_fd = rc; | ||
732 | |||
733 | /* | ||
734 | * Read the heartbeat value that the init process in the enclave sends | ||
735 | * after vsock connect. | ||
736 | */ | ||
737 | rc = read(client_vsock_fd, &recv_buf, sizeof(recv_buf)); | ||
738 | if (rc < 0) { | ||
739 | printf("Error in read [%m]\n"); | ||
740 | |||
741 | goto out; | ||
742 | } | ||
743 | |||
744 | if (rc != sizeof(recv_buf) || recv_buf != NE_IMAGE_LOAD_HEARTBEAT_VALUE) { | ||
745 | printf("Read %d instead of %d\n", recv_buf, | ||
746 | NE_IMAGE_LOAD_HEARTBEAT_VALUE); | ||
747 | |||
748 | goto out; | ||
749 | } | ||
750 | |||
751 | /* Write the heartbeat value back. */ | ||
752 | rc = write(client_vsock_fd, &recv_buf, sizeof(recv_buf)); | ||
753 | if (rc < 0) { | ||
754 | printf("Error in write [%m]\n"); | ||
755 | |||
756 | goto out; | ||
757 | } | ||
758 | |||
759 | rc = 0; | ||
760 | |||
761 | out: | ||
762 | close(server_vsock_fd); | ||
763 | |||
764 | return rc; | ||
765 | } | ||
766 | |||
767 | int main(int argc, char *argv[]) | ||
768 | { | ||
769 | int enclave_fd = -1; | ||
770 | unsigned int i = 0; | ||
771 | int ne_dev_fd = -1; | ||
772 | struct ne_user_mem_region ne_user_mem_regions[NE_DEFAULT_NR_MEM_REGIONS] = {}; | ||
773 | unsigned int ne_vcpus[NE_DEFAULT_NR_VCPUS] = {}; | ||
774 | int rc = -EINVAL; | ||
775 | pthread_t thread_id = 0; | ||
776 | unsigned long slot_uid = 0; | ||
777 | |||
778 | if (argc != 2) { | ||
779 | printf("Usage: %s <path_to_enclave_image>\n", argv[0]); | ||
780 | |||
781 | exit(EXIT_FAILURE); | ||
782 | } | ||
783 | |||
784 | if (strlen(argv[1]) >= PATH_MAX) { | ||
785 | printf("The size of the path to enclave image is higher than max path\n"); | ||
786 | |||
787 | exit(EXIT_FAILURE); | ||
788 | } | ||
789 | |||
790 | ne_dev_fd = open(NE_DEV_NAME, O_RDWR | O_CLOEXEC); | ||
791 | if (ne_dev_fd < 0) { | ||
792 | printf("Error in open NE device [%m]\n"); | ||
793 | |||
794 | exit(EXIT_FAILURE); | ||
795 | } | ||
796 | |||
797 | printf("Creating enclave slot ...\n"); | ||
798 | |||
799 | rc = ne_create_vm(ne_dev_fd, &slot_uid, &enclave_fd); | ||
800 | |||
801 | close(ne_dev_fd); | ||
802 | |||
803 | if (rc < 0) | ||
804 | exit(EXIT_FAILURE); | ||
805 | |||
806 | printf("Enclave fd %d\n", enclave_fd); | ||
807 | |||
808 | rc = pthread_create(&thread_id, NULL, ne_poll_enclave_fd, (void *)&enclave_fd); | ||
809 | if (rc < 0) { | ||
810 | printf("Error in thread create [%m]\n"); | ||
811 | |||
812 | close(enclave_fd); | ||
813 | |||
814 | exit(EXIT_FAILURE); | ||
815 | } | ||
816 | |||
817 | for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) { | ||
818 | ne_user_mem_regions[i].memory_size = NE_MIN_MEM_REGION_SIZE; | ||
819 | |||
820 | rc = ne_alloc_user_mem_region(&ne_user_mem_regions[i]); | ||
821 | if (rc < 0) { | ||
822 | printf("Error in alloc userspace memory region, iter %d\n", i); | ||
823 | |||
824 | goto release_enclave_fd; | ||
825 | } | ||
826 | } | ||
827 | |||
828 | rc = ne_load_enclave_image(enclave_fd, ne_user_mem_regions, argv[1]); | ||
829 | if (rc < 0) | ||
830 | goto release_enclave_fd; | ||
831 | |||
832 | for (i = 0; i < NE_DEFAULT_NR_MEM_REGIONS; i++) { | ||
833 | rc = ne_set_user_mem_region(enclave_fd, ne_user_mem_regions[i]); | ||
834 | if (rc < 0) { | ||
835 | printf("Error in set memory region, iter %d\n", i); | ||
836 | |||
837 | goto release_enclave_fd; | ||
838 | } | ||
839 | } | ||
840 | |||
841 | printf("Enclave memory regions were added\n"); | ||
842 | |||
843 | for (i = 0; i < NE_DEFAULT_NR_VCPUS; i++) { | ||
844 | /* | ||
845 | * The vCPU is chosen from the enclave vCPU pool, if the value | ||
846 | * of the vcpu_id is 0. | ||
847 | */ | ||
848 | ne_vcpus[i] = 0; | ||
849 | rc = ne_add_vcpu(enclave_fd, &ne_vcpus[i]); | ||
850 | if (rc < 0) { | ||
851 | printf("Error in add vcpu, iter %d\n", i); | ||
852 | |||
853 | goto release_enclave_fd; | ||
854 | } | ||
855 | |||
856 | printf("Added vCPU %d to the enclave\n", ne_vcpus[i]); | ||
857 | } | ||
858 | |||
859 | printf("Enclave vCPUs were added\n"); | ||
860 | |||
861 | rc = ne_start_enclave_check_booted(enclave_fd); | ||
862 | if (rc < 0) { | ||
863 | printf("Error in the enclave start / image loading heartbeat logic [rc=%d]\n", rc); | ||
864 | |||
865 | goto release_enclave_fd; | ||
866 | } | ||
867 | |||
868 | printf("Entering sleep for %d seconds ...\n", NE_SLEEP_TIME); | ||
869 | |||
870 | sleep(NE_SLEEP_TIME); | ||
871 | |||
872 | close(enclave_fd); | ||
873 | |||
874 | ne_free_mem_regions(ne_user_mem_regions); | ||
875 | |||
876 | exit(EXIT_SUCCESS); | ||
877 | |||
878 | release_enclave_fd: | ||
879 | close(enclave_fd); | ||
880 | ne_free_mem_regions(ne_user_mem_regions); | ||
881 | |||
882 | exit(EXIT_FAILURE); | ||
883 | } | ||
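The parent-side heartbeat above binds an AF_VSOCK listener on CID 3, port 9000 and waits for the byte 0xb7 from the enclave's init process. For orientation, a hedged sketch of the enclave-side counterpart is shown below; the real init process ships inside the enclave image and may differ, so only the CID, port and value (taken from the NE_IMAGE_LOAD_HEARTBEAT_* constants above) are grounded in this sample.

	/* Sketch of the in-enclave heartbeat sender (not part of this sample). */
	#include <sys/socket.h>
	#include <unistd.h>
	#include <linux/vm_sockets.h>

	static int send_image_load_heartbeat(void)
	{
		struct sockaddr_vm addr = {
			.svm_family = AF_VSOCK,
			.svm_cid = 3,		/* NE_IMAGE_LOAD_HEARTBEAT_CID */
			.svm_port = 9000,	/* NE_IMAGE_LOAD_HEARTBEAT_PORT */
		};
		unsigned char value = 0xb7;	/* NE_IMAGE_LOAD_HEARTBEAT_VALUE */
		int fd;

		fd = socket(AF_VSOCK, SOCK_STREAM, 0);
		if (fd < 0)
			return -1;

		/* Connect to the parent, send the heartbeat, wait for the echo. */
		if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
		    write(fd, &value, sizeof(value)) != sizeof(value) ||
		    read(fd, &value, sizeof(value)) != sizeof(value)) {
			close(fd);
			return -1;
		}

		close(fd);
		return 0;
	}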
diff --git a/samples/pidfd/.gitignore b/samples/pidfd/.gitignore new file mode 100644 index 000000000..eea857fca --- /dev/null +++ b/samples/pidfd/.gitignore | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | pidfd-metadata | ||
diff --git a/samples/pidfd/Makefile b/samples/pidfd/Makefile new file mode 100644 index 000000000..9754e2d81 --- /dev/null +++ b/samples/pidfd/Makefile | |||
@@ -0,0 +1,4 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | ||
2 | userprogs-always-y += pidfd-metadata | ||
3 | |||
4 | userccflags += -I usr/include | ||
diff --git a/samples/pidfd/pidfd-metadata.c b/samples/pidfd/pidfd-metadata.c new file mode 100644 index 000000000..c459155da --- /dev/null +++ b/samples/pidfd/pidfd-metadata.c | |||
@@ -0,0 +1,120 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | #define _GNU_SOURCE | ||
4 | #include <err.h> | ||
5 | #include <errno.h> | ||
6 | #include <fcntl.h> | ||
7 | #include <inttypes.h> | ||
8 | #include <limits.h> | ||
9 | #include <sched.h> | ||
10 | #include <signal.h> | ||
11 | #include <stdio.h> | ||
12 | #include <stdlib.h> | ||
13 | #include <string.h> | ||
14 | #include <sys/stat.h> | ||
15 | #include <sys/syscall.h> | ||
16 | #include <sys/types.h> | ||
17 | #include <sys/wait.h> | ||
18 | #include <unistd.h> | ||
19 | |||
20 | #ifndef CLONE_PIDFD | ||
21 | #define CLONE_PIDFD 0x00001000 | ||
22 | #endif | ||
23 | |||
24 | #ifndef __NR_pidfd_send_signal | ||
25 | #define __NR_pidfd_send_signal -1 | ||
26 | #endif | ||
27 | |||
28 | static int do_child(void *args) | ||
29 | { | ||
30 | printf("%d\n", getpid()); | ||
31 | _exit(EXIT_SUCCESS); | ||
32 | } | ||
33 | |||
34 | static pid_t pidfd_clone(int flags, int *pidfd) | ||
35 | { | ||
36 | size_t stack_size = 1024; | ||
37 | char *stack[1024] = { 0 }; | ||
38 | |||
39 | #ifdef __ia64__ | ||
40 | return __clone2(do_child, stack, stack_size, flags | SIGCHLD, NULL, pidfd); | ||
41 | #else | ||
42 | return clone(do_child, stack + stack_size, flags | SIGCHLD, NULL, pidfd); | ||
43 | #endif | ||
44 | } | ||
45 | |||
46 | static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, | ||
47 | unsigned int flags) | ||
48 | { | ||
49 | return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); | ||
50 | } | ||
51 | |||
52 | static int pidfd_metadata_fd(pid_t pid, int pidfd) | ||
53 | { | ||
54 | int procfd, ret; | ||
55 | char path[100]; | ||
56 | |||
57 | snprintf(path, sizeof(path), "/proc/%d", pid); | ||
58 | procfd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC); | ||
59 | if (procfd < 0) { | ||
60 | warn("Failed to open %s\n", path); | ||
61 | return -1; | ||
62 | } | ||
63 | |||
64 | /* | ||
65 | * Verify that the pid has not been recycled and our /proc/<pid> handle | ||
66 | * is still valid. | ||
67 | */ | ||
68 | ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0); | ||
69 | if (ret < 0) { | ||
70 | switch (errno) { | ||
71 | case EPERM: | ||
72 | /* Process exists, just not allowed to signal it. */ | ||
73 | break; | ||
74 | default: | ||
75 | warn("Failed to signal process\n"); | ||
76 | close(procfd); | ||
77 | procfd = -1; | ||
78 | } | ||
79 | } | ||
80 | |||
81 | return procfd; | ||
82 | } | ||
83 | |||
84 | int main(int argc, char *argv[]) | ||
85 | { | ||
86 | int pidfd = -1, ret = EXIT_FAILURE; | ||
87 | char buf[4096] = { 0 }; | ||
88 | pid_t pid; | ||
89 | int procfd, statusfd; | ||
90 | ssize_t bytes; | ||
91 | |||
92 | pid = pidfd_clone(CLONE_PIDFD, &pidfd); | ||
93 | if (pid < 0) | ||
94 | err(ret, "CLONE_PIDFD"); | ||
95 | if (pidfd == -1) { | ||
96 | warnx("CLONE_PIDFD is not supported by the kernel"); | ||
97 | goto out; | ||
98 | } | ||
99 | |||
100 | procfd = pidfd_metadata_fd(pid, pidfd); | ||
101 | close(pidfd); | ||
102 | if (procfd < 0) | ||
103 | goto out; | ||
104 | |||
105 | statusfd = openat(procfd, "status", O_RDONLY | O_CLOEXEC); | ||
106 | close(procfd); | ||
107 | if (statusfd < 0) | ||
108 | goto out; | ||
109 | |||
110 | bytes = read(statusfd, buf, sizeof(buf)); | ||
111 | if (bytes > 0) | ||
112 | bytes = write(STDOUT_FILENO, buf, bytes); | ||
113 | close(statusfd); | ||
114 | ret = EXIT_SUCCESS; | ||
115 | |||
116 | out: | ||
117 | (void)wait(NULL); | ||
118 | |||
119 | exit(ret); | ||
120 | } | ||
diff --git a/samples/pktgen/README.rst b/samples/pktgen/README.rst new file mode 100644 index 000000000..f9c53ca5c --- /dev/null +++ b/samples/pktgen/README.rst | |||
@@ -0,0 +1,46 @@ | |||
1 | Sample and benchmark scripts for pktgen (packet generator) | ||
2 | ========================================================== | ||
3 | This directory contains some pktgen sample and benchmark scripts that | ||
4 | can easily be copied and adjusted for your own use case. | ||
5 | |||
6 | General doc is located in kernel: Documentation/networking/pktgen.rst | ||
7 | |||
8 | Helper include files | ||
9 | ==================== | ||
10 | This directory contains two helper shell files that can be "included" | ||
11 | by shell source'ing, namely "functions.sh" and "parameters.sh". | ||
12 | |||
13 | Common parameters | ||
14 | ----------------- | ||
15 | The parameters.sh file supports easy and consistent parameter parsing | ||
16 | across the sample scripts. The following usage example is printed on errors:: | ||
17 | |||
18 | Usage: ./pktgen_sample01_simple.sh [-vx] -i ethX | ||
19 | -i : ($DEV) output interface/device (required) | ||
20 | -s : ($PKT_SIZE) packet size | ||
21 | -d : ($DEST_IP) destination IP. CIDR (e.g. 198.18.0.0/15) is also allowed | ||
22 | -m : ($DST_MAC) destination MAC-addr | ||
23 | -p : ($DST_PORT) destination PORT range (e.g. 433-444) is also allowed | ||
24 | -t : ($THREADS) threads to start | ||
25 | -f : ($F_THREAD) index of first thread (zero indexed CPU number) | ||
26 | -c : ($CLONE_SKB) SKB clones sent before allocating a new SKB | ||
27 | -n : ($COUNT) num messages to send per thread, 0 means indefinitely | ||
28 | -b : ($BURST) HW level bursting of SKBs | ||
29 | -v : ($VERBOSE) verbose | ||
30 | -x : ($DEBUG) debug | ||
31 | |||
32 | The global variable set by each option is also listed, e.g. the required | ||
33 | interface/device parameter "-i" sets the variable $DEV. | ||
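For example, a hypothetical invocation of the simple sample script
(interface, MAC and IP below are just placeholders) could look like::

 # Illustrative only; adjust -i/-m/-d to your setup
 ./pktgen_sample01_simple.sh -i eth0 -m 90:e2:ba:ff:ff:ff \
     -d 198.18.0.42 -s 64 -n 100000 -v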
34 | |||
35 | Common functions | ||
36 | ---------------- | ||
37 | The functions.sh file provides three different shell functions for | ||
38 | configuring the different components of pktgen: pg_ctrl(), pg_thread() | ||
39 | and pg_set(). | ||
40 | |||
41 | These functions correspond to pktgen's different components. | ||
42 | * pg_ctrl() control "pgctrl" (/proc/net/pktgen/pgctrl) | ||
43 | * pg_thread() control the kernel threads and binding to devices | ||
44 | * pg_set() control setup of individual devices | ||
45 | |||
46 | See sample scripts for usage examples. | ||
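As a minimal sketch of using these helpers directly (assuming the pktgen
module is already loaded, the shell runs as root, and eth0 is the output
device), a script could do::

 source ./functions.sh

 pg_ctrl "reset"                     # clean up state from a previous run
 pg_thread 0 "rem_device_all"        # detach all devices from kernel thread 0
 pg_thread 0 "add_device" eth0       # bind eth0 to thread 0
 pg_set eth0 "count 100000"          # packets to send (0 = indefinitely)
 pg_set eth0 "pkt_size 64"
 pg_set eth0 "dst_mac 90:e2:ba:ff:ff:ff"
 pg_set eth0 "dst_min 198.18.0.42"
 pg_ctrl "start"                     # blocks until all packets are sent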
diff --git a/samples/pktgen/functions.sh b/samples/pktgen/functions.sh new file mode 100644 index 000000000..dae06d5b3 --- /dev/null +++ b/samples/pktgen/functions.sh | |||
@@ -0,0 +1,334 @@ | |||
1 | # | ||
2 | # Common functions used by pktgen scripts | ||
3 | # - Depending on bash 3 (or higher) syntax | ||
4 | # | ||
5 | # Author: Jesper Dangaard Brouer | ||
6 | # License: GPL | ||
7 | |||
8 | set -o errexit | ||
9 | |||
10 | ## -- General shell logging cmds -- | ||
11 | function err() { | ||
12 | local exitcode=$1 | ||
13 | shift | ||
14 | echo "ERROR: $@" >&2 | ||
15 | exit $exitcode | ||
16 | } | ||
17 | |||
18 | function warn() { | ||
19 | echo "WARN : $@" >&2 | ||
20 | } | ||
21 | |||
22 | function info() { | ||
23 | if [[ -n "$VERBOSE" ]]; then | ||
24 | echo "INFO : $@" >&2 | ||
25 | fi | ||
26 | } | ||
27 | |||
28 | ## -- Pktgen proc config commands -- ## | ||
29 | export PROC_DIR=/proc/net/pktgen | ||
30 | # | ||
31 | # Three different shell functions for configuring the different | ||
32 | # components of pktgen: | ||
33 | # pg_ctrl(), pg_thread() and pg_set(). | ||
34 | # | ||
35 | # These functions correspond to pktgens different components. | ||
36 | # * pg_ctrl() control "pgctrl" (/proc/net/pktgen/pgctrl) | ||
37 | # * pg_thread() control the kernel threads and binding to devices | ||
38 | # * pg_set() control setup of individual devices | ||
39 | function pg_ctrl() { | ||
40 | local proc_file="pgctrl" | ||
41 | proc_cmd ${proc_file} "$@" | ||
42 | } | ||
43 | |||
44 | function pg_thread() { | ||
45 | local thread=$1 | ||
46 | local proc_file="kpktgend_${thread}" | ||
47 | shift | ||
48 | proc_cmd ${proc_file} "$@" | ||
49 | } | ||
50 | |||
51 | function pg_set() { | ||
52 | local dev=$1 | ||
53 | local proc_file="$dev" | ||
54 | shift | ||
55 | proc_cmd ${proc_file} "$@" | ||
56 | } | ||
57 | |||
58 | # More generic replacement for pgset(), that does not depend on global | ||
59 | # variable for proc file. | ||
60 | function proc_cmd() { | ||
61 | local result | ||
62 | local proc_file=$1 | ||
63 | local status=0 | ||
64 | # after shift, the remaining args are contained in $@ | ||
65 | shift | ||
66 | local proc_ctrl=${PROC_DIR}/$proc_file | ||
67 | if [[ ! -e "$proc_ctrl" ]]; then | ||
68 | err 3 "proc file:$proc_ctrl does not exist (dev added to thread?)" | ||
69 | else | ||
70 | if [[ ! -w "$proc_ctrl" ]]; then | ||
71 | err 4 "proc file:$proc_ctrl not writable, not root?!" | ||
72 | fi | ||
73 | fi | ||
74 | |||
75 | if [[ "$DEBUG" == "yes" ]]; then | ||
76 | echo "cmd: $@ > $proc_ctrl" | ||
77 | fi | ||
78 | # Quoting of "$@" is important for space expansion | ||
79 | echo "$@" > "$proc_ctrl" || status=$? | ||
80 | |||
81 | if [[ "$proc_file" != "pgctrl" ]]; then | ||
82 | result=$(grep "Result: OK:" $proc_ctrl) || true | ||
83 | if [[ "$result" == "" ]]; then | ||
84 | grep "Result:" $proc_ctrl >&2 | ||
85 | fi | ||
86 | fi | ||
87 | if (( $status != 0 )); then | ||
88 | err 5 "Write error($status) occurred cmd: \"$@ > $proc_ctrl\"" | ||
89 | fi | ||
90 | } | ||
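# Illustrative example (assuming eth0 was already added to a pktgen thread):
#   pg_set eth0 "pkt_size 64"
# effectively does: echo "pkt_size 64" > /proc/net/pktgen/eth0, and then
# checks that the proc file reports a "Result: OK:" line.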
91 | |||
92 | # Old obsolete "pgset" function, with slightly improved err handling | ||
93 | function pgset() { | ||
94 | local result | ||
95 | |||
96 | if [[ "$DEBUG" == "yes" ]]; then | ||
97 | echo "cmd: $1 > $PGDEV" | ||
98 | fi | ||
99 | echo $1 > $PGDEV | ||
100 | local status=$? | ||
101 | |||
102 | result=`cat $PGDEV | fgrep "Result: OK:"` | ||
103 | if [[ "$result" == "" ]]; then | ||
104 | cat $PGDEV | fgrep Result: | ||
105 | fi | ||
106 | if (( $status != 0 )); then | ||
107 | err 5 "Write error($status) occurred cmd: \"$1 > $PGDEV\"" | ||
108 | fi | ||
109 | } | ||
110 | |||
111 | [[ $EUID -eq 0 ]] && trap 'pg_ctrl "reset"' EXIT | ||
112 | |||
113 | ## -- General shell tricks -- | ||
114 | |||
115 | function root_check_run_with_sudo() { | ||
116 | # Trick so the program can be run as a normal user; it will just re-run | ||
117 | # itself via "sudo". Call as: root_check_run_with_sudo "$@" | ||
118 | if [ "$EUID" -ne 0 ]; then | ||
119 | if [ -x $0 ]; then # Directly executable, use sudo | ||
120 | info "Not root, running with sudo" | ||
121 | sudo "$0" "$@" | ||
122 | exit $? | ||
123 | fi | ||
124 | err 4 "cannot perform sudo run of $0" | ||
125 | fi | ||
126 | } | ||
127 | |||
128 | # Extract input device's NUMA node info | ||
129 | function get_iface_node() | ||
130 | { | ||
131 | local node=$(</sys/class/net/$1/device/numa_node) | ||
132 | if [[ $node == -1 ]]; then | ||
133 | echo 0 | ||
134 | else | ||
135 | echo $node | ||
136 | fi | ||
137 | } | ||
138 | |||
139 | # Given a dev/iface, get its queues' irq numbers | ||
140 | function get_iface_irqs() | ||
141 | { | ||
142 | local IFACE=$1 | ||
143 | local queues="${IFACE}-.*TxRx" | ||
144 | |||
145 | irqs=$(grep "$queues" /proc/interrupts | cut -f1 -d:) | ||
146 | [ -z "$irqs" ] && irqs=$(grep $IFACE /proc/interrupts | cut -f1 -d:) | ||
147 | [ -z "$irqs" ] && irqs=$(for i in `ls -Ux /sys/class/net/$IFACE/device/msi_irqs` ;\ | ||
148 | do grep "$i:.*TxRx" /proc/interrupts | grep -v fdir | cut -f 1 -d : ;\ | ||
149 | done) | ||
150 | [ -z "$irqs" ] && err 3 "Could not find interrupts for $IFACE" | ||
151 | |||
152 | echo $irqs | ||
153 | } | ||
154 | |||
155 | # Given a NUMA node, return cpu ids belonging to it. | ||
156 | function get_node_cpus() | ||
157 | { | ||
158 | local node=$1 | ||
159 | local node_cpu_list | ||
160 | local node_cpu_range_list=`cut -f1- -d, --output-delimiter=" " \ | ||
161 | /sys/devices/system/node/node$node/cpulist` | ||
162 | |||
163 | for cpu_range in $node_cpu_range_list | ||
164 | do | ||
165 | node_cpu_list="$node_cpu_list "`seq -s " " ${cpu_range//-/ }` | ||
166 | done | ||
167 | |||
168 | echo $node_cpu_list | ||
169 | } | ||
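# Illustrative example: for a node whose cpulist reads "0-3,8-11", this
# returns "0 1 2 3 8 9 10 11".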
170 | |||
171 | # Check $1 is in between $2, $3 ($2 <= $1 <= $3) | ||
172 | function in_between() { [[ ($1 -ge $2) && ($1 -le $3) ]] ; } | ||
173 | |||
174 | # Extend shrunken IPv6 address. | ||
175 | # fe80::42:bcff:fe84:e10a => fe80:0:0:0:42:bcff:fe84:e10a | ||
176 | function extend_addr6() | ||
177 | { | ||
178 | local addr=$1 | ||
179 | local sep=: sep2=:: | ||
180 | local sep_cnt=$(tr -cd $sep <<< $1 | wc -c) | ||
181 | local shrink | ||
182 | |||
183 | # separator count should be (2 <= $sep_cnt <= 7) | ||
184 | if ! (in_between $sep_cnt 2 7); then | ||
185 | err 5 "Invalid IP6 address: $1" | ||
186 | fi | ||
187 | |||
188 | # if the shrink '::' occurs multiple times, it's malformed. | ||
189 | shrink=( $(egrep -o "$sep{2,}" <<< $addr) ) | ||
190 | if [[ ${#shrink[@]} -ne 0 ]]; then | ||
191 | if [[ ${#shrink[@]} -gt 1 || ( ${shrink[0]} != $sep2 ) ]]; then | ||
192 | err 5 "Invalid IP6 address: $1" | ||
193 | fi | ||
194 | fi | ||
195 | |||
196 | # add 0 at begin & end, and extend addr by adding :0 | ||
197 | [[ ${addr:0:1} == $sep ]] && addr=0${addr} | ||
198 | [[ ${addr: -1} == $sep ]] && addr=${addr}0 | ||
199 | echo "${addr/$sep2/$(printf ':0%.s' $(seq $[8-sep_cnt])):}" | ||
200 | } | ||
201 | |||
202 | # Given a single IP(v4/v6) address, check whether it is valid. | ||
203 | function validate_addr() | ||
204 | { | ||
205 | # check function is called with (funcname)6 | ||
206 | [[ ${FUNCNAME[1]: -1} == 6 ]] && local IP6=6 | ||
207 | local bitlen=$[ IP6 ? 128 : 32 ] | ||
208 | local len=$[ IP6 ? 8 : 4 ] | ||
209 | local max=$[ 2**(len*2)-1 ] | ||
210 | local net prefix | ||
211 | local addr sep | ||
212 | |||
213 | IFS='/' read net prefix <<< $1 | ||
214 | [[ $IP6 ]] && net=$(extend_addr6 $net) | ||
215 | |||
216 | # if prefix exists, check (0 <= $prefix <= $bitlen) | ||
217 | if [[ -n $prefix ]]; then | ||
218 | if ! (in_between $prefix 0 $bitlen); then | ||
219 | err 5 "Invalid prefix: /$prefix" | ||
220 | fi | ||
221 | fi | ||
222 | |||
223 | # set separator for each IP(v4/v6) | ||
224 | [[ $IP6 ]] && sep=: || sep=. | ||
225 | IFS=$sep read -a addr <<< $net | ||
226 | |||
227 | # array length | ||
228 | if [[ ${#addr[@]} != $len ]]; then | ||
229 | err 5 "Invalid IP$IP6 address: $1" | ||
230 | fi | ||
231 | |||
232 | # check each digit (0 <= $digit <= $max) | ||
233 | for digit in "${addr[@]}"; do | ||
234 | [[ $IP6 ]] && digit=$[ 16#$digit ] | ||
235 | if ! (in_between $digit 0 $max); then | ||
236 | err 5 "Invalid IP$IP6 address: $1" | ||
237 | fi | ||
238 | done | ||
239 | |||
240 | return 0 | ||
241 | } | ||
242 | |||
243 | function validate_addr6() { validate_addr $@ ; } | ||
244 | |||
245 | # Given a single IP(v4/v6) or CIDR, return minimum and maximum IP addr. | ||
246 | function parse_addr() | ||
247 | { | ||
248 | # check function is called with (funcname)6 | ||
249 | [[ ${FUNCNAME[1]: -1} == 6 ]] && local IP6=6 | ||
250 | local net prefix | ||
251 | local min_ip max_ip | ||
252 | |||
253 | IFS='/' read net prefix <<< $1 | ||
254 | [[ $IP6 ]] && net=$(extend_addr6 $net) | ||
255 | |||
256 | if [[ -z $prefix ]]; then | ||
257 | min_ip=$net | ||
258 | max_ip=$net | ||
259 | else | ||
260 | # defining array for converting Decimal 2 Binary | ||
261 | # 00000000 00000001 00000010 00000011 00000100 ... | ||
262 | local d2b='{0..1}{0..1}{0..1}{0..1}{0..1}{0..1}{0..1}{0..1}' | ||
263 | [[ $IP6 ]] && d2b+=$d2b | ||
264 | eval local D2B=($d2b) | ||
265 | |||
266 | local bitlen=$[ IP6 ? 128 : 32 ] | ||
267 | local remain=$[ bitlen-prefix ] | ||
268 | local octet=$[ IP6 ? 16 : 8 ] | ||
269 | local min_mask max_mask | ||
270 | local min max | ||
271 | local ip_bit | ||
272 | local ip sep | ||
273 | |||
274 | # set separator for each IP(v4/v6) | ||
275 | [[ $IP6 ]] && sep=: || sep=. | ||
276 | IFS=$sep read -ra ip <<< $net | ||
277 | |||
278 | min_mask="$(printf '1%.s' $(seq $prefix))$(printf '0%.s' $(seq $remain))" | ||
279 | max_mask="$(printf '0%.s' $(seq $prefix))$(printf '1%.s' $(seq $remain))" | ||
280 | |||
281 | # calculate min/max ip with &,| operator | ||
282 | for i in "${!ip[@]}"; do | ||
283 | digit=$[ IP6 ? 16#${ip[$i]} : ${ip[$i]} ] | ||
284 | ip_bit=${D2B[$digit]} | ||
285 | |||
286 | idx=$[ octet*i ] | ||
287 | min[$i]=$[ 2#$ip_bit & 2#${min_mask:$idx:$octet} ] | ||
288 | max[$i]=$[ 2#$ip_bit | 2#${max_mask:$idx:$octet} ] | ||
289 | [[ $IP6 ]] && { min[$i]=$(printf '%X' ${min[$i]}); | ||
290 | max[$i]=$(printf '%X' ${max[$i]}); } | ||
291 | done | ||
292 | |||
293 | min_ip=$(IFS=$sep; echo "${min[*]}") | ||
294 | max_ip=$(IFS=$sep; echo "${max[*]}") | ||
295 | fi | ||
296 | |||
297 | echo $min_ip $max_ip | ||
298 | } | ||
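# Illustrative examples (addresses as used elsewhere in these scripts):
#   parse_addr 198.18.0.42     ->  "198.18.0.42 198.18.0.42"
#   parse_addr 198.18.0.0/15   ->  "198.18.0.0 198.19.255.255"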
299 | |||
300 | function parse_addr6() { parse_addr $@ ; } | ||
301 | |||
302 | # Given a single or range of port(s), return minimum and maximum port number. | ||
303 | function parse_ports() | ||
304 | { | ||
305 | local port_str=$1 | ||
306 | local port_list | ||
307 | local min_port | ||
308 | local max_port | ||
309 | |||
310 | IFS="-" read -ra port_list <<< $port_str | ||
311 | |||
312 | min_port=${port_list[0]} | ||
313 | max_port=${port_list[1]:-$min_port} | ||
314 | |||
315 | echo $min_port $max_port | ||
316 | } | ||
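# Illustrative examples:
#   parse_ports 433-444  ->  "433 444"
#   parse_ports 9        ->  "9 9"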
317 | |||
318 | # Given a minimum and maximum port, verify port number. | ||
319 | function validate_ports() | ||
320 | { | ||
321 | local min_port=$1 | ||
322 | local max_port=$2 | ||
323 | |||
324 | # 1 <= port <= 65535 | ||
325 | if (in_between $min_port 1 65535); then | ||
326 | if (in_between $max_port 1 65535); then | ||
327 | if [[ $min_port -le $max_port ]]; then | ||
328 | return 0 | ||
329 | fi | ||
330 | fi | ||
331 | fi | ||
332 | |||
333 | err 5 "Invalid port(s): $min_port-$max_port" | ||
334 | } | ||
diff --git a/samples/pktgen/parameters.sh b/samples/pktgen/parameters.sh new file mode 100644 index 000000000..ff0ed474f --- /dev/null +++ b/samples/pktgen/parameters.sh | |||
@@ -0,0 +1,121 @@ | |||
1 | # | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | # Common parameter parsing for pktgen scripts | ||
4 | # | ||
5 | |||
6 | function usage() { | ||
7 | echo "" | ||
8 | echo "Usage: $0 [-vx] -i ethX" | ||
9 | echo " -i : (\$DEV) output interface/device (required)" | ||
10 | echo " -s : (\$PKT_SIZE) packet size" | ||
11 | echo " -d : (\$DEST_IP) destination IP. CIDR (e.g. 198.18.0.0/15) is also allowed" | ||
12 | echo " -m : (\$DST_MAC) destination MAC-addr" | ||
13 | echo " -p : (\$DST_PORT) destination PORT range (e.g. 433-444) is also allowed" | ||
14 | echo " -t : (\$THREADS) threads to start" | ||
15 | echo " -f : (\$F_THREAD) index of first thread (zero indexed CPU number)" | ||
16 | echo " -c : (\$SKB_CLONE) SKB clones send before alloc new SKB" | ||
17 | echo " -n : (\$COUNT) num messages to send per thread, 0 means indefinitely" | ||
18 | echo " -b : (\$BURST) HW level bursting of SKBs" | ||
19 | echo " -v : (\$VERBOSE) verbose" | ||
20 | echo " -x : (\$DEBUG) debug" | ||
21 | echo " -6 : (\$IP6) IPv6" | ||
22 | echo "" | ||
23 | } | ||
24 | |||
25 | ## --- Parse command line arguments / parameters --- | ||
26 | ## echo "Commandline options:" | ||
27 | while getopts "s:i:d:m:p:f:t:c:n:b:vxh6" option; do | ||
28 | case $option in | ||
29 | i) # interface | ||
30 | export DEV=$OPTARG | ||
31 | info "Output device set to: DEV=$DEV" | ||
32 | ;; | ||
33 | s) | ||
34 | export PKT_SIZE=$OPTARG | ||
35 | info "Packet size set to: PKT_SIZE=$PKT_SIZE bytes" | ||
36 | ;; | ||
37 | d) # destination IP | ||
38 | export DEST_IP=$OPTARG | ||
39 | info "Destination IP set to: DEST_IP=$DEST_IP" | ||
40 | ;; | ||
41 | m) # MAC | ||
42 | export DST_MAC=$OPTARG | ||
43 | info "Destination MAC set to: DST_MAC=$DST_MAC" | ||
44 | ;; | ||
45 | p) # PORT | ||
46 | export DST_PORT=$OPTARG | ||
47 | info "Destination PORT set to: DST_PORT=$DST_PORT" | ||
48 | ;; | ||
49 | f) | ||
50 | export F_THREAD=$OPTARG | ||
51 | info "Index of first thread (zero indexed CPU number): $F_THREAD" | ||
52 | ;; | ||
53 | t) | ||
54 | export THREADS=$OPTARG | ||
55 | info "Number of threads to start: $THREADS" | ||
56 | ;; | ||
57 | c) | ||
58 | export CLONE_SKB=$OPTARG | ||
59 | info "CLONE_SKB=$CLONE_SKB" | ||
60 | ;; | ||
61 | n) | ||
62 | export COUNT=$OPTARG | ||
63 | info "COUNT=$COUNT" | ||
64 | ;; | ||
65 | b) | ||
66 | export BURST=$OPTARG | ||
67 | info "SKB bursting: BURST=$BURST" | ||
68 | ;; | ||
69 | v) | ||
70 | export VERBOSE=yes | ||
71 | info "Verbose mode: VERBOSE=$VERBOSE" | ||
72 | ;; | ||
73 | x) | ||
74 | export DEBUG=yes | ||
75 | info "Debug mode: DEBUG=$DEBUG" | ||
76 | ;; | ||
77 | 6) | ||
78 | export IP6=6 | ||
79 | info "IP6: IP6=$IP6" | ||
80 | ;; | ||
81 | h|?|*) | ||
82 | usage; | ||
83 | err 2 "[ERROR] Unknown parameters!!!" | ||
84 | esac | ||
85 | done | ||
86 | shift $(( $OPTIND - 1 )) | ||
87 | |||
88 | if [ -z "$PKT_SIZE" ]; then | ||
89 | # NIC adds 4 bytes CRC | ||
90 | export PKT_SIZE=60 | ||
91 | info "Default packet size set to: set to: $PKT_SIZE bytes" | ||
92 | fi | ||
93 | |||
94 | if [ -z "$F_THREAD" ]; then | ||
95 | # First thread (F_THREAD) references the zero-indexed CPU number | ||
96 | export F_THREAD=0 | ||
97 | fi | ||
98 | |||
99 | if [ -z "$THREADS" ]; then | ||
100 | export THREADS=1 | ||
101 | fi | ||
102 | |||
103 | export L_THREAD=$(( THREADS + F_THREAD - 1 )) | ||
104 | |||
105 | if [ -z "$DEV" ]; then | ||
106 | usage | ||
107 | err 2 "Please specify output device" | ||
108 | fi | ||
109 | |||
110 | if [ -z "$DST_MAC" ]; then | ||
111 | warn "Missing destination MAC address" | ||
112 | fi | ||
113 | |||
114 | if [ -z "$DEST_IP" ]; then | ||
115 | warn "Missing destination IP address" | ||
116 | fi | ||
117 | |||
118 | if [ ! -d /proc/net/pktgen ]; then | ||
119 | info "Loading kernel module: pktgen" | ||
120 | modprobe pktgen | ||
121 | fi | ||
diff --git a/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh new file mode 100755 index 000000000..1b6204125 --- /dev/null +++ b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh | |||
@@ -0,0 +1,105 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | # | ||
4 | # Benchmark script: | ||
5 | # - developed for benchmarking ingress qdisc path | ||
6 | # | ||
7 | # Script for injecting packets into RX path of the stack with pktgen | ||
8 | # "xmit_mode netif_receive". With an invalid dst_mac this will only | ||
9 | # measure the ingress code path as packets get dropped in ip_rcv(). | ||
10 | # | ||
11 | # This script doesn't really need any hardware. It benchmarks the software | ||
12 | # RX path just after the NIC driver level. Bursting also "removes" | ||
13 | # the SKB alloc/free overhead. | ||
14 | # | ||
15 | # Setup scenarios for measuring ingress qdisc (with invalid dst_mac): | ||
16 | # ------------------------------------------------------------------ | ||
17 | # (1) no ingress (uses static_key_false(&ingress_needed)) | ||
18 | # | ||
19 | # (2) ingress on other dev (change ingress_needed and calls | ||
20 | # handle_ing() but exit early) | ||
21 | # | ||
22 | # config: tc qdisc add dev $SOMEDEV handle ffff: ingress | ||
23 | # | ||
24 | # (3) ingress on this dev, handle_ing() -> tc_classify() | ||
25 | # | ||
26 | # config: tc qdisc add dev $DEV handle ffff: ingress | ||
27 | # | ||
28 | # (4) ingress on this dev + drop at u32 classifier/action. | ||
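#
#       config: tc qdisc add dev $DEV handle ffff: ingress
#               # illustrative match-all u32 drop rule (not part of the
#               # original script; adjust to your own test setup):
#               tc filter add dev $DEV parent ffff: protocol ip prio 1 \
#                  u32 match u32 0 0 flowid 1:1 action drop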
29 | # | ||
30 | basedir=`dirname $0` | ||
31 | source ${basedir}/functions.sh | ||
32 | root_check_run_with_sudo "$@" | ||
33 | |||
34 | # Parameter parsing via include | ||
35 | source ${basedir}/parameters.sh | ||
36 | # Using invalid DST_MAC will cause the packets to get dropped in | ||
37 | # ip_rcv() which is part of the test | ||
38 | if [ -z "$DEST_IP" ]; then | ||
39 | [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" | ||
40 | fi | ||
41 | [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" | ||
42 | [ -z "$BURST" ] && BURST=1024 | ||
43 | [ -z "$COUNT" ] && COUNT="10000000" # Zero means indefinitely | ||
44 | if [ -n "$DEST_IP" ]; then | ||
45 | validate_addr${IP6} $DEST_IP | ||
46 | read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) | ||
47 | fi | ||
48 | if [ -n "$DST_PORT" ]; then | ||
49 | read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) | ||
50 | validate_ports $UDP_DST_MIN $UDP_DST_MAX | ||
51 | fi | ||
52 | |||
53 | # Base Config | ||
54 | DELAY="0" # Zero means max speed | ||
55 | |||
56 | # General cleanup everything since last run | ||
57 | pg_ctrl "reset" | ||
58 | |||
59 | # Threads are specified with parameter -t value in $THREADS | ||
60 | for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do | ||
61 | # The device name is extended with @name, using the thread number to | ||
62 | # make them unique, but any name will do. | ||
63 | dev=${DEV}@${thread} | ||
64 | |||
65 | # First remove all other devices, then add_device $dev to the thread | ||
66 | pg_thread $thread "rem_device_all" | ||
67 | pg_thread $thread "add_device" $dev | ||
68 | |||
69 | # Base config of dev | ||
70 | pg_set $dev "flag QUEUE_MAP_CPU" | ||
71 | pg_set $dev "count $COUNT" | ||
72 | pg_set $dev "pkt_size $PKT_SIZE" | ||
73 | pg_set $dev "delay $DELAY" | ||
74 | pg_set $dev "flag NO_TIMESTAMP" | ||
75 | |||
76 | # Destination | ||
77 | pg_set $dev "dst_mac $DST_MAC" | ||
78 | pg_set $dev "dst${IP6}_min $DST_MIN" | ||
79 | pg_set $dev "dst${IP6}_max $DST_MAX" | ||
80 | |||
81 | if [ -n "$DST_PORT" ]; then | ||
82 | # Single destination port or random port range | ||
83 | pg_set $dev "flag UDPDST_RND" | ||
84 | pg_set $dev "udp_dst_min $UDP_DST_MIN" | ||
85 | pg_set $dev "udp_dst_max $UDP_DST_MAX" | ||
86 | fi | ||
87 | |||
88 | # Inject packet into RX path of stack | ||
89 | pg_set $dev "xmit_mode netif_receive" | ||
90 | |||
91 | # Burst allows us to avoid measuring the SKB alloc/free overhead | ||
92 | pg_set $dev "burst $BURST" | ||
93 | done | ||
94 | |||
95 | # start_run | ||
96 | echo "Running... ctrl^C to stop" >&2 | ||
97 | pg_ctrl "start" | ||
98 | echo "Done" >&2 | ||
99 | |||
100 | # Print results | ||
101 | for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do | ||
102 | dev=${DEV}@${thread} | ||
103 | echo "Device: $dev" | ||
104 | cat /proc/net/pktgen/$dev | grep -A2 "Result:" | ||
105 | done | ||
diff --git a/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh new file mode 100755 index 000000000..e607cb369 --- /dev/null +++ b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh | |||
@@ -0,0 +1,85 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | # | ||
4 | # Benchmark script: | ||
5 | # - developed for benchmarking egress qdisc path, derived (more | ||
6 | # like cut'n'pasted) from ingress benchmark script. | ||
7 | # | ||
8 | # Script for injecting packets into egress qdisc path of the stack | ||
9 | # with pktgen "xmit_mode queue_xmit". | ||
10 | # | ||
11 | basedir=`dirname $0` | ||
12 | source ${basedir}/functions.sh | ||
13 | root_check_run_with_sudo "$@" | ||
14 | |||
15 | # Parameter parsing via include | ||
16 | source ${basedir}/parameters.sh | ||
17 | if [ -z "$DEST_IP" ]; then | ||
18 | [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" | ||
19 | fi | ||
20 | [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" | ||
21 | |||
22 | # A burst greater than 1 is invalid for queue_xmit mode | ||
23 | if [[ -n "$BURST" ]]; then | ||
24 | err 1 "Bursting not supported for this mode" | ||
25 | fi | ||
26 | [ -z "$COUNT" ] && COUNT="10000000" # Zero means indefinitely | ||
27 | if [ -n "$DEST_IP" ]; then | ||
28 | validate_addr${IP6} $DEST_IP | ||
29 | read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) | ||
30 | fi | ||
31 | if [ -n "$DST_PORT" ]; then | ||
32 | read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) | ||
33 | validate_ports $UDP_DST_MIN $UDP_DST_MAX | ||
34 | fi | ||
35 | |||
36 | # Base Config | ||
37 | DELAY="0" # Zero means max speed | ||
38 | |||
39 | # General cleanup everything since last run | ||
40 | pg_ctrl "reset" | ||
41 | |||
42 | # Threads are specified with parameter -t value in $THREADS | ||
43 | for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do | ||
44 | # The device name is extended with @name, using the thread number to | ||
45 | # make them unique, but any name will do. | ||
46 | dev=${DEV}@${thread} | ||
47 | |||
48 | # First remove all other devices, then add_device $dev to the thread | ||
49 | pg_thread $thread "rem_device_all" | ||
50 | pg_thread $thread "add_device" $dev | ||
51 | |||
52 | # Base config of dev | ||
53 | pg_set $dev "flag QUEUE_MAP_CPU" | ||
54 | pg_set $dev "count $COUNT" | ||
55 | pg_set $dev "pkt_size $PKT_SIZE" | ||
56 | pg_set $dev "delay $DELAY" | ||
57 | pg_set $dev "flag NO_TIMESTAMP" | ||
58 | |||
59 | # Destination | ||
60 | pg_set $dev "dst_mac $DST_MAC" | ||
61 | pg_set $dev "dst${IP6}_min $DST_MIN" | ||
62 | pg_set $dev "dst${IP6}_max $DST_MAX" | ||
63 | |||
64 | if [ -n "$DST_PORT" ]; then | ||
65 | # Single destination port or random port range | ||
66 | pg_set $dev "flag UDPDST_RND" | ||
67 | pg_set $dev "udp_dst_min $UDP_DST_MIN" | ||
68 | pg_set $dev "udp_dst_max $UDP_DST_MAX" | ||
69 | fi | ||
70 | |||
71 | # Inject packet into TX qdisc egress path of stack | ||
72 | pg_set $dev "xmit_mode queue_xmit" | ||
73 | done | ||
74 | |||
75 | # start_run | ||
76 | echo "Running... ctrl^C to stop" >&2 | ||
77 | pg_ctrl "start" | ||
78 | echo "Done" >&2 | ||
79 | |||
80 | # Print results | ||
81 | for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do | ||
82 | dev=${DEV}@${thread} | ||
83 | echo "Device: $dev" | ||
84 | cat /proc/net/pktgen/$dev | grep -A2 "Result:" | ||
85 | done | ||
diff --git a/samples/pktgen/pktgen_sample01_simple.sh b/samples/pktgen/pktgen_sample01_simple.sh new file mode 100755 index 000000000..a4e250b45 --- /dev/null +++ b/samples/pktgen/pktgen_sample01_simple.sh | |||
@@ -0,0 +1,90 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | # | ||
4 | # Simple example: | ||
5 | # * pktgen sending with single thread and single interface | ||
6 | # * flow variation via random UDP source port | ||
7 | # | ||
8 | basedir=`dirname $0` | ||
9 | source ${basedir}/functions.sh | ||
10 | root_check_run_with_sudo "$@" | ||
11 | |||
12 | # Parameter parsing via include | ||
13 | # - go look in parameters.sh to see which settings are available | ||
14 | # - required param is the interface "-i" stored in $DEV | ||
15 | source ${basedir}/parameters.sh | ||
16 | # | ||
17 | # Set some default params, if they didn't get set | ||
18 | if [ -z "$DEST_IP" ]; then | ||
19 | [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" | ||
20 | fi | ||
21 | [ -z "$CLONE_SKB" ] && CLONE_SKB="0" | ||
22 | # Example: enforce param "-m" for dst_mac | ||
23 | [ -z "$DST_MAC" ] && usage && err 2 "Must specify -m dst_mac" | ||
24 | [ -z "$COUNT" ] && COUNT="100000" # Zero means indefinitely | ||
25 | if [ -n "$DEST_IP" ]; then | ||
26 | validate_addr${IP6} $DEST_IP | ||
27 | read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) | ||
28 | fi | ||
29 | if [ -n "$DST_PORT" ]; then | ||
30 | read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) | ||
31 | validate_ports $UDP_DST_MIN $UDP_DST_MAX | ||
32 | fi | ||
33 | |||
34 | # Base Config | ||
35 | DELAY="0" # Zero means max speed | ||
36 | |||
37 | # Flow variation random source port between min and max | ||
38 | UDP_SRC_MIN=9 | ||
39 | UDP_SRC_MAX=109 | ||
40 | |||
41 | # General cleanup everything since last run | ||
42 | # (especially important if other threads were configured by other scripts) | ||
43 | pg_ctrl "reset" | ||
44 | |||
45 | # First remove all other devices, then add_device $DEV to thread 0 | ||
46 | thread=0 | ||
47 | pg_thread $thread "rem_device_all" | ||
48 | pg_thread $thread "add_device" $DEV | ||
49 | |||
50 | # How many packets to send (zero means indefinitely) | ||
51 | pg_set $DEV "count $COUNT" | ||
52 | |||
53 | # Reduce alloc cost by sending same SKB many times | ||
54 | # - this obviously affects the randomness within the packet | ||
55 | pg_set $DEV "clone_skb $CLONE_SKB" | ||
56 | |||
57 | # Set packet size | ||
58 | pg_set $DEV "pkt_size $PKT_SIZE" | ||
59 | |||
60 | # Delay between packets (zero means max speed) | ||
61 | pg_set $DEV "delay $DELAY" | ||
62 | |||
63 | # Flag example disabling timestamping | ||
64 | pg_set $DEV "flag NO_TIMESTAMP" | ||
65 | |||
66 | # Destination | ||
67 | pg_set $DEV "dst_mac $DST_MAC" | ||
68 | pg_set $DEV "dst${IP6}_min $DST_MIN" | ||
69 | pg_set $DEV "dst${IP6}_max $DST_MAX" | ||
70 | |||
71 | if [ -n "$DST_PORT" ]; then | ||
72 | # Single destination port or random port range | ||
73 | pg_set $DEV "flag UDPDST_RND" | ||
74 | pg_set $DEV "udp_dst_min $UDP_DST_MIN" | ||
75 | pg_set $DEV "udp_dst_max $UDP_DST_MAX" | ||
76 | fi | ||
77 | |||
78 | # Setup random UDP port src range | ||
79 | pg_set $DEV "flag UDPSRC_RND" | ||
80 | pg_set $DEV "udp_src_min $UDP_SRC_MIN" | ||
81 | pg_set $DEV "udp_src_max $UDP_SRC_MAX" | ||
82 | |||
83 | # start_run | ||
84 | echo "Running... ctrl^C to stop" >&2 | ||
85 | pg_ctrl "start" | ||
86 | echo "Done" >&2 | ||
87 | |||
88 | # Print results | ||
89 | echo "Result device: $DEV" | ||
90 | cat /proc/net/pktgen/$DEV | ||
diff --git a/samples/pktgen/pktgen_sample02_multiqueue.sh b/samples/pktgen/pktgen_sample02_multiqueue.sh new file mode 100755 index 000000000..cb2495fcd --- /dev/null +++ b/samples/pktgen/pktgen_sample02_multiqueue.sh | |||
@@ -0,0 +1,95 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | # | ||
4 | # Multiqueue: Using pktgen threads for sending on multiple CPUs | ||
5 | # * adding devices to kernel threads | ||
6 | # * notice the naming scheme for keeping device names unique | ||
7 | # * naming scheme: dev@thread_number | ||
8 | # * flow variation via random UDP source port | ||
9 | # | ||
10 | basedir=`dirname $0` | ||
11 | source ${basedir}/functions.sh | ||
12 | root_check_run_with_sudo "$@" | ||
13 | # | ||
14 | # Required param: -i dev in $DEV | ||
15 | source ${basedir}/parameters.sh | ||
16 | |||
17 | [ -z "$COUNT" ] && COUNT="100000" # Zero means indefinitely | ||
18 | |||
19 | # Base Config | ||
20 | DELAY="0" # Zero means max speed | ||
21 | [ -z "$CLONE_SKB" ] && CLONE_SKB="0" | ||
22 | |||
23 | # Flow variation random source port between min and max | ||
24 | UDP_SRC_MIN=9 | ||
25 | UDP_SRC_MAX=109 | ||
26 | |||
27 | # (example of setting default params in your script) | ||
28 | if [ -z "$DEST_IP" ]; then | ||
29 | [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" | ||
30 | fi | ||
31 | [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" | ||
32 | if [ -n "$DEST_IP" ]; then | ||
33 | validate_addr${IP6} $DEST_IP | ||
34 | read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) | ||
35 | fi | ||
36 | if [ -n "$DST_PORT" ]; then | ||
37 | read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) | ||
38 | validate_ports $UDP_DST_MIN $UDP_DST_MAX | ||
39 | fi | ||
40 | |||
41 | # General cleanup everything since last run | ||
42 | pg_ctrl "reset" | ||
43 | |||
44 | # Threads are specified with parameter -t value in $THREADS | ||
45 | for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do | ||
46 | # The device name is extended with @name, using the thread number to | ||
47 | # make them unique, but any name will do. | ||
48 | dev=${DEV}@${thread} | ||
49 | |||
50 | # First remove all other devices, then add_device $dev to the thread | ||
51 | pg_thread $thread "rem_device_all" | ||
52 | pg_thread $thread "add_device" $dev | ||
53 | |||
54 | # Notice config queue to map to cpu (mirrors smp_processor_id()) | ||
55 | # It is beneficial to map IRQ /proc/irq/*/smp_affinity 1:1 to CPU number | ||
56 | pg_set $dev "flag QUEUE_MAP_CPU" | ||
57 | |||
58 | # Base config of dev | ||
59 | pg_set $dev "count $COUNT" | ||
60 | pg_set $dev "clone_skb $CLONE_SKB" | ||
61 | pg_set $dev "pkt_size $PKT_SIZE" | ||
62 | pg_set $dev "delay $DELAY" | ||
63 | |||
64 | # Flag example disabling timestamping | ||
65 | pg_set $dev "flag NO_TIMESTAMP" | ||
66 | |||
67 | # Destination | ||
68 | pg_set $dev "dst_mac $DST_MAC" | ||
69 | pg_set $dev "dst${IP6}_min $DST_MIN" | ||
70 | pg_set $dev "dst${IP6}_max $DST_MAX" | ||
71 | |||
72 | if [ -n "$DST_PORT" ]; then | ||
73 | # Single destination port or random port range | ||
74 | pg_set $dev "flag UDPDST_RND" | ||
75 | pg_set $dev "udp_dst_min $UDP_DST_MIN" | ||
76 | pg_set $dev "udp_dst_max $UDP_DST_MAX" | ||
77 | fi | ||
78 | |||
79 | # Setup random UDP port src range | ||
80 | pg_set $dev "flag UDPSRC_RND" | ||
81 | pg_set $dev "udp_src_min $UDP_SRC_MIN" | ||
82 | pg_set $dev "udp_src_max $UDP_SRC_MAX" | ||
83 | done | ||
84 | |||
85 | # start_run | ||
86 | echo "Running... ctrl^C to stop" >&2 | ||
87 | pg_ctrl "start" | ||
88 | echo "Done" >&2 | ||
89 | |||
90 | # Print results | ||
91 | for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do | ||
92 | dev=${DEV}@${thread} | ||
93 | echo "Device: $dev" | ||
94 | cat /proc/net/pktgen/$dev | grep -A2 "Result:" | ||
95 | done | ||
diff --git a/samples/pktgen/pktgen_sample03_burst_single_flow.sh b/samples/pktgen/pktgen_sample03_burst_single_flow.sh new file mode 100755 index 000000000..fff50765a --- /dev/null +++ b/samples/pktgen/pktgen_sample03_burst_single_flow.sh | |||
@@ -0,0 +1,101 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | # | ||
4 | # Script for max single flow performance | ||
5 | # - If correctly tuned[1], single CPU 10G wirespeed small pkts is possible[2] | ||
6 | # | ||
7 | # Using pktgen "burst" option (use -b $N) | ||
8 | # - To boost max performance | ||
9 | # - Avail since: kernel v3.18 | ||
10 | # * commit 38b2cf2982dc73 ("net: pktgen: packet bursting via skb->xmit_more") | ||
11 | # - This avoids writing the HW tailptr on every driver xmit | ||
12 | # - The performance boost is impressive, see commit and blog [2] | ||
13 | # | ||
14 | # Notice: This on purpose generates a single (UDP) flow towards the target; | ||
15 | # the reason is to only overload/activate a single CPU on the target host. | ||
16 | # Having no randomness in pktgen also makes it faster. | ||
17 | # | ||
18 | # Tuning see: | ||
19 | # [1] http://netoptimizer.blogspot.dk/2014/06/pktgen-for-network-overload-testing.html | ||
20 | # [2] http://netoptimizer.blogspot.dk/2014/10/unlocked-10gbps-tx-wirespeed-smallest.html | ||
21 | # | ||
22 | basedir=`dirname $0` | ||
23 | source ${basedir}/functions.sh | ||
24 | root_check_run_with_sudo "$@" | ||
25 | |||
26 | # Parameter parsing via include | ||
27 | source ${basedir}/parameters.sh | ||
28 | # Set some default params, if they didn't get set | ||
29 | if [ -z "$DEST_IP" ]; then | ||
30 | [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" | ||
31 | fi | ||
32 | [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" | ||
33 | [ -z "$BURST" ] && BURST=32 | ||
34 | [ -z "$CLONE_SKB" ] && CLONE_SKB="0" # No need for clones when bursting | ||
35 | [ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely | ||
36 | if [ -n "$DEST_IP" ]; then | ||
37 | validate_addr${IP6} $DEST_IP | ||
38 | read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) | ||
39 | fi | ||
40 | if [ -n "$DST_PORT" ]; then | ||
41 | read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) | ||
42 | validate_ports $UDP_DST_MIN $UDP_DST_MAX | ||
43 | fi | ||
44 | |||
45 | # Base Config | ||
46 | DELAY="0" # Zero means max speed | ||
47 | |||
48 | # General cleanup everything since last run | ||
49 | pg_ctrl "reset" | ||
50 | |||
51 | # Threads are specified with parameter -t value in $THREADS | ||
52 | for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do | ||
53 | dev=${DEV}@${thread} | ||
54 | |||
55 | # First remove all other devices, then add_device $dev to the thread | ||
56 | pg_thread $thread "rem_device_all" | ||
57 | pg_thread $thread "add_device" $dev | ||
58 | |||
59 | # Base config | ||
60 | pg_set $dev "flag QUEUE_MAP_CPU" | ||
61 | pg_set $dev "count $COUNT" | ||
62 | pg_set $dev "clone_skb $CLONE_SKB" | ||
63 | pg_set $dev "pkt_size $PKT_SIZE" | ||
64 | pg_set $dev "delay $DELAY" | ||
65 | pg_set $dev "flag NO_TIMESTAMP" | ||
66 | |||
67 | # Destination | ||
68 | pg_set $dev "dst_mac $DST_MAC" | ||
69 | pg_set $dev "dst${IP6}_min $DST_MIN" | ||
70 | pg_set $dev "dst${IP6}_max $DST_MAX" | ||
71 | |||
72 | if [ -n "$DST_PORT" ]; then | ||
73 | # Single destination port or random port range | ||
74 | pg_set $dev "flag UDPDST_RND" | ||
75 | pg_set $dev "udp_dst_min $UDP_DST_MIN" | ||
76 | pg_set $dev "udp_dst_max $UDP_DST_MAX" | ||
77 | fi | ||
78 | |||
79 | # Setup burst; for easy testing, -b 0 disables bursting | ||
80 | # (internally in pktgen default and minimum burst=1) | ||
81 | if [[ ${BURST} -ne 0 ]]; then | ||
82 | pg_set $dev "burst $BURST" | ||
83 | else | ||
84 | info "$dev: Not using burst" | ||
85 | fi | ||
86 | done | ||
87 | |||
88 | # Run if user hits control-c | ||
89 | function control_c() { | ||
90 | # Print results | ||
91 | for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do | ||
92 | dev=${DEV}@${thread} | ||
93 | echo "Device: $dev" | ||
94 | cat /proc/net/pktgen/$dev | grep -A2 "Result:" | ||
95 | done | ||
96 | } | ||
97 | # trap keyboard interrupt (Ctrl-C) | ||
98 | trap control_c SIGINT | ||
99 | |||
100 | echo "Running... ctrl^C to stop" >&2 | ||
101 | pg_ctrl "start" | ||
diff --git a/samples/pktgen/pktgen_sample04_many_flows.sh b/samples/pktgen/pktgen_sample04_many_flows.sh new file mode 100755 index 000000000..9db1ecf8d --- /dev/null +++ b/samples/pktgen/pktgen_sample04_many_flows.sh | |||
@@ -0,0 +1,115 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | # | ||
4 | # Script example for many flows testing | ||
5 | # | ||
6 | # Number of simultaneous flows limited by variable $FLOWS | ||
7 | # and number of packets per flow controlled by variable $FLOWLEN | ||
8 | # | ||
9 | basedir=`dirname $0` | ||
10 | source ${basedir}/functions.sh | ||
11 | root_check_run_with_sudo "$@" | ||
12 | |||
13 | # Parameter parsing via include | ||
14 | source ${basedir}/parameters.sh | ||
15 | # Set some default params, if they didn't get set | ||
16 | if [ -z "$DEST_IP" ]; then | ||
17 | [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" | ||
18 | fi | ||
19 | [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" | ||
20 | [ -z "$CLONE_SKB" ] && CLONE_SKB="0" | ||
21 | [ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely | ||
22 | if [ -n "$DEST_IP" ]; then | ||
23 | validate_addr${IP6} $DEST_IP | ||
24 | read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) | ||
25 | fi | ||
26 | if [ -n "$DST_PORT" ]; then | ||
27 | read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) | ||
28 | validate_ports $UDP_DST_MIN $UDP_DST_MAX | ||
29 | fi | ||
30 | |||
31 | # NOTICE: Script specific settings | ||
32 | # ======= | ||
33 | # Limiting the number of concurrent flows ($FLOWS) | ||
34 | # and also set how many packets each flow contains ($FLOWLEN) | ||
35 | # | ||
36 | [ -z "$FLOWS" ] && FLOWS="8000" | ||
37 | [ -z "$FLOWLEN" ] && FLOWLEN="10" | ||
38 | |||
39 | # Base Config | ||
40 | DELAY="0" # Zero means max speed | ||
41 | |||
42 | if [[ -n "$BURST" ]]; then | ||
43 | err 1 "Bursting not supported for this mode" | ||
44 | fi | ||
45 | |||
46 | # 198.18.0.0 / 198.19.255.255 | ||
47 | read -r SRC_MIN SRC_MAX <<< $(parse_addr 198.18.0.0/15) | ||
48 | |||
49 | # General cleanup everything since last run | ||
50 | pg_ctrl "reset" | ||
51 | |||
52 | # Threads are specified with parameter -t value in $THREADS | ||
53 | for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do | ||
54 | dev=${DEV}@${thread} | ||
55 | |||
56 | # First remove all other devices, then add_device $dev to the thread | ||
57 | pg_thread $thread "rem_device_all" | ||
58 | pg_thread $thread "add_device" $dev | ||
59 | |||
60 | # Base config | ||
61 | pg_set $dev "flag QUEUE_MAP_CPU" | ||
62 | pg_set $dev "count $COUNT" | ||
63 | pg_set $dev "clone_skb $CLONE_SKB" | ||
64 | pg_set $dev "pkt_size $PKT_SIZE" | ||
65 | pg_set $dev "delay $DELAY" | ||
66 | pg_set $dev "flag NO_TIMESTAMP" | ||
67 | |||
68 | # Single destination | ||
69 | pg_set $dev "dst_mac $DST_MAC" | ||
70 | pg_set $dev "dst${IP6}_min $DST_MIN" | ||
71 | pg_set $dev "dst${IP6}_max $DST_MAX" | ||
72 | |||
73 | if [ -n "$DST_PORT" ]; then | ||
74 | # Single destination port or random port range | ||
75 | pg_set $dev "flag UDPDST_RND" | ||
76 | pg_set $dev "udp_dst_min $UDP_DST_MIN" | ||
77 | pg_set $dev "udp_dst_max $UDP_DST_MAX" | ||
78 | fi | ||
79 | |||
80 | # Randomize source IP-addresses | ||
81 | pg_set $dev "flag IPSRC_RND" | ||
82 | pg_set $dev "src_min $SRC_MIN" | ||
83 | pg_set $dev "src_max $SRC_MAX" | ||
84 | |||
85 | # Limit number of flows (max 65535) | ||
86 | pg_set $dev "flows $FLOWS" | ||
87 | # | ||
88 | # How many packets a flow will send, before flow "entry" is | ||
89 | # re-generated/setup. | ||
90 | pg_set $dev "flowlen $FLOWLEN" | ||
91 | # | ||
92 | # Flag FLOW_SEQ will cause $FLOWLEN packets from the same flow | ||
93 | # to be sent back-to-back, before the next flow is selected | ||
94 | # incrementally. This helps lookup caches, and is more realistic. | ||
95 | # | ||
96 | pg_set $dev "flag FLOW_SEQ" | ||
97 | |||
98 | done | ||
99 | |||
100 | # Run if user hits control-c | ||
101 | function print_result() { | ||
102 | # Print results | ||
103 | for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do | ||
104 | dev=${DEV}@${thread} | ||
105 | echo "Device: $dev" | ||
106 | cat /proc/net/pktgen/$dev | grep -A2 "Result:" | ||
107 | done | ||
108 | } | ||
109 | # trap keyboard interrupt (Ctrl-C) | ||
110 | trap true SIGINT | ||
111 | |||
112 | echo "Running... ctrl^C to stop" >&2 | ||
113 | pg_ctrl "start" | ||
114 | |||
115 | print_result | ||
diff --git a/samples/pktgen/pktgen_sample05_flow_per_thread.sh b/samples/pktgen/pktgen_sample05_flow_per_thread.sh new file mode 100755 index 000000000..9fc6c6da0 --- /dev/null +++ b/samples/pktgen/pktgen_sample05_flow_per_thread.sh | |||
@@ -0,0 +1,99 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | # | ||
4 | # Script will generate one flow per thread (-t N) | ||
5 | # - Same destination IP | ||
6 | # - Fake source IPs for each flow (fixed based on thread number) | ||
7 | # | ||
8 | # Useful for scale testing on receiver, to see whether silo'ing flows | ||
9 | # works and scales. For optimal scalability (on receiver) each | ||
10 | # separate-flow should not access shared variables/data. This script | ||
11 | # helps magnify any of these scaling issues by overloading the receiver. | ||
12 | # | ||
13 | basedir=`dirname $0` | ||
14 | source ${basedir}/functions.sh | ||
15 | root_check_run_with_sudo "$@" | ||
16 | |||
17 | # Parameter parsing via include | ||
18 | source ${basedir}/parameters.sh | ||
19 | # Set some default params, if they didn't get set | ||
20 | if [ -z "$DEST_IP" ]; then | ||
21 | [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" | ||
22 | fi | ||
23 | [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" | ||
24 | [ -z "$CLONE_SKB" ] && CLONE_SKB="0" | ||
25 | [ -z "$BURST" ] && BURST=32 | ||
26 | [ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely | ||
27 | if [ -n "$DEST_IP" ]; then | ||
28 | validate_addr${IP6} $DEST_IP | ||
29 | read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) | ||
30 | fi | ||
31 | if [ -n "$DST_PORT" ]; then | ||
32 | read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) | ||
33 | validate_ports $UDP_DST_MIN $UDP_DST_MAX | ||
34 | fi | ||
35 | |||
36 | # Base Config | ||
37 | DELAY="0" # Zero means max speed | ||
38 | |||
39 | # General cleanup everything since last run | ||
40 | pg_ctrl "reset" | ||
41 | |||
42 | # Threads are specified with parameter -t value in $THREADS | ||
43 | for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do | ||
44 | dev=${DEV}@${thread} | ||
45 | |||
46 | # First remove all other devices, then add_device $dev to the thread | ||
47 | pg_thread $thread "rem_device_all" | ||
48 | pg_thread $thread "add_device" $dev | ||
49 | |||
50 | # Base config | ||
51 | pg_set $dev "flag QUEUE_MAP_CPU" | ||
52 | pg_set $dev "count $COUNT" | ||
53 | pg_set $dev "clone_skb $CLONE_SKB" | ||
54 | pg_set $dev "pkt_size $PKT_SIZE" | ||
55 | pg_set $dev "delay $DELAY" | ||
56 | pg_set $dev "flag NO_TIMESTAMP" | ||
57 | |||
58 | # Single destination | ||
59 | pg_set $dev "dst_mac $DST_MAC" | ||
60 | pg_set $dev "dst${IP6}_min $DST_MIN" | ||
61 | pg_set $dev "dst${IP6}_max $DST_MAX" | ||
62 | |||
63 | if [ -n "$DST_PORT" ]; then | ||
64 | # Single destination port or random port range | ||
65 | pg_set $dev "flag UDPDST_RND" | ||
66 | pg_set $dev "udp_dst_min $UDP_DST_MIN" | ||
67 | pg_set $dev "udp_dst_max $UDP_DST_MAX" | ||
68 | fi | ||
69 | |||
70 | # Setup source IP-addresses based on thread number | ||
71 | pg_set $dev "src_min 198.18.$((thread+1)).1" | ||
72 | pg_set $dev "src_max 198.18.$((thread+1)).1" | ||
73 | |||
74 | # Setup burst; for easy testing, -b 0 disables bursting | ||
75 | # (internally in pktgen default and minimum burst=1) | ||
76 | if [[ ${BURST} -ne 0 ]]; then | ||
77 | pg_set $dev "burst $BURST" | ||
78 | else | ||
79 | info "$dev: Not using burst" | ||
80 | fi | ||
81 | |||
82 | done | ||
83 | |||
84 | # Run if user hits control-c | ||
85 | function print_result() { | ||
86 | # Print results | ||
87 | for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do | ||
88 | dev=${DEV}@${thread} | ||
89 | echo "Device: $dev" | ||
90 | cat /proc/net/pktgen/$dev | grep -A2 "Result:" | ||
91 | done | ||
92 | } | ||
93 | # trap keyboard interrupt (Ctrl-C) | ||
94 | trap true SIGINT | ||
95 | |||
96 | echo "Running... ctrl^C to stop" >&2 | ||
97 | pg_ctrl "start" | ||
98 | |||
99 | print_result | ||
diff --git a/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh new file mode 100755 index 000000000..728106060 --- /dev/null +++ b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh | |||
@@ -0,0 +1,113 @@ | |||
1 | #!/bin/bash | ||
2 | # | ||
3 | # Multiqueue: Using pktgen threads for sending on multiple CPUs | ||
4 | # * adding devices to kernel threads which are in the same NUMA node | ||
5 | # * bind each device queue's irq affinity to its thread, 1:1 mapping | ||
6 | # * notice the naming scheme for keeping device names unique | ||
7 | # * naming scheme: dev@thread_number | ||
8 | # * flow variation via random UDP source port | ||
9 | # | ||
10 | basedir=`dirname $0` | ||
11 | source ${basedir}/functions.sh | ||
12 | root_check_run_with_sudo "$@" | ||
13 | # | ||
14 | # Required param: -i dev in $DEV | ||
15 | source ${basedir}/parameters.sh | ||
16 | |||
17 | # Base Config | ||
18 | DELAY="0" # Zero means max speed | ||
19 | [ -z "$COUNT" ] && COUNT="20000000" # Zero means indefinitely | ||
20 | [ -z "$CLONE_SKB" ] && CLONE_SKB="0" | ||
21 | |||
22 | # Flow variation random source port between min and max | ||
23 | UDP_SRC_MIN=9 | ||
24 | UDP_SRC_MAX=109 | ||
25 | |||
26 | node=`get_iface_node $DEV` | ||
27 | irq_array=(`get_iface_irqs $DEV`) | ||
28 | cpu_array=(`get_node_cpus $node`) | ||
29 | |||
30 | [ $THREADS -gt ${#irq_array[*]} -o $THREADS -gt ${#cpu_array[*]} ] && \ | ||
31 | err 1 "Thread number $THREADS exceeds: min (${#irq_array[*]},${#cpu_array[*]})" | ||
32 | |||
33 | # (example of setting default params in your script) | ||
34 | if [ -z "$DEST_IP" ]; then | ||
35 | [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" | ||
36 | fi | ||
37 | [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" | ||
38 | if [ -n "$DEST_IP" ]; then | ||
39 | validate_addr${IP6} $DEST_IP | ||
40 | read -r DST_MIN DST_MAX <<< $(parse_addr${IP6} $DEST_IP) | ||
41 | fi | ||
42 | if [ -n "$DST_PORT" ]; then | ||
43 | read -r UDP_DST_MIN UDP_DST_MAX <<< $(parse_ports $DST_PORT) | ||
44 | validate_ports $UDP_DST_MIN $UDP_DST_MAX | ||
45 | fi | ||
46 | |||
47 | # General cleanup everything since last run | ||
48 | pg_ctrl "reset" | ||
49 | |||
50 | # Threads are specified with parameter -t value in $THREADS | ||
51 | for ((i = 0; i < $THREADS; i++)); do | ||
52 | # The device name is extended with @name, using the thread number to | ||
53 | # make them unique, but any name will do. | ||
54 | # Set the queue's irq affinity to this $thread (processor) | ||
55 | # if '-f' is designated, offset cpu id | ||
56 | thread=${cpu_array[$((i+F_THREAD))]} | ||
57 | dev=${DEV}@${thread} | ||
58 | echo $thread > /proc/irq/${irq_array[$i]}/smp_affinity_list | ||
59 | info "irq ${irq_array[$i]} is set affinity to `cat /proc/irq/${irq_array[$i]}/smp_affinity_list`" | ||
60 | |||
61 | # First remove all other devices, then add_device $dev to the thread | ||
62 | pg_thread $thread "rem_device_all" | ||
63 | pg_thread $thread "add_device" $dev | ||
64 | |||
65 | # select queue and bind the queue and $dev in 1:1 relationship | ||
66 | queue_num=$i | ||
67 | info "queue number is $queue_num" | ||
68 | pg_set $dev "queue_map_min $queue_num" | ||
69 | pg_set $dev "queue_map_max $queue_num" | ||
70 | |||
71 | # Notice config queue to map to cpu (mirrors smp_processor_id()) | ||
72 | # It is beneficial to map IRQ /proc/irq/*/smp_affinity 1:1 to CPU number | ||
73 | pg_set $dev "flag QUEUE_MAP_CPU" | ||
74 | |||
75 | # Base config of dev | ||
76 | pg_set $dev "count $COUNT" | ||
77 | pg_set $dev "clone_skb $CLONE_SKB" | ||
78 | pg_set $dev "pkt_size $PKT_SIZE" | ||
79 | pg_set $dev "delay $DELAY" | ||
80 | |||
81 | # Flag example disabling timestamping | ||
82 | pg_set $dev "flag NO_TIMESTAMP" | ||
83 | |||
84 | # Destination | ||
85 | pg_set $dev "dst_mac $DST_MAC" | ||
86 | pg_set $dev "dst${IP6}_min $DST_MIN" | ||
87 | pg_set $dev "dst${IP6}_max $DST_MAX" | ||
88 | |||
89 | if [ -n "$DST_PORT" ]; then | ||
90 | # Single destination port or random port range | ||
91 | pg_set $dev "flag UDPDST_RND" | ||
92 | pg_set $dev "udp_dst_min $UDP_DST_MIN" | ||
93 | pg_set $dev "udp_dst_max $UDP_DST_MAX" | ||
94 | fi | ||
95 | |||
96 | # Setup random UDP port src range | ||
97 | pg_set $dev "flag UDPSRC_RND" | ||
98 | pg_set $dev "udp_src_min $UDP_SRC_MIN" | ||
99 | pg_set $dev "udp_src_max $UDP_SRC_MAX" | ||
100 | done | ||
101 | |||
102 | # start_run | ||
103 | echo "Running... ctrl^C to stop" >&2 | ||
104 | pg_ctrl "start" | ||
105 | echo "Done" >&2 | ||
106 | |||
107 | # Print results | ||
108 | for ((i = 0; i < $THREADS; i++)); do | ||
109 | thread=${cpu_array[$((i+F_THREAD))]} | ||
110 | dev=${DEV}@${thread} | ||
111 | echo "Device: $dev" | ||
112 | cat /proc/net/pktgen/$dev | grep -A2 "Result:" | ||
113 | done | ||
diff --git a/samples/qmi/Makefile b/samples/qmi/Makefile new file mode 100644 index 000000000..641943d40 --- /dev/null +++ b/samples/qmi/Makefile | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | obj-$(CONFIG_SAMPLE_QMI_CLIENT) += qmi_sample_client.o | ||
diff --git a/samples/qmi/qmi_sample_client.c b/samples/qmi/qmi_sample_client.c new file mode 100644 index 000000000..c9e7276c3 --- /dev/null +++ b/samples/qmi/qmi_sample_client.c | |||
@@ -0,0 +1,622 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Sample in-kernel QMI client driver | ||
4 | * | ||
5 | * Copyright (c) 2013-2014, The Linux Foundation. All rights reserved. | ||
6 | * Copyright (C) 2017 Linaro Ltd. | ||
7 | */ | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/debugfs.h> | ||
11 | #include <linux/device.h> | ||
12 | #include <linux/platform_device.h> | ||
13 | #include <linux/qrtr.h> | ||
14 | #include <linux/net.h> | ||
15 | #include <linux/completion.h> | ||
16 | #include <linux/idr.h> | ||
17 | #include <linux/string.h> | ||
18 | #include <net/sock.h> | ||
19 | #include <linux/soc/qcom/qmi.h> | ||
20 | |||
21 | #define PING_REQ1_TLV_TYPE 0x1 | ||
22 | #define PING_RESP1_TLV_TYPE 0x2 | ||
23 | #define PING_OPT1_TLV_TYPE 0x10 | ||
24 | #define PING_OPT2_TLV_TYPE 0x11 | ||
25 | |||
26 | #define DATA_REQ1_TLV_TYPE 0x1 | ||
27 | #define DATA_RESP1_TLV_TYPE 0x2 | ||
28 | #define DATA_OPT1_TLV_TYPE 0x10 | ||
29 | #define DATA_OPT2_TLV_TYPE 0x11 | ||
30 | |||
31 | #define TEST_MED_DATA_SIZE_V01 8192 | ||
32 | #define TEST_MAX_NAME_SIZE_V01 255 | ||
33 | |||
34 | #define TEST_PING_REQ_MSG_ID_V01 0x20 | ||
35 | #define TEST_DATA_REQ_MSG_ID_V01 0x21 | ||
36 | |||
37 | #define TEST_PING_REQ_MAX_MSG_LEN_V01 266 | ||
38 | #define TEST_DATA_REQ_MAX_MSG_LEN_V01 8456 | ||
39 | |||
40 | struct test_name_type_v01 { | ||
41 | u32 name_len; | ||
42 | char name[TEST_MAX_NAME_SIZE_V01]; | ||
43 | }; | ||
44 | |||
45 | static struct qmi_elem_info test_name_type_v01_ei[] = { | ||
46 | { | ||
47 | .data_type = QMI_DATA_LEN, | ||
48 | .elem_len = 1, | ||
49 | .elem_size = sizeof(u8), | ||
50 | .array_type = NO_ARRAY, | ||
51 | .tlv_type = QMI_COMMON_TLV_TYPE, | ||
52 | .offset = offsetof(struct test_name_type_v01, | ||
53 | name_len), | ||
54 | }, | ||
55 | { | ||
56 | .data_type = QMI_UNSIGNED_1_BYTE, | ||
57 | .elem_len = TEST_MAX_NAME_SIZE_V01, | ||
58 | .elem_size = sizeof(char), | ||
59 | .array_type = VAR_LEN_ARRAY, | ||
60 | .tlv_type = QMI_COMMON_TLV_TYPE, | ||
61 | .offset = offsetof(struct test_name_type_v01, | ||
62 | name), | ||
63 | }, | ||
64 | {} | ||
65 | }; | ||
66 | |||
67 | struct test_ping_req_msg_v01 { | ||
68 | char ping[4]; | ||
69 | |||
70 | u8 client_name_valid; | ||
71 | struct test_name_type_v01 client_name; | ||
72 | }; | ||
73 | |||
74 | static struct qmi_elem_info test_ping_req_msg_v01_ei[] = { | ||
75 | { | ||
76 | .data_type = QMI_UNSIGNED_1_BYTE, | ||
77 | .elem_len = 4, | ||
78 | .elem_size = sizeof(char), | ||
79 | .array_type = STATIC_ARRAY, | ||
80 | .tlv_type = PING_REQ1_TLV_TYPE, | ||
81 | .offset = offsetof(struct test_ping_req_msg_v01, | ||
82 | ping), | ||
83 | }, | ||
84 | { | ||
85 | .data_type = QMI_OPT_FLAG, | ||
86 | .elem_len = 1, | ||
87 | .elem_size = sizeof(u8), | ||
88 | .array_type = NO_ARRAY, | ||
89 | .tlv_type = PING_OPT1_TLV_TYPE, | ||
90 | .offset = offsetof(struct test_ping_req_msg_v01, | ||
91 | client_name_valid), | ||
92 | }, | ||
93 | { | ||
94 | .data_type = QMI_STRUCT, | ||
95 | .elem_len = 1, | ||
96 | .elem_size = sizeof(struct test_name_type_v01), | ||
97 | .array_type = NO_ARRAY, | ||
98 | .tlv_type = PING_OPT1_TLV_TYPE, | ||
99 | .offset = offsetof(struct test_ping_req_msg_v01, | ||
100 | client_name), | ||
101 | .ei_array = test_name_type_v01_ei, | ||
102 | }, | ||
103 | {} | ||
104 | }; | ||
105 | |||
106 | struct test_ping_resp_msg_v01 { | ||
107 | struct qmi_response_type_v01 resp; | ||
108 | |||
109 | u8 pong_valid; | ||
110 | char pong[4]; | ||
111 | |||
112 | u8 service_name_valid; | ||
113 | struct test_name_type_v01 service_name; | ||
114 | }; | ||
115 | |||
116 | static struct qmi_elem_info test_ping_resp_msg_v01_ei[] = { | ||
117 | { | ||
118 | .data_type = QMI_STRUCT, | ||
119 | .elem_len = 1, | ||
120 | .elem_size = sizeof(struct qmi_response_type_v01), | ||
121 | .array_type = NO_ARRAY, | ||
122 | .tlv_type = PING_RESP1_TLV_TYPE, | ||
123 | .offset = offsetof(struct test_ping_resp_msg_v01, | ||
124 | resp), | ||
125 | .ei_array = qmi_response_type_v01_ei, | ||
126 | }, | ||
127 | { | ||
128 | .data_type = QMI_OPT_FLAG, | ||
129 | .elem_len = 1, | ||
130 | .elem_size = sizeof(u8), | ||
131 | .array_type = NO_ARRAY, | ||
132 | .tlv_type = PING_OPT1_TLV_TYPE, | ||
133 | .offset = offsetof(struct test_ping_resp_msg_v01, | ||
134 | pong_valid), | ||
135 | }, | ||
136 | { | ||
137 | .data_type = QMI_UNSIGNED_1_BYTE, | ||
138 | .elem_len = 4, | ||
139 | .elem_size = sizeof(char), | ||
140 | .array_type = STATIC_ARRAY, | ||
141 | .tlv_type = PING_OPT1_TLV_TYPE, | ||
142 | .offset = offsetof(struct test_ping_resp_msg_v01, | ||
143 | pong), | ||
144 | }, | ||
145 | { | ||
146 | .data_type = QMI_OPT_FLAG, | ||
147 | .elem_len = 1, | ||
148 | .elem_size = sizeof(u8), | ||
149 | .array_type = NO_ARRAY, | ||
150 | .tlv_type = PING_OPT2_TLV_TYPE, | ||
151 | .offset = offsetof(struct test_ping_resp_msg_v01, | ||
152 | service_name_valid), | ||
153 | }, | ||
154 | { | ||
155 | .data_type = QMI_STRUCT, | ||
156 | .elem_len = 1, | ||
157 | .elem_size = sizeof(struct test_name_type_v01), | ||
158 | .array_type = NO_ARRAY, | ||
159 | .tlv_type = PING_OPT2_TLV_TYPE, | ||
160 | .offset = offsetof(struct test_ping_resp_msg_v01, | ||
161 | service_name), | ||
162 | .ei_array = test_name_type_v01_ei, | ||
163 | }, | ||
164 | {} | ||
165 | }; | ||
166 | |||
167 | struct test_data_req_msg_v01 { | ||
168 | u32 data_len; | ||
169 | u8 data[TEST_MED_DATA_SIZE_V01]; | ||
170 | |||
171 | u8 client_name_valid; | ||
172 | struct test_name_type_v01 client_name; | ||
173 | }; | ||
174 | |||
175 | static struct qmi_elem_info test_data_req_msg_v01_ei[] = { | ||
176 | { | ||
177 | .data_type = QMI_DATA_LEN, | ||
178 | .elem_len = 1, | ||
179 | .elem_size = sizeof(u32), | ||
180 | .array_type = NO_ARRAY, | ||
181 | .tlv_type = DATA_REQ1_TLV_TYPE, | ||
182 | .offset = offsetof(struct test_data_req_msg_v01, | ||
183 | data_len), | ||
184 | }, | ||
185 | { | ||
186 | .data_type = QMI_UNSIGNED_1_BYTE, | ||
187 | .elem_len = TEST_MED_DATA_SIZE_V01, | ||
188 | .elem_size = sizeof(u8), | ||
189 | .array_type = VAR_LEN_ARRAY, | ||
190 | .tlv_type = DATA_REQ1_TLV_TYPE, | ||
191 | .offset = offsetof(struct test_data_req_msg_v01, | ||
192 | data), | ||
193 | }, | ||
194 | { | ||
195 | .data_type = QMI_OPT_FLAG, | ||
196 | .elem_len = 1, | ||
197 | .elem_size = sizeof(u8), | ||
198 | .array_type = NO_ARRAY, | ||
199 | .tlv_type = DATA_OPT1_TLV_TYPE, | ||
200 | .offset = offsetof(struct test_data_req_msg_v01, | ||
201 | client_name_valid), | ||
202 | }, | ||
203 | { | ||
204 | .data_type = QMI_STRUCT, | ||
205 | .elem_len = 1, | ||
206 | .elem_size = sizeof(struct test_name_type_v01), | ||
207 | .array_type = NO_ARRAY, | ||
208 | .tlv_type = DATA_OPT1_TLV_TYPE, | ||
209 | .offset = offsetof(struct test_data_req_msg_v01, | ||
210 | client_name), | ||
211 | .ei_array = test_name_type_v01_ei, | ||
212 | }, | ||
213 | {} | ||
214 | }; | ||
215 | |||
216 | struct test_data_resp_msg_v01 { | ||
217 | struct qmi_response_type_v01 resp; | ||
218 | |||
219 | u8 data_valid; | ||
220 | u32 data_len; | ||
221 | u8 data[TEST_MED_DATA_SIZE_V01]; | ||
222 | |||
223 | u8 service_name_valid; | ||
224 | struct test_name_type_v01 service_name; | ||
225 | }; | ||
226 | |||
227 | static struct qmi_elem_info test_data_resp_msg_v01_ei[] = { | ||
228 | { | ||
229 | .data_type = QMI_STRUCT, | ||
230 | .elem_len = 1, | ||
231 | .elem_size = sizeof(struct qmi_response_type_v01), | ||
232 | .array_type = NO_ARRAY, | ||
233 | .tlv_type = DATA_RESP1_TLV_TYPE, | ||
234 | .offset = offsetof(struct test_data_resp_msg_v01, | ||
235 | resp), | ||
236 | .ei_array = qmi_response_type_v01_ei, | ||
237 | }, | ||
238 | { | ||
239 | .data_type = QMI_OPT_FLAG, | ||
240 | .elem_len = 1, | ||
241 | .elem_size = sizeof(u8), | ||
242 | .array_type = NO_ARRAY, | ||
243 | .tlv_type = DATA_OPT1_TLV_TYPE, | ||
244 | .offset = offsetof(struct test_data_resp_msg_v01, | ||
245 | data_valid), | ||
246 | }, | ||
247 | { | ||
248 | .data_type = QMI_DATA_LEN, | ||
249 | .elem_len = 1, | ||
250 | .elem_size = sizeof(u32), | ||
251 | .array_type = NO_ARRAY, | ||
252 | .tlv_type = DATA_OPT1_TLV_TYPE, | ||
253 | .offset = offsetof(struct test_data_resp_msg_v01, | ||
254 | data_len), | ||
255 | }, | ||
256 | { | ||
257 | .data_type = QMI_UNSIGNED_1_BYTE, | ||
258 | .elem_len = TEST_MED_DATA_SIZE_V01, | ||
259 | .elem_size = sizeof(u8), | ||
260 | .array_type = VAR_LEN_ARRAY, | ||
261 | .tlv_type = DATA_OPT1_TLV_TYPE, | ||
262 | .offset = offsetof(struct test_data_resp_msg_v01, | ||
263 | data), | ||
264 | }, | ||
265 | { | ||
266 | .data_type = QMI_OPT_FLAG, | ||
267 | .elem_len = 1, | ||
268 | .elem_size = sizeof(u8), | ||
269 | .array_type = NO_ARRAY, | ||
270 | .tlv_type = DATA_OPT2_TLV_TYPE, | ||
271 | .offset = offsetof(struct test_data_resp_msg_v01, | ||
272 | service_name_valid), | ||
273 | }, | ||
274 | { | ||
275 | .data_type = QMI_STRUCT, | ||
276 | .elem_len = 1, | ||
277 | .elem_size = sizeof(struct test_name_type_v01), | ||
278 | .array_type = NO_ARRAY, | ||
279 | .tlv_type = DATA_OPT2_TLV_TYPE, | ||
280 | .offset = offsetof(struct test_data_resp_msg_v01, | ||
281 | service_name), | ||
282 | .ei_array = test_name_type_v01_ei, | ||
283 | }, | ||
284 | {} | ||
285 | }; | ||
286 | |||
287 | /* | ||
288 | * ping_write() - ping_pong debugfs file write handler | ||
289 | * @file: debugfs file context | ||
290 | * @user_buf: reference to the user data (ignored) | ||
291 | * @count: number of bytes in @user_buf | ||
292 | * @ppos: offset in @file to write | ||
293 | * | ||
294 | * This function allows user space to send out a ping_pong QMI encoded message | ||
295 | * to the associated remote test service and will return with the result of the | ||
296 | * transaction. It serves as an example of how to provide a custom response | ||
297 | * handler. | ||
298 | * | ||
299 | * Return: @count, or negative errno on failure. | ||
300 | */ | ||
301 | static ssize_t ping_write(struct file *file, const char __user *user_buf, | ||
302 | size_t count, loff_t *ppos) | ||
303 | { | ||
304 | struct qmi_handle *qmi = file->private_data; | ||
305 | struct test_ping_req_msg_v01 req = {}; | ||
306 | struct qmi_txn txn; | ||
307 | int ret; | ||
308 | |||
309 | memcpy(req.ping, "ping", sizeof(req.ping)); | ||
310 | |||
311 | ret = qmi_txn_init(qmi, &txn, NULL, NULL); | ||
312 | if (ret < 0) | ||
313 | return ret; | ||
314 | |||
315 | ret = qmi_send_request(qmi, NULL, &txn, | ||
316 | TEST_PING_REQ_MSG_ID_V01, | ||
317 | TEST_PING_REQ_MAX_MSG_LEN_V01, | ||
318 | test_ping_req_msg_v01_ei, &req); | ||
319 | if (ret < 0) { | ||
320 | qmi_txn_cancel(&txn); | ||
321 | return ret; | ||
322 | } | ||
323 | |||
324 | ret = qmi_txn_wait(&txn, 5 * HZ); | ||
325 | if (ret < 0) | ||
326 | count = ret; | ||
327 | |||
328 | return count; | ||
329 | } | ||
330 | |||
331 | static const struct file_operations ping_fops = { | ||
332 | .open = simple_open, | ||
333 | .write = ping_write, | ||
334 | }; | ||
335 | |||
336 | static void ping_pong_cb(struct qmi_handle *qmi, struct sockaddr_qrtr *sq, | ||
337 | struct qmi_txn *txn, const void *data) | ||
338 | { | ||
339 | const struct test_ping_resp_msg_v01 *resp = data; | ||
340 | |||
341 | if (!txn) { | ||
342 | pr_err("spurious ping response\n"); | ||
343 | return; | ||
344 | } | ||
345 | |||
346 | if (resp->resp.result == QMI_RESULT_FAILURE_V01) | ||
347 | txn->result = -ENXIO; | ||
348 | else if (!resp->pong_valid || memcmp(resp->pong, "pong", 4)) | ||
349 | txn->result = -EINVAL; | ||
350 | |||
351 | complete(&txn->completion); | ||
352 | } | ||
353 | |||
354 | /* | ||
355 | * data_write() - data debugfs file write handler | ||
356 | * @file: debugfs file context | ||
357 | * @user_buf: reference to the user data | ||
358 | * @count: number of bytes in @user_buf | ||
359 | * @ppos: offset in @file to write | ||
360 | * | ||
361 | * This function allows user space to send out a data QMI encoded message to | ||
362 | * the associated remote test service and will return with the result of the | ||
363 | * transaction. It serves as an example of how to have the QMI helpers decode a | ||
364 | * transaction response into a provided object automatically. | ||
365 | * | ||
366 | * Return: @count, or negative errno on failure. | ||
367 | */ | ||
368 | static ssize_t data_write(struct file *file, const char __user *user_buf, | ||
369 | size_t count, loff_t *ppos) | ||
370 | |||
371 | { | ||
372 | struct qmi_handle *qmi = file->private_data; | ||
373 | struct test_data_resp_msg_v01 *resp; | ||
374 | struct test_data_req_msg_v01 *req; | ||
375 | struct qmi_txn txn; | ||
376 | int ret; | ||
377 | |||
378 | req = kzalloc(sizeof(*req), GFP_KERNEL); | ||
379 | if (!req) | ||
380 | return -ENOMEM; | ||
381 | |||
382 | resp = kzalloc(sizeof(*resp), GFP_KERNEL); | ||
383 | if (!resp) { | ||
384 | kfree(req); | ||
385 | return -ENOMEM; | ||
386 | } | ||
387 | |||
388 | req->data_len = min_t(size_t, sizeof(req->data), count); | ||
389 | if (copy_from_user(req->data, user_buf, req->data_len)) { | ||
390 | ret = -EFAULT; | ||
391 | goto out; | ||
392 | } | ||
393 | |||
394 | ret = qmi_txn_init(qmi, &txn, test_data_resp_msg_v01_ei, resp); | ||
395 | if (ret < 0) | ||
396 | goto out; | ||
397 | |||
398 | ret = qmi_send_request(qmi, NULL, &txn, | ||
399 | TEST_DATA_REQ_MSG_ID_V01, | ||
400 | TEST_DATA_REQ_MAX_MSG_LEN_V01, | ||
401 | test_data_req_msg_v01_ei, req); | ||
402 | if (ret < 0) { | ||
403 | qmi_txn_cancel(&txn); | ||
404 | goto out; | ||
405 | } | ||
406 | |||
407 | ret = qmi_txn_wait(&txn, 5 * HZ); | ||
408 | if (ret < 0) { | ||
409 | goto out; | ||
410 | } else if (!resp->data_valid || | ||
411 | resp->data_len != req->data_len || | ||
412 | memcmp(resp->data, req->data, req->data_len)) { | ||
413 | pr_err("response data doesn't match expectation\n"); | ||
414 | ret = -EINVAL; | ||
415 | goto out; | ||
416 | } | ||
417 | |||
418 | ret = count; | ||
419 | |||
420 | out: | ||
421 | kfree(resp); | ||
422 | kfree(req); | ||
423 | |||
424 | return ret; | ||
425 | } | ||
426 | |||
427 | static const struct file_operations data_fops = { | ||
428 | .open = simple_open, | ||
429 | .write = data_write, | ||
430 | }; | ||
431 | |||
432 | static struct qmi_msg_handler qmi_sample_handlers[] = { | ||
433 | { | ||
434 | .type = QMI_RESPONSE, | ||
435 | .msg_id = TEST_PING_REQ_MSG_ID_V01, | ||
436 | .ei = test_ping_resp_msg_v01_ei, | ||
437 | .decoded_size = sizeof(struct test_ping_resp_msg_v01), | ||
438 | .fn = ping_pong_cb | ||
439 | }, | ||
440 | {} | ||
441 | }; | ||
442 | |||
443 | struct qmi_sample { | ||
444 | struct qmi_handle qmi; | ||
445 | |||
446 | struct dentry *de_dir; | ||
447 | struct dentry *de_data; | ||
448 | struct dentry *de_ping; | ||
449 | }; | ||
450 | |||
451 | static struct dentry *qmi_debug_dir; | ||
452 | |||
453 | static int qmi_sample_probe(struct platform_device *pdev) | ||
454 | { | ||
455 | struct sockaddr_qrtr *sq; | ||
456 | struct qmi_sample *sample; | ||
457 | char path[20]; | ||
458 | int ret; | ||
459 | |||
460 | sample = devm_kzalloc(&pdev->dev, sizeof(*sample), GFP_KERNEL); | ||
461 | if (!sample) | ||
462 | return -ENOMEM; | ||
463 | |||
464 | ret = qmi_handle_init(&sample->qmi, TEST_DATA_REQ_MAX_MSG_LEN_V01, | ||
465 | NULL, | ||
466 | qmi_sample_handlers); | ||
467 | if (ret < 0) | ||
468 | return ret; | ||
469 | |||
470 | sq = dev_get_platdata(&pdev->dev); | ||
471 | ret = kernel_connect(sample->qmi.sock, (struct sockaddr *)sq, | ||
472 | sizeof(*sq), 0); | ||
473 | if (ret < 0) { | ||
474 | pr_err("failed to connect to remote service port\n"); | ||
475 | goto err_release_qmi_handle; | ||
476 | } | ||
477 | |||
478 | snprintf(path, sizeof(path), "%d:%d", sq->sq_node, sq->sq_port); | ||
479 | |||
480 | sample->de_dir = debugfs_create_dir(path, qmi_debug_dir); | ||
481 | if (IS_ERR(sample->de_dir)) { | ||
482 | ret = PTR_ERR(sample->de_dir); | ||
483 | goto err_release_qmi_handle; | ||
484 | } | ||
485 | |||
486 | sample->de_data = debugfs_create_file("data", 0600, sample->de_dir, | ||
487 | sample, &data_fops); | ||
488 | if (IS_ERR(sample->de_data)) { | ||
489 | ret = PTR_ERR(sample->de_data); | ||
490 | goto err_remove_de_dir; | ||
491 | } | ||
492 | |||
493 | sample->de_ping = debugfs_create_file("ping", 0600, sample->de_dir, | ||
494 | sample, &ping_fops); | ||
495 | if (IS_ERR(sample->de_ping)) { | ||
496 | ret = PTR_ERR(sample->de_ping); | ||
497 | goto err_remove_de_data; | ||
498 | } | ||
499 | |||
500 | platform_set_drvdata(pdev, sample); | ||
501 | |||
502 | return 0; | ||
503 | |||
504 | err_remove_de_data: | ||
505 | debugfs_remove(sample->de_data); | ||
506 | err_remove_de_dir: | ||
507 | debugfs_remove(sample->de_dir); | ||
508 | err_release_qmi_handle: | ||
509 | qmi_handle_release(&sample->qmi); | ||
510 | |||
511 | return ret; | ||
512 | } | ||
513 | |||
514 | static int qmi_sample_remove(struct platform_device *pdev) | ||
515 | { | ||
516 | struct qmi_sample *sample = platform_get_drvdata(pdev); | ||
517 | |||
518 | debugfs_remove(sample->de_ping); | ||
519 | debugfs_remove(sample->de_data); | ||
520 | debugfs_remove(sample->de_dir); | ||
521 | |||
522 | qmi_handle_release(&sample->qmi); | ||
523 | |||
524 | return 0; | ||
525 | } | ||
526 | |||
527 | static struct platform_driver qmi_sample_driver = { | ||
528 | .probe = qmi_sample_probe, | ||
529 | .remove = qmi_sample_remove, | ||
530 | .driver = { | ||
531 | .name = "qmi_sample_client", | ||
532 | }, | ||
533 | }; | ||
534 | |||
535 | static int qmi_sample_new_server(struct qmi_handle *qmi, | ||
536 | struct qmi_service *service) | ||
537 | { | ||
538 | struct platform_device *pdev; | ||
539 | struct sockaddr_qrtr sq = { AF_QIPCRTR, service->node, service->port }; | ||
540 | int ret; | ||
541 | |||
542 | pdev = platform_device_alloc("qmi_sample_client", PLATFORM_DEVID_AUTO); | ||
543 | if (!pdev) | ||
544 | return -ENOMEM; | ||
545 | |||
546 | ret = platform_device_add_data(pdev, &sq, sizeof(sq)); | ||
547 | if (ret) | ||
548 | goto err_put_device; | ||
549 | |||
550 | ret = platform_device_add(pdev); | ||
551 | if (ret) | ||
552 | goto err_put_device; | ||
553 | |||
554 | service->priv = pdev; | ||
555 | |||
556 | return 0; | ||
557 | |||
558 | err_put_device: | ||
559 | platform_device_put(pdev); | ||
560 | |||
561 | return ret; | ||
562 | } | ||
563 | |||
564 | static void qmi_sample_del_server(struct qmi_handle *qmi, | ||
565 | struct qmi_service *service) | ||
566 | { | ||
567 | struct platform_device *pdev = service->priv; | ||
568 | |||
569 | platform_device_unregister(pdev); | ||
570 | } | ||
571 | |||
572 | static struct qmi_handle lookup_client; | ||
573 | |||
574 | static struct qmi_ops lookup_ops = { | ||
575 | .new_server = qmi_sample_new_server, | ||
576 | .del_server = qmi_sample_del_server, | ||
577 | }; | ||
578 | |||
579 | static int qmi_sample_init(void) | ||
580 | { | ||
581 | int ret; | ||
582 | |||
583 | qmi_debug_dir = debugfs_create_dir("qmi_sample", NULL); | ||
584 | if (IS_ERR(qmi_debug_dir)) { | ||
585 | pr_err("failed to create qmi_sample dir\n"); | ||
586 | return PTR_ERR(qmi_debug_dir); | ||
587 | } | ||
588 | |||
589 | ret = platform_driver_register(&qmi_sample_driver); | ||
590 | if (ret) | ||
591 | goto err_remove_debug_dir; | ||
592 | |||
593 | ret = qmi_handle_init(&lookup_client, 0, &lookup_ops, NULL); | ||
594 | if (ret < 0) | ||
595 | goto err_unregister_driver; | ||
596 | |||
597 | qmi_add_lookup(&lookup_client, 15, 0, 0); | ||
598 | |||
599 | return 0; | ||
600 | |||
601 | err_unregister_driver: | ||
602 | platform_driver_unregister(&qmi_sample_driver); | ||
603 | err_remove_debug_dir: | ||
604 | debugfs_remove(qmi_debug_dir); | ||
605 | |||
606 | return ret; | ||
607 | } | ||
608 | |||
609 | static void qmi_sample_exit(void) | ||
610 | { | ||
611 | qmi_handle_release(&lookup_client); | ||
612 | |||
613 | platform_driver_unregister(&qmi_sample_driver); | ||
614 | |||
615 | debugfs_remove(qmi_debug_dir); | ||
616 | } | ||
617 | |||
618 | module_init(qmi_sample_init); | ||
619 | module_exit(qmi_sample_exit); | ||
620 | |||
621 | MODULE_DESCRIPTION("Sample QMI client driver"); | ||
622 | MODULE_LICENSE("GPL v2"); | ||
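Once the lookup client matches a remote test service (service id 15, per the qmi_add_lookup() call in qmi_sample_init()), qmi_sample_probe() exposes per-connection "ping" and "data" files under the qmi_sample debugfs directory. The userspace sketch below is not part of the sample: it assumes debugfs is mounted at /sys/kernel/debug and uses a placeholder 0:1 node:port directory name; any write to the "ping" file triggers ping_write(), and the write() result reflects the outcome of the QMI transaction.

```c
/*
 * Illustrative only (not part of the sample): poke the "ping" debugfs file
 * created by qmi_sample_probe(). Run as root; replace "0:1" with whatever
 * <node>:<port> directory actually shows up under qmi_sample/.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/qmi_sample/0:1/ping";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* The contents are ignored; any write kicks off a ping transaction. */
	if (write(fd, "1", 1) < 0)
		perror("ping transaction failed");
	close(fd);
	return 0;
}
```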
diff --git a/samples/rpmsg/Makefile b/samples/rpmsg/Makefile new file mode 100644 index 000000000..ddf9a5d13 --- /dev/null +++ b/samples/rpmsg/Makefile | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | obj-$(CONFIG_SAMPLE_RPMSG_CLIENT) += rpmsg_client_sample.o | ||
diff --git a/samples/rpmsg/rpmsg_client_sample.c b/samples/rpmsg/rpmsg_client_sample.c new file mode 100644 index 000000000..ae5081662 --- /dev/null +++ b/samples/rpmsg/rpmsg_client_sample.c | |||
@@ -0,0 +1,96 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Remote processor messaging - sample client driver | ||
4 | * | ||
5 | * Copyright (C) 2011 Texas Instruments, Inc. | ||
6 | * Copyright (C) 2011 Google, Inc. | ||
7 | * | ||
8 | * Ohad Ben-Cohen <ohad@wizery.com> | ||
9 | * Brian Swetland <swetland@google.com> | ||
10 | */ | ||
11 | |||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/rpmsg.h> | ||
15 | |||
16 | #define MSG "hello world!" | ||
17 | |||
18 | static int count = 100; | ||
19 | module_param(count, int, 0644); | ||
20 | |||
21 | struct instance_data { | ||
22 | int rx_count; | ||
23 | }; | ||
24 | |||
25 | static int rpmsg_sample_cb(struct rpmsg_device *rpdev, void *data, int len, | ||
26 | void *priv, u32 src) | ||
27 | { | ||
28 | int ret; | ||
29 | struct instance_data *idata = dev_get_drvdata(&rpdev->dev); | ||
30 | |||
31 | dev_info(&rpdev->dev, "incoming msg %d (src: 0x%x)\n", | ||
32 | ++idata->rx_count, src); | ||
33 | |||
34 | print_hex_dump_debug(__func__, DUMP_PREFIX_NONE, 16, 1, data, len, | ||
35 | true); | ||
36 | |||
37 | /* samples should not live forever */ | ||
38 | if (idata->rx_count >= count) { | ||
39 | dev_info(&rpdev->dev, "goodbye!\n"); | ||
40 | return 0; | ||
41 | } | ||
42 | |||
43 | /* send a new message now */ | ||
44 | ret = rpmsg_send(rpdev->ept, MSG, strlen(MSG)); | ||
45 | if (ret) | ||
46 | dev_err(&rpdev->dev, "rpmsg_send failed: %d\n", ret); | ||
47 | |||
48 | return 0; | ||
49 | } | ||
50 | |||
51 | static int rpmsg_sample_probe(struct rpmsg_device *rpdev) | ||
52 | { | ||
53 | int ret; | ||
54 | struct instance_data *idata; | ||
55 | |||
56 | dev_info(&rpdev->dev, "new channel: 0x%x -> 0x%x!\n", | ||
57 | rpdev->src, rpdev->dst); | ||
58 | |||
59 | idata = devm_kzalloc(&rpdev->dev, sizeof(*idata), GFP_KERNEL); | ||
60 | if (!idata) | ||
61 | return -ENOMEM; | ||
62 | |||
63 | dev_set_drvdata(&rpdev->dev, idata); | ||
64 | |||
65 | /* send a message to our remote processor */ | ||
66 | ret = rpmsg_send(rpdev->ept, MSG, strlen(MSG)); | ||
67 | if (ret) { | ||
68 | dev_err(&rpdev->dev, "rpmsg_send failed: %d\n", ret); | ||
69 | return ret; | ||
70 | } | ||
71 | |||
72 | return 0; | ||
73 | } | ||
74 | |||
75 | static void rpmsg_sample_remove(struct rpmsg_device *rpdev) | ||
76 | { | ||
77 | dev_info(&rpdev->dev, "rpmsg sample client driver is removed\n"); | ||
78 | } | ||
79 | |||
80 | static struct rpmsg_device_id rpmsg_driver_sample_id_table[] = { | ||
81 | { .name = "rpmsg-client-sample" }, | ||
82 | { }, | ||
83 | }; | ||
84 | MODULE_DEVICE_TABLE(rpmsg, rpmsg_driver_sample_id_table); | ||
85 | |||
86 | static struct rpmsg_driver rpmsg_sample_client = { | ||
87 | .drv.name = KBUILD_MODNAME, | ||
88 | .id_table = rpmsg_driver_sample_id_table, | ||
89 | .probe = rpmsg_sample_probe, | ||
90 | .callback = rpmsg_sample_cb, | ||
91 | .remove = rpmsg_sample_remove, | ||
92 | }; | ||
93 | module_rpmsg_driver(rpmsg_sample_client); | ||
94 | |||
95 | MODULE_DESCRIPTION("Remote processor messaging sample client driver"); | ||
96 | MODULE_LICENSE("GPL v2"); | ||
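The callback above always replies on the channel's default endpoint via rpmsg_send(). When several remote endpoints share a channel, a variant can instead answer the specific sender by passing the received src address to rpmsg_sendto(). A minimal sketch under that assumption, reusing the sample's MSG, count and struct instance_data definitions (the function name is hypothetical):

```c
/* Hypothetical variant of rpmsg_sample_cb(): reply to the sender's own
 * address rather than the channel's default destination. Relies on the
 * MSG, count and struct instance_data definitions from the sample above. */
static int rpmsg_sample_cb_reply_to_src(struct rpmsg_device *rpdev, void *data,
					int len, void *priv, u32 src)
{
	struct instance_data *idata = dev_get_drvdata(&rpdev->dev);
	int ret;

	/* stop after 'count' messages, as in the original callback */
	if (++idata->rx_count >= count)
		return 0;

	ret = rpmsg_sendto(rpdev->ept, MSG, strlen(MSG), src);
	if (ret)
		dev_err(&rpdev->dev, "rpmsg_sendto failed: %d\n", ret);

	return ret;
}
```

Such a variant would simply be wired into .callback in place of rpmsg_sample_cb.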
diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore new file mode 100644 index 000000000..4a5a5b7db --- /dev/null +++ b/samples/seccomp/.gitignore | |||
@@ -0,0 +1,5 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | bpf-direct | ||
3 | bpf-fancy | ||
4 | dropper | ||
5 | user-trap | ||
diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile new file mode 100644 index 000000000..c85ae0ed8 --- /dev/null +++ b/samples/seccomp/Makefile | |||
@@ -0,0 +1,6 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | ||
2 | userprogs-always-y += bpf-fancy dropper bpf-direct user-trap | ||
3 | |||
4 | bpf-fancy-objs := bpf-fancy.o bpf-helper.o | ||
5 | |||
6 | userccflags += -I usr/include | ||
diff --git a/samples/seccomp/bpf-direct.c b/samples/seccomp/bpf-direct.c new file mode 100644 index 000000000..c09e4a17a --- /dev/null +++ b/samples/seccomp/bpf-direct.c | |||
@@ -0,0 +1,191 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Seccomp filter example for x86 (32-bit and 64-bit) with BPF macros | ||
4 | * | ||
5 | * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org> | ||
6 | * Author: Will Drewry <wad@chromium.org> | ||
7 | * | ||
8 | * The code may be used by anyone for any purpose, | ||
9 | * and can serve as a starting point for developing | ||
10 | * applications using prctl(PR_SET_SECCOMP, 2, ...). | ||
11 | */ | ||
12 | #if defined(__i386__) || defined(__x86_64__) | ||
13 | #define SUPPORTED_ARCH 1 | ||
14 | #endif | ||
15 | |||
16 | #if defined(SUPPORTED_ARCH) | ||
17 | #define __USE_GNU 1 | ||
18 | #define _GNU_SOURCE 1 | ||
19 | |||
20 | #include <linux/types.h> | ||
21 | #include <linux/filter.h> | ||
22 | #include <linux/seccomp.h> | ||
23 | #include <linux/unistd.h> | ||
24 | #include <signal.h> | ||
25 | #include <stdio.h> | ||
26 | #include <stddef.h> | ||
27 | #include <string.h> | ||
28 | #include <sys/prctl.h> | ||
29 | #include <unistd.h> | ||
30 | |||
31 | #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n])) | ||
32 | #define syscall_nr (offsetof(struct seccomp_data, nr)) | ||
33 | |||
34 | #if defined(__i386__) | ||
35 | #define REG_RESULT REG_EAX | ||
36 | #define REG_SYSCALL REG_EAX | ||
37 | #define REG_ARG0 REG_EBX | ||
38 | #define REG_ARG1 REG_ECX | ||
39 | #define REG_ARG2 REG_EDX | ||
40 | #define REG_ARG3 REG_ESI | ||
41 | #define REG_ARG4 REG_EDI | ||
42 | #define REG_ARG5 REG_EBP | ||
43 | #elif defined(__x86_64__) | ||
44 | #define REG_RESULT REG_RAX | ||
45 | #define REG_SYSCALL REG_RAX | ||
46 | #define REG_ARG0 REG_RDI | ||
47 | #define REG_ARG1 REG_RSI | ||
48 | #define REG_ARG2 REG_RDX | ||
49 | #define REG_ARG3 REG_R10 | ||
50 | #define REG_ARG4 REG_R8 | ||
51 | #define REG_ARG5 REG_R9 | ||
52 | #endif | ||
53 | |||
54 | #ifndef PR_SET_NO_NEW_PRIVS | ||
55 | #define PR_SET_NO_NEW_PRIVS 38 | ||
56 | #endif | ||
57 | |||
58 | #ifndef SYS_SECCOMP | ||
59 | #define SYS_SECCOMP 1 | ||
60 | #endif | ||
61 | |||
62 | static void emulator(int nr, siginfo_t *info, void *void_context) | ||
63 | { | ||
64 | ucontext_t *ctx = (ucontext_t *)(void_context); | ||
65 | int syscall; | ||
66 | char *buf; | ||
67 | ssize_t bytes; | ||
68 | size_t len; | ||
69 | if (info->si_code != SYS_SECCOMP) | ||
70 | return; | ||
71 | if (!ctx) | ||
72 | return; | ||
73 | syscall = ctx->uc_mcontext.gregs[REG_SYSCALL]; | ||
74 | buf = (char *) ctx->uc_mcontext.gregs[REG_ARG1]; | ||
75 | len = (size_t) ctx->uc_mcontext.gregs[REG_ARG2]; | ||
76 | |||
77 | if (syscall != __NR_write) | ||
78 | return; | ||
79 | if (ctx->uc_mcontext.gregs[REG_ARG0] != STDERR_FILENO) | ||
80 | return; | ||
81 | /* Redirect stderr messages to stdout. Doesn't handle EINTR, etc */ | ||
82 | ctx->uc_mcontext.gregs[REG_RESULT] = -1; | ||
83 | if (write(STDOUT_FILENO, "[ERR] ", 6) > 0) { | ||
84 | bytes = write(STDOUT_FILENO, buf, len); | ||
85 | ctx->uc_mcontext.gregs[REG_RESULT] = bytes; | ||
86 | } | ||
87 | return; | ||
88 | } | ||
89 | |||
90 | static int install_emulator(void) | ||
91 | { | ||
92 | struct sigaction act; | ||
93 | sigset_t mask; | ||
94 | memset(&act, 0, sizeof(act)); | ||
95 | sigemptyset(&mask); | ||
96 | sigaddset(&mask, SIGSYS); | ||
97 | |||
98 | act.sa_sigaction = &emulator; | ||
99 | act.sa_flags = SA_SIGINFO; | ||
100 | if (sigaction(SIGSYS, &act, NULL) < 0) { | ||
101 | perror("sigaction"); | ||
102 | return -1; | ||
103 | } | ||
104 | if (sigprocmask(SIG_UNBLOCK, &mask, NULL)) { | ||
105 | perror("sigprocmask"); | ||
106 | return -1; | ||
107 | } | ||
108 | return 0; | ||
109 | } | ||
110 | |||
111 | static int install_filter(void) | ||
112 | { | ||
113 | struct sock_filter filter[] = { | ||
114 | /* Grab the system call number */ | ||
115 | BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_nr), | ||
116 | /* Jump table for the allowed syscalls */ | ||
117 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_rt_sigreturn, 0, 1), | ||
118 | BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), | ||
119 | #ifdef __NR_sigreturn | ||
120 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_sigreturn, 0, 1), | ||
121 | BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), | ||
122 | #endif | ||
123 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit_group, 0, 1), | ||
124 | BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), | ||
125 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit, 0, 1), | ||
126 | BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), | ||
127 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_read, 1, 0), | ||
128 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_write, 3, 2), | ||
129 | |||
130 | /* Check that read is only using stdin. */ | ||
131 | BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)), | ||
132 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 4, 0), | ||
133 | BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), | ||
134 | |||
135 | /* Check that write is only using stdout */ | ||
136 | BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)), | ||
137 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDOUT_FILENO, 1, 0), | ||
138 | /* Trap attempts to write to stderr */ | ||
139 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDERR_FILENO, 1, 2), | ||
140 | |||
141 | BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), | ||
142 | BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRAP), | ||
143 | BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), | ||
144 | }; | ||
145 | struct sock_fprog prog = { | ||
146 | .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), | ||
147 | .filter = filter, | ||
148 | }; | ||
149 | |||
150 | if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { | ||
151 | perror("prctl(NO_NEW_PRIVS)"); | ||
152 | return 1; | ||
153 | } | ||
154 | |||
155 | |||
156 | if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { | ||
157 | perror("prctl"); | ||
158 | return 1; | ||
159 | } | ||
160 | return 0; | ||
161 | } | ||
162 | |||
163 | #define payload(_c) (_c), sizeof((_c)) | ||
164 | int main(int argc, char **argv) | ||
165 | { | ||
166 | char buf[4096]; | ||
167 | ssize_t bytes = 0; | ||
168 | if (install_emulator()) | ||
169 | return 1; | ||
170 | if (install_filter()) | ||
171 | return 1; | ||
172 | syscall(__NR_write, STDOUT_FILENO, | ||
173 | payload("OHAI! WHAT IS YOUR NAME? ")); | ||
174 | bytes = syscall(__NR_read, STDIN_FILENO, buf, sizeof(buf)); | ||
175 | syscall(__NR_write, STDOUT_FILENO, payload("HELLO, ")); | ||
176 | syscall(__NR_write, STDOUT_FILENO, buf, bytes); | ||
177 | syscall(__NR_write, STDERR_FILENO, | ||
178 | payload("Error message going to STDERR\n")); | ||
179 | return 0; | ||
180 | } | ||
181 | #else /* SUPPORTED_ARCH */ | ||
182 | /* | ||
183 | * This sample is x86-only. Since kernel samples are compiled with the | ||
184 | * host toolchain, a non-x86 host will result in using only the main() | ||
185 | * below. | ||
186 | */ | ||
187 | int main(void) | ||
188 | { | ||
189 | return 1; | ||
190 | } | ||
191 | #endif /* SUPPORTED_ARCH */ | ||
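install_filter() keys purely off the syscall number; the sample dodges cross-ABI ambiguity by refusing to build on non-x86 hosts at all. A filter meant to run on a multi-ABI kernel would normally check seccomp_data->arch before trusting the number, as dropper.c further down does at runtime (dropper simply falls through to allow on an ABI mismatch; the sketch below is stricter and kills instead). The only identifier used here beyond what the samples already include is AUDIT_ARCH_X86_64 from <linux/audit.h>.

```c
#include <linux/audit.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <stddef.h>

/*
 * Sketch of an ABI guard: kill the task unless the syscall arrived via the
 * x86-64 ABI, then load the number so a jump table like the one in
 * install_filter() can follow.
 */
static struct sock_filter arch_guard[] = {
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
		 offsetof(struct seccomp_data, arch)),
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, AUDIT_ARCH_X86_64, 1, 0),
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
		 offsetof(struct seccomp_data, nr)),
	/* placeholder tail: allow everything that passed the ABI check */
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
};
```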
diff --git a/samples/seccomp/bpf-fancy.c b/samples/seccomp/bpf-fancy.c new file mode 100644 index 000000000..1ccb43502 --- /dev/null +++ b/samples/seccomp/bpf-fancy.c | |||
@@ -0,0 +1,105 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Seccomp BPF example using a macro-based generator. | ||
4 | * | ||
5 | * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org> | ||
6 | * Author: Will Drewry <wad@chromium.org> | ||
7 | * | ||
8 | * The code may be used by anyone for any purpose, | ||
9 | * and can serve as a starting point for developing | ||
10 | * applications using prctl(PR_ATTACH_SECCOMP_FILTER). | ||
11 | */ | ||
12 | |||
13 | #include <linux/filter.h> | ||
14 | #include <linux/seccomp.h> | ||
15 | #include <linux/unistd.h> | ||
16 | #include <stdio.h> | ||
17 | #include <string.h> | ||
18 | #include <sys/prctl.h> | ||
19 | #include <unistd.h> | ||
20 | |||
21 | #include "bpf-helper.h" | ||
22 | |||
23 | #ifndef PR_SET_NO_NEW_PRIVS | ||
24 | #define PR_SET_NO_NEW_PRIVS 38 | ||
25 | #endif | ||
26 | |||
27 | int main(int argc, char **argv) | ||
28 | { | ||
29 | struct bpf_labels l = { | ||
30 | .count = 0, | ||
31 | }; | ||
32 | static const char msg1[] = "Please type something: "; | ||
33 | static const char msg2[] = "You typed: "; | ||
34 | char buf[256]; | ||
35 | struct sock_filter filter[] = { | ||
36 | /* TODO: LOAD_SYSCALL_NR(arch) and enforce an arch */ | ||
37 | LOAD_SYSCALL_NR, | ||
38 | SYSCALL(__NR_exit, ALLOW), | ||
39 | SYSCALL(__NR_exit_group, ALLOW), | ||
40 | SYSCALL(__NR_write, JUMP(&l, write_fd)), | ||
41 | SYSCALL(__NR_read, JUMP(&l, read)), | ||
42 | DENY, /* Don't pass through into a label */ | ||
43 | |||
44 | LABEL(&l, read), | ||
45 | ARG(0), | ||
46 | JNE(STDIN_FILENO, DENY), | ||
47 | ARG(1), | ||
48 | JNE((unsigned long)buf, DENY), | ||
49 | ARG(2), | ||
50 | JGE(sizeof(buf), DENY), | ||
51 | ALLOW, | ||
52 | |||
53 | LABEL(&l, write_fd), | ||
54 | ARG(0), | ||
55 | JEQ(STDOUT_FILENO, JUMP(&l, write_buf)), | ||
56 | JEQ(STDERR_FILENO, JUMP(&l, write_buf)), | ||
57 | DENY, | ||
58 | |||
59 | LABEL(&l, write_buf), | ||
60 | ARG(1), | ||
61 | JEQ((unsigned long)msg1, JUMP(&l, msg1_len)), | ||
62 | JEQ((unsigned long)msg2, JUMP(&l, msg2_len)), | ||
63 | JEQ((unsigned long)buf, JUMP(&l, buf_len)), | ||
64 | DENY, | ||
65 | |||
66 | LABEL(&l, msg1_len), | ||
67 | ARG(2), | ||
68 | JLT(sizeof(msg1), ALLOW), | ||
69 | DENY, | ||
70 | |||
71 | LABEL(&l, msg2_len), | ||
72 | ARG(2), | ||
73 | JLT(sizeof(msg2), ALLOW), | ||
74 | DENY, | ||
75 | |||
76 | LABEL(&l, buf_len), | ||
77 | ARG(2), | ||
78 | JLT(sizeof(buf), ALLOW), | ||
79 | DENY, | ||
80 | }; | ||
81 | struct sock_fprog prog = { | ||
82 | .filter = filter, | ||
83 | .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), | ||
84 | }; | ||
85 | ssize_t bytes; | ||
86 | bpf_resolve_jumps(&l, filter, sizeof(filter)/sizeof(*filter)); | ||
87 | |||
88 | if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { | ||
89 | perror("prctl(NO_NEW_PRIVS)"); | ||
90 | return 1; | ||
91 | } | ||
92 | |||
93 | if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { | ||
94 | perror("prctl(SECCOMP)"); | ||
95 | return 1; | ||
96 | } | ||
97 | syscall(__NR_write, STDOUT_FILENO, msg1, strlen(msg1)); | ||
98 | bytes = syscall(__NR_read, STDIN_FILENO, buf, sizeof(buf)-1); | ||
99 | bytes = (bytes > 0 ? bytes : 0); | ||
100 | syscall(__NR_write, STDERR_FILENO, msg2, strlen(msg2)); | ||
101 | syscall(__NR_write, STDERR_FILENO, buf, bytes); | ||
102 | /* Now get killed */ | ||
103 | syscall(__NR_write, STDERR_FILENO, msg2, strlen(msg2)+2); | ||
104 | return 0; | ||
105 | } | ||
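Because JUMP() and LABEL() only become concrete jump offsets once bpf_resolve_jumps() has run, it is often useful to dump the finished program before installing it; the helper library's seccomp_bpf_print() (see bpf-helper.c below) does exactly that. A standalone sketch, built like bpf-fancy against bpf-helper.o:

```c
/*
 * Minimal sketch: resolve a tiny two-label filter and print it instead of
 * installing it. After bpf_resolve_jumps(), the JUMP at index 4 becomes a
 * plain BPF_JA with k == 1 (skip the DENY) and the LABEL at index 6 becomes
 * a no-op BPF_JA with k == 0.
 */
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <linux/unistd.h>

#include "bpf-helper.h"

int main(void)
{
	struct bpf_labels l = { .count = 0 };
	struct sock_filter filter[] = {
		LOAD_SYSCALL_NR,			/* [0] */
		SYSCALL(__NR_exit_group, ALLOW),	/* [1], [2] */
		SYSCALL(__NR_write, JUMP(&l, write_ok)),/* [3], [4] */
		DENY,					/* [5] */
		LABEL(&l, write_ok),			/* [6] */
		ALLOW,					/* [7] */
	};

	bpf_resolve_jumps(&l, filter, sizeof(filter) / sizeof(*filter));
	seccomp_bpf_print(filter, sizeof(filter) / sizeof(*filter));
	return 0;
}
```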
diff --git a/samples/seccomp/bpf-helper.c b/samples/seccomp/bpf-helper.c new file mode 100644 index 000000000..ae260d77a --- /dev/null +++ b/samples/seccomp/bpf-helper.c | |||
@@ -0,0 +1,96 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Seccomp BPF helper functions | ||
4 | * | ||
5 | * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org> | ||
6 | * Author: Will Drewry <wad@chromium.org> | ||
7 | * | ||
8 | * The code may be used by anyone for any purpose, | ||
9 | * and can serve as a starting point for developing | ||
10 | * applications using prctl(PR_ATTACH_SECCOMP_FILTER). | ||
11 | */ | ||
12 | |||
13 | #include <stdio.h> | ||
14 | #include <stdlib.h> | ||
15 | #include <string.h> | ||
16 | |||
17 | #include "bpf-helper.h" | ||
18 | |||
19 | int bpf_resolve_jumps(struct bpf_labels *labels, | ||
20 | struct sock_filter *filter, size_t count) | ||
21 | { | ||
22 | size_t i; | ||
23 | |||
24 | if (count < 1 || count > BPF_MAXINSNS) | ||
25 | return -1; | ||
26 | /* | ||
27 | * Walk it once, backwards, to build the label table and do fixups. | ||
28 | * Since backward jumps are disallowed by BPF, this is easy. | ||
29 | */ | ||
30 | for (i = 0; i < count; ++i) { | ||
31 | size_t offset = count - i - 1; | ||
32 | struct sock_filter *instr = &filter[offset]; | ||
33 | if (instr->code != (BPF_JMP+BPF_JA)) | ||
34 | continue; | ||
35 | switch ((instr->jt<<8)|instr->jf) { | ||
36 | case (JUMP_JT<<8)|JUMP_JF: | ||
37 | if (labels->labels[instr->k].location == 0xffffffff) { | ||
38 | fprintf(stderr, "Unresolved label: '%s'\n", | ||
39 | labels->labels[instr->k].label); | ||
40 | return 1; | ||
41 | } | ||
42 | instr->k = labels->labels[instr->k].location - | ||
43 | (offset + 1); | ||
44 | instr->jt = 0; | ||
45 | instr->jf = 0; | ||
46 | continue; | ||
47 | case (LABEL_JT<<8)|LABEL_JF: | ||
48 | if (labels->labels[instr->k].location != 0xffffffff) { | ||
49 | fprintf(stderr, "Duplicate label use: '%s'\n", | ||
50 | labels->labels[instr->k].label); | ||
51 | return 1; | ||
52 | } | ||
53 | labels->labels[instr->k].location = offset; | ||
54 | instr->k = 0; /* fall through */ | ||
55 | instr->jt = 0; | ||
56 | instr->jf = 0; | ||
57 | continue; | ||
58 | } | ||
59 | } | ||
60 | return 0; | ||
61 | } | ||
62 | |||
63 | /* Simple lookup table for labels. */ | ||
64 | __u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label) | ||
65 | { | ||
66 | struct __bpf_label *begin = labels->labels, *end; | ||
67 | int id; | ||
68 | |||
69 | if (labels->count == BPF_LABELS_MAX) { | ||
70 | fprintf(stderr, "Too many labels\n"); | ||
71 | exit(1); | ||
72 | } | ||
73 | if (labels->count == 0) { | ||
74 | begin->label = label; | ||
75 | begin->location = 0xffffffff; | ||
76 | labels->count++; | ||
77 | return 0; | ||
78 | } | ||
79 | end = begin + labels->count; | ||
80 | for (id = 0; begin < end; ++begin, ++id) { | ||
81 | if (!strcmp(label, begin->label)) | ||
82 | return id; | ||
83 | } | ||
84 | begin->label = label; | ||
85 | begin->location = 0xffffffff; | ||
86 | labels->count++; | ||
87 | return id; | ||
88 | } | ||
89 | |||
90 | void seccomp_bpf_print(struct sock_filter *filter, size_t count) | ||
91 | { | ||
92 | struct sock_filter *end = filter + count; | ||
93 | for ( ; filter < end; ++filter) | ||
94 | printf("{ code=%u,jt=%u,jf=%u,k=%u },\n", | ||
95 | filter->code, filter->jt, filter->jf, filter->k); | ||
96 | } | ||
diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h new file mode 100644 index 000000000..0cc9816fe --- /dev/null +++ b/samples/seccomp/bpf-helper.h | |||
@@ -0,0 +1,263 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | /* | ||
3 | * Example wrapper around BPF macros. | ||
4 | * | ||
5 | * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org> | ||
6 | * Author: Will Drewry <wad@chromium.org> | ||
7 | * | ||
8 | * The code may be used by anyone for any purpose, | ||
9 | * and can serve as a starting point for developing | ||
10 | * applications using prctl(PR_SET_SECCOMP, 2, ...). | ||
11 | * | ||
12 | * No guarantees are provided with respect to the correctness | ||
13 | * or functionality of this code. | ||
14 | */ | ||
15 | #ifndef __BPF_HELPER_H__ | ||
16 | #define __BPF_HELPER_H__ | ||
17 | |||
18 | #include <asm/bitsperlong.h> /* for __BITS_PER_LONG */ | ||
19 | #include <endian.h> | ||
20 | #include <linux/filter.h> | ||
21 | #include <linux/seccomp.h> /* for seccomp_data */ | ||
22 | #include <linux/types.h> | ||
23 | #include <linux/unistd.h> | ||
24 | #include <stddef.h> | ||
25 | |||
26 | #define BPF_LABELS_MAX 256 | ||
27 | struct bpf_labels { | ||
28 | int count; | ||
29 | struct __bpf_label { | ||
30 | const char *label; | ||
31 | __u32 location; | ||
32 | } labels[BPF_LABELS_MAX]; | ||
33 | }; | ||
34 | |||
35 | int bpf_resolve_jumps(struct bpf_labels *labels, | ||
36 | struct sock_filter *filter, size_t count); | ||
37 | __u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label); | ||
38 | void seccomp_bpf_print(struct sock_filter *filter, size_t count); | ||
39 | |||
40 | #define JUMP_JT 0xff | ||
41 | #define JUMP_JF 0xff | ||
42 | #define LABEL_JT 0xfe | ||
43 | #define LABEL_JF 0xfe | ||
44 | |||
45 | #define ALLOW \ | ||
46 | BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW) | ||
47 | #define DENY \ | ||
48 | BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL) | ||
49 | #define JUMP(labels, label) \ | ||
50 | BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \ | ||
51 | JUMP_JT, JUMP_JF) | ||
52 | #define LABEL(labels, label) \ | ||
53 | BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \ | ||
54 | LABEL_JT, LABEL_JF) | ||
55 | #define SYSCALL(nr, jt) \ | ||
56 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (nr), 0, 1), \ | ||
57 | jt | ||
58 | |||
59 | /* Lame, but just an example */ | ||
60 | #define FIND_LABEL(labels, label) seccomp_bpf_label((labels), #label) | ||
61 | |||
62 | #define EXPAND(...) __VA_ARGS__ | ||
63 | |||
64 | /* Ensure that we load the logically correct offset. */ | ||
65 | #if __BYTE_ORDER == __LITTLE_ENDIAN | ||
66 | #define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) | ||
67 | #elif __BYTE_ORDER == __BIG_ENDIAN | ||
68 | #define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32) | ||
69 | #else | ||
70 | #error "Unknown endianness" | ||
71 | #endif | ||
72 | |||
73 | /* Map all width-sensitive operations */ | ||
74 | #if __BITS_PER_LONG == 32 | ||
75 | |||
76 | #define JEQ(x, jt) JEQ32(x, EXPAND(jt)) | ||
77 | #define JNE(x, jt) JNE32(x, EXPAND(jt)) | ||
78 | #define JGT(x, jt) JGT32(x, EXPAND(jt)) | ||
79 | #define JLT(x, jt) JLT32(x, EXPAND(jt)) | ||
80 | #define JGE(x, jt) JGE32(x, EXPAND(jt)) | ||
81 | #define JLE(x, jt) JLE32(x, EXPAND(jt)) | ||
82 | #define JA(x, jt) JA32(x, EXPAND(jt)) | ||
83 | #define ARG(i) ARG_32(i) | ||
84 | |||
85 | #elif __BITS_PER_LONG == 64 | ||
86 | |||
87 | /* Ensure that we load the logically correct offset. */ | ||
88 | #if __BYTE_ORDER == __LITTLE_ENDIAN | ||
89 | #define ENDIAN(_lo, _hi) _lo, _hi | ||
90 | #define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32) | ||
91 | #elif __BYTE_ORDER == __BIG_ENDIAN | ||
92 | #define ENDIAN(_lo, _hi) _hi, _lo | ||
93 | #define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) | ||
94 | #endif | ||
95 | |||
96 | union arg64 { | ||
97 | struct { | ||
98 | __u32 ENDIAN(lo32, hi32); | ||
99 | }; | ||
100 | __u64 u64; | ||
101 | }; | ||
102 | |||
103 | #define JEQ(x, jt) \ | ||
104 | JEQ64(((union arg64){.u64 = (x)}).lo32, \ | ||
105 | ((union arg64){.u64 = (x)}).hi32, \ | ||
106 | EXPAND(jt)) | ||
107 | #define JGT(x, jt) \ | ||
108 | JGT64(((union arg64){.u64 = (x)}).lo32, \ | ||
109 | ((union arg64){.u64 = (x)}).hi32, \ | ||
110 | EXPAND(jt)) | ||
111 | #define JGE(x, jt) \ | ||
112 | JGE64(((union arg64){.u64 = (x)}).lo32, \ | ||
113 | ((union arg64){.u64 = (x)}).hi32, \ | ||
114 | EXPAND(jt)) | ||
115 | #define JNE(x, jt) \ | ||
116 | JNE64(((union arg64){.u64 = (x)}).lo32, \ | ||
117 | ((union arg64){.u64 = (x)}).hi32, \ | ||
118 | EXPAND(jt)) | ||
119 | #define JLT(x, jt) \ | ||
120 | JLT64(((union arg64){.u64 = (x)}).lo32, \ | ||
121 | ((union arg64){.u64 = (x)}).hi32, \ | ||
122 | EXPAND(jt)) | ||
123 | #define JLE(x, jt) \ | ||
124 | JLE64(((union arg64){.u64 = (x)}).lo32, \ | ||
125 | ((union arg64){.u64 = (x)}).hi32, \ | ||
126 | EXPAND(jt)) | ||
127 | |||
128 | #define JA(x, jt) \ | ||
129 | JA64(((union arg64){.u64 = (x)}).lo32, \ | ||
130 | ((union arg64){.u64 = (x)}).hi32, \ | ||
131 | EXPAND(jt)) | ||
132 | #define ARG(i) ARG_64(i) | ||
133 | |||
134 | #else | ||
135 | #error __BITS_PER_LONG value unusable. | ||
136 | #endif | ||
137 | |||
138 | /* Loads the arg into A */ | ||
139 | #define ARG_32(idx) \ | ||
140 | BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx)) | ||
141 | |||
142 | /* Loads lo into M[0] and hi into M[1] and A */ | ||
143 | #define ARG_64(idx) \ | ||
144 | BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx)), \ | ||
145 | BPF_STMT(BPF_ST, 0), /* lo -> M[0] */ \ | ||
146 | BPF_STMT(BPF_LD+BPF_W+BPF_ABS, HI_ARG(idx)), \ | ||
147 | BPF_STMT(BPF_ST, 1) /* hi -> M[1] */ | ||
148 | |||
149 | #define JEQ32(value, jt) \ | ||
150 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 0, 1), \ | ||
151 | jt | ||
152 | |||
153 | #define JNE32(value, jt) \ | ||
154 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 1, 0), \ | ||
155 | jt | ||
156 | |||
157 | #define JA32(value, jt) \ | ||
158 | BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (value), 0, 1), \ | ||
159 | jt | ||
160 | |||
161 | #define JGE32(value, jt) \ | ||
162 | BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 0, 1), \ | ||
163 | jt | ||
164 | |||
165 | #define JGT32(value, jt) \ | ||
166 | BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 0, 1), \ | ||
167 | jt | ||
168 | |||
169 | #define JLE32(value, jt) \ | ||
170 | BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 1, 0), \ | ||
171 | jt | ||
172 | |||
173 | #define JLT32(value, jt) \ | ||
174 | BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 1, 0), \ | ||
175 | jt | ||
176 | |||
177 | /* | ||
178 | * All the JXX64 checks assume lo is saved in M[0] and hi is saved in both | ||
179 | * A and M[1]. This invariant is kept by restoring A if necessary. | ||
180 | */ | ||
181 | #define JEQ64(lo, hi, jt) \ | ||
182 | /* if (hi != arg.hi) goto NOMATCH; */ \ | ||
183 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \ | ||
184 | BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \ | ||
185 | /* if (lo != arg.lo) goto NOMATCH; */ \ | ||
186 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 0, 2), \ | ||
187 | BPF_STMT(BPF_LD+BPF_MEM, 1), \ | ||
188 | jt, \ | ||
189 | BPF_STMT(BPF_LD+BPF_MEM, 1) | ||
190 | |||
191 | #define JNE64(lo, hi, jt) \ | ||
192 | /* if (hi != arg.hi) goto MATCH; */ \ | ||
193 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 3), \ | ||
194 | BPF_STMT(BPF_LD+BPF_MEM, 0), \ | ||
195 | /* if (lo != arg.lo) goto MATCH; */ \ | ||
196 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 2, 0), \ | ||
197 | BPF_STMT(BPF_LD+BPF_MEM, 1), \ | ||
198 | jt, \ | ||
199 | BPF_STMT(BPF_LD+BPF_MEM, 1) | ||
200 | |||
201 | #define JA64(lo, hi, jt) \ | ||
202 | /* if (hi & arg.hi) goto MATCH; */ \ | ||
203 | BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (hi), 3, 0), \ | ||
204 | BPF_STMT(BPF_LD+BPF_MEM, 0), \ | ||
205 | /* if (lo & arg.lo) goto MATCH; */ \ | ||
206 | BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (lo), 0, 2), \ | ||
207 | BPF_STMT(BPF_LD+BPF_MEM, 1), \ | ||
208 | jt, \ | ||
209 | BPF_STMT(BPF_LD+BPF_MEM, 1) | ||
210 | |||
211 | #define JGE64(lo, hi, jt) \ | ||
212 | /* if (hi > arg.hi) goto MATCH; */ \ | ||
213 | BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \ | ||
214 | /* if (hi != arg.hi) goto NOMATCH; */ \ | ||
215 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \ | ||
216 | BPF_STMT(BPF_LD+BPF_MEM, 0), \ | ||
217 | /* if (lo >= arg.lo) goto MATCH; */ \ | ||
218 | BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 0, 2), \ | ||
219 | BPF_STMT(BPF_LD+BPF_MEM, 1), \ | ||
220 | jt, \ | ||
221 | BPF_STMT(BPF_LD+BPF_MEM, 1) | ||
222 | |||
223 | #define JGT64(lo, hi, jt) \ | ||
224 | /* if (hi > arg.hi) goto MATCH; */ \ | ||
225 | BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \ | ||
226 | /* if (hi != arg.hi) goto NOMATCH; */ \ | ||
227 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \ | ||
228 | BPF_STMT(BPF_LD+BPF_MEM, 0), \ | ||
229 | /* if (lo > arg.lo) goto MATCH; */ \ | ||
230 | BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 0, 2), \ | ||
231 | BPF_STMT(BPF_LD+BPF_MEM, 1), \ | ||
232 | jt, \ | ||
233 | BPF_STMT(BPF_LD+BPF_MEM, 1) | ||
234 | |||
235 | #define JLE64(lo, hi, jt) \ | ||
236 | /* if (hi < arg.hi) goto MATCH; */ \ | ||
237 | BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \ | ||
238 | /* if (hi != arg.hi) goto NOMATCH; */ \ | ||
239 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \ | ||
240 | BPF_STMT(BPF_LD+BPF_MEM, 0), \ | ||
241 | /* if (lo <= arg.lo) goto MATCH; */ \ | ||
242 | BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 2, 0), \ | ||
243 | BPF_STMT(BPF_LD+BPF_MEM, 1), \ | ||
244 | jt, \ | ||
245 | BPF_STMT(BPF_LD+BPF_MEM, 1) | ||
246 | |||
247 | #define JLT64(lo, hi, jt) \ | ||
248 | /* if (hi < arg.hi) goto MATCH; */ \ | ||
249 | BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \ | ||
250 | /* if (hi != arg.hi) goto NOMATCH; */ \ | ||
251 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \ | ||
252 | BPF_STMT(BPF_LD+BPF_MEM, 0), \ | ||
253 | /* if (lo < arg.lo) goto MATCH; */ \ | ||
254 | BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 2, 0), \ | ||
255 | BPF_STMT(BPF_LD+BPF_MEM, 1), \ | ||
256 | jt, \ | ||
257 | BPF_STMT(BPF_LD+BPF_MEM, 1) | ||
258 | |||
259 | #define LOAD_SYSCALL_NR \ | ||
260 | BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \ | ||
261 | offsetof(struct seccomp_data, nr)) | ||
262 | |||
263 | #endif /* __BPF_HELPER_H__ */ | ||
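As a concrete illustration of the M[0]/M[1]/A invariant described above, here is what ARG(0) followed by JEQ(X, ALLOW) and a trailing DENY expand to on a 64-bit little-endian target (an assumption; on big-endian the lo/hi offsets swap, per LO_ARG/HI_ARG). X_LO and X_HI are arbitrary example halves of the 64-bit constant being compared.

```c
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <linux/types.h>
#include <stddef.h>

#define X_LO 0x00000002
#define X_HI 0x00000001		/* i.e. comparing args[0] against 0x100000002 */

static struct sock_filter jeq64_expanded[] = {
	/* ARG_64(0): lo -> M[0], hi -> M[1] and A */
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, args[0])),
	BPF_STMT(BPF_ST, 0),
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
		 offsetof(struct seccomp_data, args[0]) + sizeof(__u32)),
	BPF_STMT(BPF_ST, 1),
	/* JEQ64(X_LO, X_HI, ALLOW) */
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, X_HI, 0, 5),	/* hi mismatch: A still hi, go to DENY */
	BPF_STMT(BPF_LD+BPF_MEM, 0),			/* swap lo into A */
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, X_LO, 0, 2),	/* lo mismatch: restore A, then DENY */
	BPF_STMT(BPF_LD+BPF_MEM, 1),			/* match: restore hi into A */
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),	/* jt */
	BPF_STMT(BPF_LD+BPF_MEM, 1),			/* lo-mismatch path restores A here */
	/* DENY: the instruction that follows the macro expansion */
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
};
```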
diff --git a/samples/seccomp/dropper.c b/samples/seccomp/dropper.c new file mode 100644 index 000000000..cc0648eb3 --- /dev/null +++ b/samples/seccomp/dropper.c | |||
@@ -0,0 +1,72 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Naive system call dropper built on seccomp_filter. | ||
4 | * | ||
5 | * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org> | ||
6 | * Author: Will Drewry <wad@chromium.org> | ||
7 | * | ||
8 | * The code may be used by anyone for any purpose, | ||
9 | * and can serve as a starting point for developing | ||
10 | * applications using prctl(PR_SET_SECCOMP, 2, ...). | ||
11 | * | ||
12 | * When run, returns the specified errno for the specified | ||
13 | * system call number against the given architecture. | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | #include <errno.h> | ||
18 | #include <linux/audit.h> | ||
19 | #include <linux/filter.h> | ||
20 | #include <linux/seccomp.h> | ||
21 | #include <linux/unistd.h> | ||
22 | #include <stdio.h> | ||
23 | #include <stddef.h> | ||
24 | #include <stdlib.h> | ||
25 | #include <sys/prctl.h> | ||
26 | #include <unistd.h> | ||
27 | |||
28 | static int install_filter(int nr, int arch, int error) | ||
29 | { | ||
30 | struct sock_filter filter[] = { | ||
31 | BPF_STMT(BPF_LD+BPF_W+BPF_ABS, | ||
32 | (offsetof(struct seccomp_data, arch))), | ||
33 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, arch, 0, 3), | ||
34 | BPF_STMT(BPF_LD+BPF_W+BPF_ABS, | ||
35 | (offsetof(struct seccomp_data, nr))), | ||
36 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1), | ||
37 | BPF_STMT(BPF_RET+BPF_K, | ||
38 | SECCOMP_RET_ERRNO|(error & SECCOMP_RET_DATA)), | ||
39 | BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), | ||
40 | }; | ||
41 | struct sock_fprog prog = { | ||
42 | .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), | ||
43 | .filter = filter, | ||
44 | }; | ||
45 | if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { | ||
46 | perror("prctl(NO_NEW_PRIVS)"); | ||
47 | return 1; | ||
48 | } | ||
49 | if (prctl(PR_SET_SECCOMP, 2, &prog)) { | ||
50 | perror("prctl(PR_SET_SECCOMP)"); | ||
51 | return 1; | ||
52 | } | ||
53 | return 0; | ||
54 | } | ||
55 | |||
56 | int main(int argc, char **argv) | ||
57 | { | ||
58 | if (argc < 5) { | ||
59 | fprintf(stderr, "Usage:\n" | ||
60 | "dropper <syscall_nr> <arch> <errno> <prog> [<args>]\n" | ||
61 | "Hint: AUDIT_ARCH_I386: 0x%X\n" | ||
62 | " AUDIT_ARCH_X86_64: 0x%X\n" | ||
63 | "\n", AUDIT_ARCH_I386, AUDIT_ARCH_X86_64); | ||
64 | return 1; | ||
65 | } | ||
66 | if (install_filter(strtol(argv[1], NULL, 0), strtol(argv[2], NULL, 0), | ||
67 | strtol(argv[3], NULL, 0))) | ||
68 | return 1; | ||
69 | execv(argv[4], &argv[4]); | ||
70 | printf("Failed to execv\n"); | ||
71 | return 255; | ||
72 | } | ||
diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c new file mode 100644 index 000000000..20291ec64 --- /dev/null +++ b/samples/seccomp/user-trap.c | |||
@@ -0,0 +1,375 @@ | |||
1 | #include <signal.h> | ||
2 | #include <stdio.h> | ||
3 | #include <stdlib.h> | ||
4 | #include <unistd.h> | ||
5 | #include <errno.h> | ||
6 | #include <fcntl.h> | ||
7 | #include <string.h> | ||
8 | #include <stddef.h> | ||
9 | #include <sys/sysmacros.h> | ||
10 | #include <sys/types.h> | ||
11 | #include <sys/wait.h> | ||
12 | #include <sys/socket.h> | ||
13 | #include <sys/stat.h> | ||
14 | #include <sys/mman.h> | ||
15 | #include <sys/syscall.h> | ||
16 | #include <sys/user.h> | ||
17 | #include <sys/ioctl.h> | ||
18 | #include <sys/ptrace.h> | ||
19 | #include <sys/mount.h> | ||
20 | #include <linux/limits.h> | ||
21 | #include <linux/filter.h> | ||
22 | #include <linux/seccomp.h> | ||
23 | |||
24 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) | ||
25 | |||
26 | static int seccomp(unsigned int op, unsigned int flags, void *args) | ||
27 | { | ||
28 | errno = 0; | ||
29 | return syscall(__NR_seccomp, op, flags, args); | ||
30 | } | ||
31 | |||
32 | static int send_fd(int sock, int fd) | ||
33 | { | ||
34 | struct msghdr msg = {}; | ||
35 | struct cmsghdr *cmsg; | ||
36 | char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c'; | ||
37 | struct iovec io = { | ||
38 | .iov_base = &c, | ||
39 | .iov_len = 1, | ||
40 | }; | ||
41 | |||
42 | msg.msg_iov = &io; | ||
43 | msg.msg_iovlen = 1; | ||
44 | msg.msg_control = buf; | ||
45 | msg.msg_controllen = sizeof(buf); | ||
46 | cmsg = CMSG_FIRSTHDR(&msg); | ||
47 | cmsg->cmsg_level = SOL_SOCKET; | ||
48 | cmsg->cmsg_type = SCM_RIGHTS; | ||
49 | cmsg->cmsg_len = CMSG_LEN(sizeof(int)); | ||
50 | *((int *)CMSG_DATA(cmsg)) = fd; | ||
51 | msg.msg_controllen = cmsg->cmsg_len; | ||
52 | |||
53 | if (sendmsg(sock, &msg, 0) < 0) { | ||
54 | perror("sendmsg"); | ||
55 | return -1; | ||
56 | } | ||
57 | |||
58 | return 0; | ||
59 | } | ||
60 | |||
61 | static int recv_fd(int sock) | ||
62 | { | ||
63 | struct msghdr msg = {}; | ||
64 | struct cmsghdr *cmsg; | ||
65 | char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c'; | ||
66 | struct iovec io = { | ||
67 | .iov_base = &c, | ||
68 | .iov_len = 1, | ||
69 | }; | ||
70 | |||
71 | msg.msg_iov = &io; | ||
72 | msg.msg_iovlen = 1; | ||
73 | msg.msg_control = buf; | ||
74 | msg.msg_controllen = sizeof(buf); | ||
75 | |||
76 | if (recvmsg(sock, &msg, 0) < 0) { | ||
77 | perror("recvmsg"); | ||
78 | return -1; | ||
79 | } | ||
80 | |||
81 | cmsg = CMSG_FIRSTHDR(&msg); | ||
82 | |||
83 | return *((int *)CMSG_DATA(cmsg)); | ||
84 | } | ||
85 | |||
86 | static int user_trap_syscall(int nr, unsigned int flags) | ||
87 | { | ||
88 | struct sock_filter filter[] = { | ||
89 | BPF_STMT(BPF_LD+BPF_W+BPF_ABS, | ||
90 | offsetof(struct seccomp_data, nr)), | ||
91 | BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1), | ||
92 | BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF), | ||
93 | BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), | ||
94 | }; | ||
95 | |||
96 | struct sock_fprog prog = { | ||
97 | .len = (unsigned short)ARRAY_SIZE(filter), | ||
98 | .filter = filter, | ||
99 | }; | ||
100 | |||
101 | return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog); | ||
102 | } | ||
103 | |||
104 | static int handle_req(struct seccomp_notif *req, | ||
105 | struct seccomp_notif_resp *resp, int listener) | ||
106 | { | ||
107 | char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX]; | ||
108 | int ret = -1, mem; | ||
109 | |||
110 | resp->id = req->id; | ||
111 | resp->error = -EPERM; | ||
112 | resp->val = 0; | ||
113 | |||
114 | if (req->data.nr != __NR_mount) { | ||
115 | fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr); | ||
116 | return -1; | ||
117 | } | ||
118 | |||
119 | /* Only allow bind mounts. */ | ||
120 | if (!(req->data.args[3] & MS_BIND)) | ||
121 | return 0; | ||
122 | |||
123 | /* | ||
124 | * Ok, let's read the task's memory to see where they wanted their | ||
125 | * mount to go. | ||
126 | */ | ||
127 | snprintf(path, sizeof(path), "/proc/%d/mem", req->pid); | ||
128 | mem = open(path, O_RDONLY); | ||
129 | if (mem < 0) { | ||
130 | perror("open mem"); | ||
131 | return -1; | ||
132 | } | ||
133 | |||
134 | /* | ||
135 | * Now we avoid a TOCTOU: we referred to the task by its pid, but the | ||
136 | * task that made the syscall may since have died, so we need to confirm | ||
137 | * that the pid is still valid after we open its /proc/pid/mem file. We | ||
138 | * can ask the listener fd about this as follows. | ||
139 | * | ||
140 | * Note that this check should occur *after* any task-specific | ||
141 | * resources are opened, to make sure that the task has not died and | ||
142 | * we're not wrongly reading someone else's state in order to make | ||
143 | * decisions. | ||
144 | */ | ||
145 | if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) { | ||
146 | fprintf(stderr, "task died before we could map its memory\n"); | ||
147 | goto out; | ||
148 | } | ||
149 | |||
150 | /* | ||
151 | * Phew, we've got the right /proc/pid/mem. Now we can read it. Note | ||
152 | * that to avoid another TOCTOU, we should read all of the pointer args | ||
153 | * before we decide to allow the syscall. | ||
154 | */ | ||
155 | if (lseek(mem, req->data.args[0], SEEK_SET) < 0) { | ||
156 | perror("seek"); | ||
157 | goto out; | ||
158 | } | ||
159 | |||
160 | ret = read(mem, source, sizeof(source)); | ||
161 | if (ret < 0) { | ||
162 | perror("read"); | ||
163 | goto out; | ||
164 | } | ||
165 | |||
166 | if (lseek(mem, req->data.args[1], SEEK_SET) < 0) { | ||
167 | perror("seek"); | ||
168 | goto out; | ||
169 | } | ||
170 | |||
171 | ret = read(mem, target, sizeof(target)); | ||
172 | if (ret < 0) { | ||
173 | perror("read"); | ||
174 | goto out; | ||
175 | } | ||
176 | |||
177 | /* | ||
178 | * Our policy is to only allow bind mounts inside /tmp. This isn't very | ||
179 | * interesting, because we could do unprivileged bind mounts with user | ||
180 | * namespaces already, but you get the idea. | ||
181 | */ | ||
182 | if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) { | ||
183 | if (mount(source, target, NULL, req->data.args[3], NULL) < 0) { | ||
184 | ret = -1; | ||
185 | perror("actual mount"); | ||
186 | goto out; | ||
187 | } | ||
188 | resp->error = 0; | ||
189 | } | ||
190 | |||
191 | /* Even if we didn't allow it because of policy, generating the | ||
192 | * response was a success, because we want to tell the worker EPERM. | ||
193 | */ | ||
194 | ret = 0; | ||
195 | |||
196 | out: | ||
197 | close(mem); | ||
198 | return ret; | ||
199 | } | ||
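The open-then-revalidate dance in handle_req() is the part most easily gotten wrong, so here it is distilled into a hypothetical helper. The name and factoring are not part of the sample, and it assumes the same includes as this file:

```c
/*
 * Hypothetical helper, not in the sample: open /proc/<pid>/mem for a
 * notification and only then confirm the notification is still alive, so a
 * recycled pid cannot trick us into reading someone else's memory.
 */
static int open_proc_mem_checked(int listener, struct seccomp_notif *req)
{
	char path[64];
	int mem;

	snprintf(path, sizeof(path), "/proc/%d/mem", req->pid);
	mem = open(path, O_RDONLY);
	if (mem < 0)
		return -1;

	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
		close(mem);
		return -1;
	}

	return mem;
}
```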
200 | |||
201 | int main(void) | ||
202 | { | ||
203 | int sk_pair[2], ret = 1, status, listener; | ||
204 | pid_t worker = 0, tracer = 0; | ||
205 | |||
206 | if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) { | ||
207 | perror("socketpair"); | ||
208 | return 1; | ||
209 | } | ||
210 | |||
211 | worker = fork(); | ||
212 | if (worker < 0) { | ||
213 | perror("fork"); | ||
214 | goto close_pair; | ||
215 | } | ||
216 | |||
217 | if (worker == 0) { | ||
218 | listener = user_trap_syscall(__NR_mount, | ||
219 | SECCOMP_FILTER_FLAG_NEW_LISTENER); | ||
220 | if (listener < 0) { | ||
221 | perror("seccomp"); | ||
222 | exit(1); | ||
223 | } | ||
224 | |||
225 | /* | ||
226 | * Drop privileges. We definitely can't mount as uid 1000. | ||
227 | */ | ||
228 | if (setuid(1000) < 0) { | ||
229 | perror("setuid"); | ||
230 | exit(1); | ||
231 | } | ||
232 | |||
233 | /* | ||
234 | * Send the listener to the parent; also serves as | ||
235 | * synchronization. | ||
236 | */ | ||
237 | if (send_fd(sk_pair[1], listener) < 0) | ||
238 | exit(1); | ||
239 | close(listener); | ||
240 | |||
241 | if (mkdir("/tmp/foo", 0755) < 0) { | ||
242 | perror("mkdir"); | ||
243 | exit(1); | ||
244 | } | ||
245 | |||
246 | /* | ||
247 | * Try a bad mount just for grins. | ||
248 | */ | ||
249 | if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) { | ||
250 | fprintf(stderr, "huh? mounted /dev/sda?\n"); | ||
251 | exit(1); | ||
252 | } | ||
253 | |||
254 | if (errno != EPERM) { | ||
255 | perror("bad error from mount"); | ||
256 | exit(1); | ||
257 | } | ||
258 | |||
259 | /* | ||
260 | * Ok, we expect this one to succeed. | ||
261 | */ | ||
262 | if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) { | ||
263 | perror("mount"); | ||
264 | exit(1); | ||
265 | } | ||
266 | |||
267 | exit(0); | ||
268 | } | ||
269 | |||
270 | /* | ||
271 | * Get the listener from the child. | ||
272 | */ | ||
273 | listener = recv_fd(sk_pair[0]); | ||
274 | if (listener < 0) | ||
275 | goto out_kill; | ||
276 | |||
277 | /* | ||
278 | * Fork a task to handle the requests. This isn't strictly necessary, | ||
279 | * but it makes the particular writing of this sample easier, since we | ||
280 | * can just wait for the tracee to exit and kill the tracer. | ||
281 | */ | ||
282 | tracer = fork(); | ||
283 | if (tracer < 0) { | ||
284 | perror("fork"); | ||
285 | goto out_kill; | ||
286 | } | ||
287 | |||
288 | if (tracer == 0) { | ||
289 | struct seccomp_notif *req; | ||
290 | struct seccomp_notif_resp *resp; | ||
291 | struct seccomp_notif_sizes sizes; | ||
292 | |||
293 | if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) { | ||
294 | perror("seccomp(GET_NOTIF_SIZES)"); | ||
295 | goto out_close; | ||
296 | } | ||
297 | |||
298 | req = malloc(sizes.seccomp_notif); | ||
299 | if (!req) | ||
300 | goto out_close; | ||
301 | |||
302 | resp = malloc(sizes.seccomp_notif_resp); | ||
303 | if (!resp) | ||
304 | goto out_req; | ||
305 | memset(resp, 0, sizes.seccomp_notif_resp); | ||
306 | |||
307 | while (1) { | ||
308 | memset(req, 0, sizes.seccomp_notif); | ||
309 | if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) { | ||
310 | perror("ioctl recv"); | ||
311 | goto out_resp; | ||
312 | } | ||
313 | |||
314 | if (handle_req(req, resp, listener) < 0) | ||
315 | goto out_resp; | ||
316 | |||
317 | /* | ||
318 | * ENOENT here means that the task may have gotten a | ||
319 | * signal and restarted the syscall. It's up to the | ||
320 | * handler to decide what to do in this case, but for | ||
321 | * the sample code, we just ignore it. Probably | ||
322 | * something better should happen, like undoing the | ||
323 | * mount, or keeping track of the args to make sure we | ||
324 | * don't do it again. | ||
325 | */ | ||
326 | if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 && | ||
327 | errno != ENOENT) { | ||
328 | perror("ioctl send"); | ||
329 | goto out_resp; | ||
330 | } | ||
331 | } | ||
332 | out_resp: | ||
333 | free(resp); | ||
334 | out_req: | ||
335 | free(req); | ||
336 | out_close: | ||
337 | close(listener); | ||
338 | exit(1); | ||
339 | } | ||
340 | |||
341 | close(listener); | ||
342 | |||
343 | if (waitpid(worker, &status, 0) != worker) { | ||
344 | perror("waitpid"); | ||
345 | goto out_kill; | ||
346 | } | ||
347 | |||
348 | if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) { | ||
349 | perror("umount2"); | ||
350 | goto out_kill; | ||
351 | } | ||
352 | |||
353 | if (remove("/tmp/foo") < 0 && errno != ENOENT) { | ||
354 | perror("remove"); | ||
355 | exit(1); | ||
356 | } | ||
357 | |||
358 | if (!WIFEXITED(status) || WEXITSTATUS(status)) { | ||
359 | fprintf(stderr, "worker exited nonzero\n"); | ||
360 | goto out_kill; | ||
361 | } | ||
362 | |||
363 | ret = 0; | ||
364 | |||
365 | out_kill: | ||
366 | if (tracer > 0) | ||
367 | kill(tracer, SIGKILL); | ||
368 | if (worker > 0) | ||
369 | kill(worker, SIGKILL); | ||
370 | |||
371 | close_pair: | ||
372 | close(sk_pair[0]); | ||
373 | close(sk_pair[1]); | ||
374 | return ret; | ||
375 | } | ||
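
The worker above installs its filter through a user_trap_syscall() helper and hands the listener fd over with send_fd(); both helpers are defined earlier in user-trap.c, before this hunk. As a rough sketch of what such a helper generally does (the function and wrapper names below are illustrative, not taken from the file), installing a seccomp filter that forwards a single syscall to a user-space supervisor looks roughly like this:

#include <stddef.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

/* raw wrapper; glibc does not always provide a seccomp(2) symbol */
static int sys_seccomp(unsigned int op, unsigned int flags, void *args)
{
        return syscall(__NR_seccomp, op, flags, args);
}

static int install_user_notif_filter(int nr, unsigned int flags)
{
        struct sock_filter filter[] = {
                /* load the syscall number */
                BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                         offsetof(struct seccomp_data, nr)),
                /* not the syscall we care about? allow it */
                BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, nr, 0, 1),
                /* otherwise, hand it to the user-space supervisor */
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        };
        struct sock_fprog prog = {
                .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])),
                .filter = filter,
        };

        /*
         * With SECCOMP_FILTER_FLAG_NEW_LISTENER the return value is the
         * notification fd that the tracer later feeds to
         * SECCOMP_IOCTL_NOTIF_RECV/SEND, as main() does above.
         */
        return sys_seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
}

The sample's real helper may differ in detail; the point is only that SECCOMP_RET_USER_NOTIF combined with SECCOMP_FILTER_FLAG_NEW_LISTENER is what produces the listener fd consumed by handle_req().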
diff --git a/samples/timers/.gitignore b/samples/timers/.gitignore new file mode 100644 index 000000000..40510c33c --- /dev/null +++ b/samples/timers/.gitignore | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | hpet_example | ||
diff --git a/samples/timers/Makefile b/samples/timers/Makefile new file mode 100644 index 000000000..e6836cdea --- /dev/null +++ b/samples/timers/Makefile | |||
@@ -0,0 +1,4 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | ||
2 | userprogs-always-y += hpet_example | ||
3 | |||
4 | userccflags += -I usr/include | ||
diff --git a/samples/timers/hpet_example.c b/samples/timers/hpet_example.c new file mode 100644 index 000000000..f1cb622f6 --- /dev/null +++ b/samples/timers/hpet_example.c | |||
@@ -0,0 +1,295 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <stdio.h> | ||
3 | #include <stdlib.h> | ||
4 | #include <unistd.h> | ||
5 | #include <fcntl.h> | ||
6 | #include <string.h> | ||
7 | #include <memory.h> | ||
8 | #include <malloc.h> | ||
9 | #include <time.h> | ||
10 | #include <ctype.h> | ||
11 | #include <sys/types.h> | ||
12 | #include <sys/wait.h> | ||
13 | #include <signal.h> | ||
14 | #include <errno.h> | ||
15 | #include <sys/time.h> | ||
16 | #include <linux/hpet.h> | ||
17 | |||
18 | |||
19 | extern void hpet_open_close(int, const char **); | ||
20 | extern void hpet_info(int, const char **); | ||
21 | extern void hpet_poll(int, const char **); | ||
22 | extern void hpet_fasync(int, const char **); | ||
23 | extern void hpet_read(int, const char **); | ||
24 | |||
25 | #include <sys/poll.h> | ||
26 | #include <sys/ioctl.h> | ||
27 | |||
28 | struct hpet_command { | ||
29 | char *command; | ||
30 | void (*func)(int argc, const char ** argv); | ||
31 | } hpet_command[] = { | ||
32 | { | ||
33 | "open-close", | ||
34 | hpet_open_close | ||
35 | }, | ||
36 | { | ||
37 | "info", | ||
38 | hpet_info | ||
39 | }, | ||
40 | { | ||
41 | "poll", | ||
42 | hpet_poll | ||
43 | }, | ||
44 | { | ||
45 | "fasync", | ||
46 | hpet_fasync | ||
47 | }, | ||
48 | }; | ||
49 | |||
50 | int | ||
51 | main(int argc, const char ** argv) | ||
52 | { | ||
53 | unsigned int i; | ||
54 | |||
55 | argc--; | ||
56 | argv++; | ||
57 | |||
58 | if (!argc) { | ||
59 | fprintf(stderr, "-hpet: requires command\n"); | ||
60 | return -1; | ||
61 | } | ||
62 | |||
63 | |||
64 | for (i = 0; i < (sizeof (hpet_command) / sizeof (hpet_command[0])); i++) | ||
65 | if (!strcmp(argv[0], hpet_command[i].command)) { | ||
66 | argc--; | ||
67 | argv++; | ||
68 | fprintf(stderr, "-hpet: executing %s\n", | ||
69 | hpet_command[i].command); | ||
70 | hpet_command[i].func(argc, argv); | ||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | fprintf(stderr, "do_hpet: command %s not implemented\n", argv[0]); | ||
75 | |||
76 | return -1; | ||
77 | } | ||
78 | |||
79 | void | ||
80 | hpet_open_close(int argc, const char **argv) | ||
81 | { | ||
82 | int fd; | ||
83 | |||
84 | if (argc != 1) { | ||
85 | fprintf(stderr, "hpet_open_close: device-name\n"); | ||
86 | return; | ||
87 | } | ||
88 | |||
89 | fd = open(argv[0], O_RDONLY); | ||
90 | if (fd < 0) | ||
91 | fprintf(stderr, "hpet_open_close: open failed\n"); | ||
92 | else | ||
93 | close(fd); | ||
94 | |||
95 | return; | ||
96 | } | ||
97 | |||
98 | void | ||
99 | hpet_info(int argc, const char **argv) | ||
100 | { | ||
101 | struct hpet_info info; | ||
102 | int fd; | ||
103 | |||
104 | if (argc != 1) { | ||
105 | fprintf(stderr, "hpet_info: device-name\n"); | ||
106 | return; | ||
107 | } | ||
108 | |||
109 | fd = open(argv[0], O_RDONLY); | ||
110 | if (fd < 0) { | ||
111 | fprintf(stderr, "hpet_info: open of %s failed\n", argv[0]); | ||
112 | return; | ||
113 | } | ||
114 | |||
115 | if (ioctl(fd, HPET_INFO, &info) < 0) { | ||
116 | fprintf(stderr, "hpet_info: failed to get info\n"); | ||
117 | goto out; | ||
118 | } | ||
119 | |||
120 | fprintf(stderr, "hpet_info: hi_irqfreq 0x%lx hi_flags 0x%lx ", | ||
121 | info.hi_ireqfreq, info.hi_flags); | ||
122 | fprintf(stderr, "hi_hpet %d hi_timer %d\n", | ||
123 | info.hi_hpet, info.hi_timer); | ||
124 | |||
125 | out: | ||
126 | close(fd); | ||
127 | return; | ||
128 | } | ||
129 | |||
130 | void | ||
131 | hpet_poll(int argc, const char **argv) | ||
132 | { | ||
133 | unsigned long freq; | ||
134 | int iterations, i, fd; | ||
135 | struct pollfd pfd; | ||
136 | struct hpet_info info; | ||
137 | struct timeval stv, etv; | ||
138 | struct timezone tz; | ||
139 | long usec; | ||
140 | |||
141 | if (argc != 3) { | ||
142 | fprintf(stderr, "hpet_poll: device-name freq iterations\n"); | ||
143 | return; | ||
144 | } | ||
145 | |||
146 | freq = atoi(argv[1]); | ||
147 | iterations = atoi(argv[2]); | ||
148 | |||
149 | fd = open(argv[0], O_RDONLY); | ||
150 | |||
151 | if (fd < 0) { | ||
152 | fprintf(stderr, "hpet_poll: open of %s failed\n", argv[0]); | ||
153 | return; | ||
154 | } | ||
155 | |||
156 | if (ioctl(fd, HPET_IRQFREQ, freq) < 0) { | ||
157 | fprintf(stderr, "hpet_poll: HPET_IRQFREQ failed\n"); | ||
158 | goto out; | ||
159 | } | ||
160 | |||
161 | if (ioctl(fd, HPET_INFO, &info) < 0) { | ||
162 | fprintf(stderr, "hpet_poll: failed to get info\n"); | ||
163 | goto out; | ||
164 | } | ||
165 | |||
166 | fprintf(stderr, "hpet_poll: info.hi_flags 0x%lx\n", info.hi_flags); | ||
167 | |||
168 | if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) { | ||
169 | fprintf(stderr, "hpet_poll: HPET_EPI failed\n"); | ||
170 | goto out; | ||
171 | } | ||
172 | |||
173 | if (ioctl(fd, HPET_IE_ON, 0) < 0) { | ||
174 | fprintf(stderr, "hpet_poll, HPET_IE_ON failed\n"); | ||
175 | goto out; | ||
176 | } | ||
177 | |||
178 | pfd.fd = fd; | ||
179 | pfd.events = POLLIN; | ||
180 | |||
181 | for (i = 0; i < iterations; i++) { | ||
182 | pfd.revents = 0; | ||
183 | gettimeofday(&stv, &tz); | ||
184 | if (poll(&pfd, 1, -1) < 0) | ||
185 | fprintf(stderr, "hpet_poll: poll failed\n"); | ||
186 | else { | ||
187 | long data; | ||
188 | |||
189 | gettimeofday(&etv, &tz); | ||
190 | usec = stv.tv_sec * 1000000 + stv.tv_usec; | ||
191 | usec = (etv.tv_sec * 1000000 + etv.tv_usec) - usec; | ||
192 | |||
193 | fprintf(stderr, | ||
194 | "hpet_poll: expired time = 0x%lx\n", usec); | ||
195 | |||
196 | fprintf(stderr, "hpet_poll: revents = 0x%x\n", | ||
197 | pfd.revents); | ||
198 | |||
199 | if (read(fd, &data, sizeof(data)) != sizeof(data)) { | ||
200 | fprintf(stderr, "hpet_poll: read failed\n"); | ||
201 | } | ||
202 | else | ||
203 | fprintf(stderr, "hpet_poll: data 0x%lx\n", | ||
204 | data); | ||
205 | } | ||
206 | } | ||
207 | |||
208 | out: | ||
209 | close(fd); | ||
210 | return; | ||
211 | } | ||
212 | |||
213 | static int hpet_sigio_count; | ||
214 | |||
215 | static void | ||
216 | hpet_sigio(int val) | ||
217 | { | ||
218 | fprintf(stderr, "hpet_sigio: called\n"); | ||
219 | hpet_sigio_count++; | ||
220 | } | ||
221 | |||
222 | void | ||
223 | hpet_fasync(int argc, const char **argv) | ||
224 | { | ||
225 | unsigned long freq; | ||
226 | int iterations, i, fd, value; | ||
227 | sig_t oldsig; | ||
228 | struct hpet_info info; | ||
229 | |||
230 | hpet_sigio_count = 0; | ||
231 | fd = -1; | ||
232 | |||
233 | if ((oldsig = signal(SIGIO, hpet_sigio)) == SIG_ERR) { | ||
234 | fprintf(stderr, "hpet_fasync: failed to set signal handler\n"); | ||
235 | return; | ||
236 | } | ||
237 | |||
238 | if (argc != 3) { | ||
239 | fprintf(stderr, "hpet_fasync: device-name freq iterations\n"); | ||
240 | goto out; | ||
241 | } | ||
242 | |||
243 | fd = open(argv[0], O_RDONLY); | ||
244 | |||
245 | if (fd < 0) { | ||
246 | fprintf(stderr, "hpet_fasync: failed to open %s\n", argv[0]); | ||
247 | return; | ||
248 | } | ||
249 | |||
250 | |||
251 | if ((fcntl(fd, F_SETOWN, getpid()) == -1) || | ||
252 | ((value = fcntl(fd, F_GETFL)) == -1) || | ||
253 | (fcntl(fd, F_SETFL, value | O_ASYNC) == -1)) { | ||
254 | fprintf(stderr, "hpet_fasync: fcntl failed\n"); | ||
255 | goto out; | ||
256 | } | ||
257 | |||
258 | freq = atoi(argv[1]); | ||
259 | iterations = atoi(argv[2]); | ||
260 | |||
261 | if (ioctl(fd, HPET_IRQFREQ, freq) < 0) { | ||
262 | fprintf(stderr, "hpet_fasync: HPET_IRQFREQ failed\n"); | ||
263 | goto out; | ||
264 | } | ||
265 | |||
266 | if (ioctl(fd, HPET_INFO, &info) < 0) { | ||
267 | fprintf(stderr, "hpet_fasync: failed to get info\n"); | ||
268 | goto out; | ||
269 | } | ||
270 | |||
271 | fprintf(stderr, "hpet_fasync: info.hi_flags 0x%lx\n", info.hi_flags); | ||
272 | |||
273 | if (info.hi_flags && (ioctl(fd, HPET_EPI, 0) < 0)) { | ||
274 | fprintf(stderr, "hpet_fasync: HPET_EPI failed\n"); | ||
275 | goto out; | ||
276 | } | ||
277 | |||
278 | if (ioctl(fd, HPET_IE_ON, 0) < 0) { | ||
279 | fprintf(stderr, "hpet_fasync, HPET_IE_ON failed\n"); | ||
280 | goto out; | ||
281 | } | ||
282 | |||
283 | for (i = 0; i < iterations; i++) { | ||
284 | (void) pause(); | ||
285 | fprintf(stderr, "hpet_fasync: count = %d\n", hpet_sigio_count); | ||
286 | } | ||
287 | |||
288 | out: | ||
289 | signal(SIGIO, oldsig); | ||
290 | |||
291 | if (fd >= 0) | ||
292 | close(fd); | ||
293 | |||
294 | return; | ||
295 | } | ||
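
Both hpet_poll() and hpet_fasync() above run the same device setup before waiting: program the interrupt frequency with HPET_IRQFREQ, query the timer with HPET_INFO, enable periodic interrupts with HPET_EPI when hi_flags says they are supported, and finally arm the interrupt with HPET_IE_ON. Below is a minimal sketch of that shared sequence, blocking in read() instead of poll() or SIGIO; the device path and frequency are placeholders, and the helper name is not part of the sample.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/hpet.h>

static int hpet_wait_one_tick(const char *dev, unsigned long freq)
{
        struct hpet_info info;
        unsigned long data;
        int fd;

        fd = open(dev, O_RDONLY);
        if (fd < 0) {
                perror("open");
                return -1;
        }

        /* program the timer interrupt frequency, then query its state */
        if (ioctl(fd, HPET_IRQFREQ, freq) < 0 ||
            ioctl(fd, HPET_INFO, &info) < 0)
                goto err;

        /* enable periodic interrupts if the timer advertises them */
        if (info.hi_flags && ioctl(fd, HPET_EPI, 0) < 0)
                goto err;

        /* arm the interrupt, then block until one arrives */
        if (ioctl(fd, HPET_IE_ON, 0) < 0)
                goto err;

        if (read(fd, &data, sizeof(data)) != sizeof(data))
                goto err;

        close(fd);
        return 0;

err:
        perror("hpet_wait_one_tick");
        close(fd);
        return -1;
}

A call such as hpet_wait_one_tick("/dev/hpet", 32) would mirror a single iteration of "hpet_example poll /dev/hpet 32 1", minus the timing output.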
diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile new file mode 100644 index 000000000..b78344e7b --- /dev/null +++ b/samples/trace_events/Makefile | |||
@@ -0,0 +1,15 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | # builds the trace events example kernel modules; | ||
3 | # then to use one (as root): insmod <module_name.ko> | ||
4 | |||
5 | # If you include a trace header outside of include/trace/events | ||
6 | # then the file that does the #define CREATE_TRACE_POINTS must | ||
7 | # have that tracer file in its main search path. This is because | ||
8 | # define_trace.h will include it, and must be able to find it from | ||
9 | # the include/trace directory. | ||
10 | # | ||
11 | # Here trace-events-sample.c does the CREATE_TRACE_POINTS. | ||
12 | # | ||
13 | CFLAGS_trace-events-sample.o := -I$(src) | ||
14 | |||
15 | obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o | ||
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c new file mode 100644 index 000000000..1a72b7d95 --- /dev/null +++ b/samples/trace_events/trace-events-sample.c | |||
@@ -0,0 +1,140 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/kthread.h> | ||
4 | |||
5 | /* | ||
6 | * Any file that uses trace points must include the header. | ||
7 | * But only one file must include the header by defining | ||
8 | * CREATE_TRACE_POINTS first. This will make the C code that | ||
9 | * creates the handles for the trace points. | ||
10 | */ | ||
11 | #define CREATE_TRACE_POINTS | ||
12 | #include "trace-events-sample.h" | ||
13 | |||
14 | static const char *random_strings[] = { | ||
15 | "Mother Goose", | ||
16 | "Snoopy", | ||
17 | "Gandalf", | ||
18 | "Frodo", | ||
19 | "One ring to rule them all" | ||
20 | }; | ||
21 | |||
22 | static void simple_thread_func(int cnt) | ||
23 | { | ||
24 | int array[6]; | ||
25 | int len = cnt % 5; | ||
26 | int i; | ||
27 | |||
28 | set_current_state(TASK_INTERRUPTIBLE); | ||
29 | schedule_timeout(HZ); | ||
30 | |||
31 | for (i = 0; i < len; i++) | ||
32 | array[i] = i + 1; | ||
33 | array[i] = 0; | ||
34 | |||
35 | /* Silly tracepoints */ | ||
36 | trace_foo_bar("hello", cnt, array, random_strings[len], | ||
37 | current->cpus_ptr); | ||
38 | |||
39 | trace_foo_with_template_simple("HELLO", cnt); | ||
40 | |||
41 | trace_foo_bar_with_cond("Some times print", cnt); | ||
42 | |||
43 | trace_foo_with_template_cond("prints other times", cnt); | ||
44 | |||
45 | trace_foo_with_template_print("I have to be different", cnt); | ||
46 | } | ||
47 | |||
48 | static int simple_thread(void *arg) | ||
49 | { | ||
50 | int cnt = 0; | ||
51 | |||
52 | while (!kthread_should_stop()) | ||
53 | simple_thread_func(cnt++); | ||
54 | |||
55 | return 0; | ||
56 | } | ||
57 | |||
58 | static struct task_struct *simple_tsk; | ||
59 | static struct task_struct *simple_tsk_fn; | ||
60 | |||
61 | static void simple_thread_func_fn(int cnt) | ||
62 | { | ||
63 | set_current_state(TASK_INTERRUPTIBLE); | ||
64 | schedule_timeout(HZ); | ||
65 | |||
66 | /* More silly tracepoints */ | ||
67 | trace_foo_bar_with_fn("Look at me", cnt); | ||
68 | trace_foo_with_template_fn("Look at me too", cnt); | ||
69 | } | ||
70 | |||
71 | static int simple_thread_fn(void *arg) | ||
72 | { | ||
73 | int cnt = 0; | ||
74 | |||
75 | while (!kthread_should_stop()) | ||
76 | simple_thread_func_fn(cnt++); | ||
77 | |||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | static DEFINE_MUTEX(thread_mutex); | ||
82 | static int simple_thread_cnt; | ||
83 | |||
84 | int foo_bar_reg(void) | ||
85 | { | ||
86 | mutex_lock(&thread_mutex); | ||
87 | if (simple_thread_cnt++) | ||
88 | goto out; | ||
89 | |||
90 | pr_info("Starting thread for foo_bar_fn\n"); | ||
91 | /* | ||
92 | * We shouldn't be able to start a trace when the module is | ||
93 | * unloading (there are other locks to prevent that). But | ||
94 | * for consistency's sake, we still take the thread_mutex. | ||
95 | */ | ||
96 | simple_tsk_fn = kthread_run(simple_thread_fn, NULL, "event-sample-fn"); | ||
97 | out: | ||
98 | mutex_unlock(&thread_mutex); | ||
99 | return 0; | ||
100 | } | ||
101 | |||
102 | void foo_bar_unreg(void) | ||
103 | { | ||
104 | mutex_lock(&thread_mutex); | ||
105 | if (--simple_thread_cnt) | ||
106 | goto out; | ||
107 | |||
108 | pr_info("Killing thread for foo_bar_fn\n"); | ||
109 | if (simple_tsk_fn) | ||
110 | kthread_stop(simple_tsk_fn); | ||
111 | simple_tsk_fn = NULL; | ||
112 | out: | ||
113 | mutex_unlock(&thread_mutex); | ||
114 | } | ||
115 | |||
116 | static int __init trace_event_init(void) | ||
117 | { | ||
118 | simple_tsk = kthread_run(simple_thread, NULL, "event-sample"); | ||
119 | if (IS_ERR(simple_tsk)) | ||
120 | return -1; | ||
121 | |||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | static void __exit trace_event_exit(void) | ||
126 | { | ||
127 | kthread_stop(simple_tsk); | ||
128 | mutex_lock(&thread_mutex); | ||
129 | if (simple_tsk_fn) | ||
130 | kthread_stop(simple_tsk_fn); | ||
131 | simple_tsk_fn = NULL; | ||
132 | mutex_unlock(&thread_mutex); | ||
133 | } | ||
134 | |||
135 | module_init(trace_event_init); | ||
136 | module_exit(trace_event_exit); | ||
137 | |||
138 | MODULE_AUTHOR("Steven Rostedt"); | ||
139 | MODULE_DESCRIPTION("trace-events-sample"); | ||
140 | MODULE_LICENSE("GPL"); | ||
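
As the comment at the top of trace-events-sample.c says, only one file in a module may define CREATE_TRACE_POINTS before including the trace header; every other file that wants to emit the same events just includes the header. A small sketch of such a hypothetical second source file (not part of this commit) follows; it would also need the -I$(src) include path set up by the Makefile above.

/* hypothetical second file in the same module */
#include <linux/module.h>

/* no CREATE_TRACE_POINTS here -- trace-events-sample.c already did that */
#include "trace-events-sample.h"

void sample_emit_from_second_file(int cnt)
{
        /* these compile down to no-ops until the events are enabled */
        trace_foo_with_template_simple("second file", cnt);
        trace_foo_bar_with_cond("second file, every 10th call", cnt);
}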
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h new file mode 100644 index 000000000..13a35f7cb --- /dev/null +++ b/samples/trace_events/trace-events-sample.h | |||
@@ -0,0 +1,524 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | /* | ||
3 | * If TRACE_SYSTEM is defined, that will be the directory created | ||
4 | * in the ftrace directory under /sys/kernel/tracing/events/<system> | ||
5 | * | ||
6 | * The define_trace.h below will also look for a file name of | ||
7 | * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here. | ||
8 | * In this case, it would look for sample-trace.h | ||
9 | * | ||
10 | * If the header name is different from the system name | ||
11 | * (as in this case), then you can override the header name that | ||
12 | * define_trace.h will look up by defining TRACE_INCLUDE_FILE | ||
13 | * | ||
14 | * This file is called trace-events-sample.h but we want the system | ||
15 | * to be called "sample-trace". Therefore we must define the name of this | ||
16 | * file: | ||
17 | * | ||
18 | * #define TRACE_INCLUDE_FILE trace-events-sample | ||
19 | * | ||
20 | * As we do at the bottom of this file. | ||
21 | * | ||
22 | * Notice that TRACE_SYSTEM should be defined outside of #if | ||
23 | * protection, just like TRACE_INCLUDE_FILE. | ||
24 | */ | ||
25 | #undef TRACE_SYSTEM | ||
26 | #define TRACE_SYSTEM sample-trace | ||
27 | |||
28 | /* | ||
29 | * TRACE_SYSTEM is expected to be a valid C variable name (alpha-numeric | ||
30 | * and underscore), although it may start with a number. If for some | ||
31 | * reason it is not, you need to add the following lines: | ||
32 | */ | ||
33 | #undef TRACE_SYSTEM_VAR | ||
34 | #define TRACE_SYSTEM_VAR sample_trace | ||
35 | /* | ||
36 | * But the above is only needed if TRACE_SYSTEM is not alpha-numeric | ||
37 | * and underscored. By default, TRACE_SYSTEM_VAR will be equal to | ||
38 | * TRACE_SYSTEM. As TRACE_SYSTEM_VAR must be alpha-numeric, if | ||
39 | * TRACE_SYSTEM is not, then TRACE_SYSTEM_VAR must be defined with | ||
40 | * only alpha-numeric and underscores. | ||
41 | * | ||
42 | * The TRACE_SYSTEM_VAR is only used internally and not visible to | ||
43 | * user space. | ||
44 | */ | ||
45 | |||
46 | /* | ||
47 | * Notice that this file is not protected like a normal header. | ||
48 | * We also must allow for rereading of this file. The | ||
49 | * | ||
50 | * || defined(TRACE_HEADER_MULTI_READ) | ||
51 | * | ||
52 | * serves this purpose. | ||
53 | */ | ||
54 | #if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ) | ||
55 | #define _TRACE_EVENT_SAMPLE_H | ||
56 | |||
57 | /* | ||
58 | * All trace headers should include tracepoint.h, until we finally | ||
59 | * make it into a standard header. | ||
60 | */ | ||
61 | #include <linux/tracepoint.h> | ||
62 | |||
63 | /* | ||
64 | * The TRACE_EVENT macro is broken up into 5 parts. | ||
65 | * | ||
66 | * name: name of the trace point. This is also how to enable the tracepoint. | ||
67 | * A function called trace_foo_bar() will be created. | ||
68 | * | ||
69 | * proto: the prototype of the function trace_foo_bar() | ||
70 | * Here it is trace_foo_bar(char *foo, int bar). | ||
71 | * | ||
72 | * args: must match the arguments in the prototype. | ||
73 | * Here it is simply "foo, bar". | ||
74 | * | ||
75 | * struct: This defines the way the data will be stored in the ring buffer. | ||
76 | * The items declared here become part of a special structure | ||
77 | * called "__entry", which can be used in the fast_assign part of the | ||
78 | * TRACE_EVENT macro. | ||
79 | * | ||
80 | * Here are the currently defined types you can use: | ||
81 | * | ||
82 | * __field : Is broken up into type and name, where type can be any | ||
83 | * primitive type (integer, long or pointer). | ||
84 | * | ||
85 | * __field(int, foo) | ||
86 | * | ||
87 | * __entry->foo = 5; | ||
88 | * | ||
89 | * __field_struct : This can be any static complex data type (struct, union | ||
90 | * but not an array). Be careful using complex types, as each | ||
91 | * event is limited in size, and copying large amounts of data | ||
92 | * into the ring buffer can slow things down. | ||
93 | * | ||
94 | * __field_struct(struct bar, foo) | ||
95 | * | ||
96 | * __entry->bar.x = y; | ||
97 | |||
98 | * __array: There are three fields (type, name, size). The type is the | ||
99 | * type of elements in the array, the name is the name of the array. | ||
100 | * size is the number of items in the array (not the total size). | ||
101 | * | ||
102 | * __array( char, foo, 10) is the same as saying: char foo[10]; | ||
103 | * | ||
104 | * Assigning arrays can be done like any array: | ||
105 | * | ||
106 | * __entry->foo[0] = 'a'; | ||
107 | * | ||
108 | * memcpy(__entry->foo, bar, 10); | ||
109 | * | ||
110 | * __dynamic_array: This is similar to array, but can vary its size from | ||
111 | * instance to instance of the tracepoint being called. | ||
112 | * Like __array, this too has three elements (type, name, size); | ||
113 | * type is the type of the element, name is the name of the array. | ||
114 | * The size is different than __array. It is not a static number, | ||
115 | * but the algorithm to figure out the length of the array for the | ||
116 | * specific instance of tracepoint. Again, size is the number of | ||
117 | * items in the array, not the total length in bytes. | ||
118 | * | ||
119 | * __dynamic_array( int, foo, bar) is similar to: int foo[bar]; | ||
120 | * | ||
121 | * Note, unlike arrays, you must use the __get_dynamic_array() macro | ||
122 | * to access the array. | ||
123 | * | ||
124 | * memcpy(__get_dynamic_array(foo), bar, 10); | ||
125 | * | ||
126 | * Notice, that "__entry" is not needed here. | ||
127 | * | ||
128 | * __string: This is a special kind of __dynamic_array. It expects to | ||
129 | * have a null terminated character array passed to it (it allows | ||
130 | * for NULL too, which would be converted into "(null)"). __string | ||
131 | * takes two parameters (name, src), where name is the name of | ||
132 | * the string saved, and src is the string to copy into the | ||
133 | * ring buffer. | ||
134 | * | ||
135 | * __string(foo, bar) is similar to: strcpy(foo, bar) | ||
136 | * | ||
137 | * To assign a string, use the helper macro __assign_str(). | ||
138 | * | ||
139 | * __assign_str(foo, bar); | ||
140 | * | ||
141 | * In most cases, the __assign_str() macro will take the same | ||
142 | * parameters as the __string() macro had to declare the string. | ||
143 | * | ||
144 | * __bitmask: This is another kind of __dynamic_array, but it expects | ||
145 | * an array of longs, and the number of bits to parse. It takes | ||
146 | * two parameters (name, nr_bits), where name is the name of the | ||
147 | * bitmask to save, and the nr_bits is the number of bits to record. | ||
148 | * | ||
149 | * __bitmask(target_cpu, nr_cpumask_bits) | ||
150 | * | ||
151 | * To assign a bitmask, use the __assign_bitmask() helper macro. | ||
152 | * | ||
153 | * __assign_bitmask(target_cpus, cpumask_bits(bar), nr_cpumask_bits); | ||
154 | * | ||
155 | * | ||
156 | * fast_assign: This is a C like function that is used to store the items | ||
157 | * into the ring buffer. A special variable called "__entry" will be the | ||
158 | * structure that points into the ring buffer and has the same fields as | ||
159 | * described by the struct part of TRACE_EVENT above. | ||
160 | * | ||
161 | * printk: This is a way to print out the data in pretty print. This is | ||
162 | * useful if the system crashes and you are logging via a serial line; | ||
163 | * the data can be printed to the console using this "printk" method. | ||
164 | * This is also used to print out the data from the trace files. | ||
165 | * Again, the __entry macro is used to access the data from the ring buffer. | ||
166 | * | ||
167 | * Note, __dynamic_array, __string, and __bitmask require special helpers | ||
168 | * to access the data. | ||
169 | * | ||
170 | * For __dynamic_array(int, foo, bar) use __get_dynamic_array(foo) | ||
171 | * Use __get_dynamic_array_len(foo) to get the length of the array | ||
172 | * saved. Note, __get_dynamic_array_len() returns the total allocated | ||
173 | * length of the dynamic array; __print_array() expects the second | ||
174 | * parameter to be the number of elements. To get that, the array length | ||
175 | * needs to be divided by the element size. | ||
176 | * | ||
177 | * For __string(foo, bar) use __get_str(foo) | ||
178 | * | ||
179 | * For __bitmask(target_cpus, nr_cpumask_bits) use __get_bitmask(target_cpus) | ||
180 | * | ||
181 | * | ||
182 | * Note, that for both the assign and the printk, __entry is the handler | ||
183 | * to the data structure in the ring buffer, and is defined by the | ||
184 | * TP_STRUCT__entry. | ||
185 | */ | ||
186 | |||
187 | /* | ||
188 | * It is OK to have helper functions in the file, but they need to be protected | ||
189 | * from being defined more than once. Remember, this file gets included more | ||
190 | * than once. | ||
191 | */ | ||
192 | #ifndef __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS | ||
193 | #define __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS | ||
194 | static inline int __length_of(const int *list) | ||
195 | { | ||
196 | int i; | ||
197 | |||
198 | if (!list) | ||
199 | return 0; | ||
200 | |||
201 | for (i = 0; list[i]; i++) | ||
202 | ; | ||
203 | return i; | ||
204 | } | ||
205 | |||
206 | enum { | ||
207 | TRACE_SAMPLE_FOO = 2, | ||
208 | TRACE_SAMPLE_BAR = 4, | ||
209 | TRACE_SAMPLE_ZOO = 8, | ||
210 | }; | ||
211 | #endif | ||
212 | |||
213 | /* | ||
214 | * If enums are used in the TP_printk(), their names will be shown in | ||
215 | * format files and not their values. This can cause problems with user | ||
216 | * space programs that parse the format files to know how to translate | ||
217 | * the raw binary trace output into human readable text. | ||
218 | * | ||
219 | * To help out user space programs, any enum that is used in the TP_printk() | ||
220 | * should be defined by the TRACE_DEFINE_ENUM() macro. All that needs to | ||
221 | * be done is to add this macro with the enum within it in the trace | ||
222 | * header file, and it will be converted in the output. | ||
223 | */ | ||
224 | |||
225 | TRACE_DEFINE_ENUM(TRACE_SAMPLE_FOO); | ||
226 | TRACE_DEFINE_ENUM(TRACE_SAMPLE_BAR); | ||
227 | TRACE_DEFINE_ENUM(TRACE_SAMPLE_ZOO); | ||
228 | |||
229 | TRACE_EVENT(foo_bar, | ||
230 | |||
231 | TP_PROTO(const char *foo, int bar, const int *lst, | ||
232 | const char *string, const struct cpumask *mask), | ||
233 | |||
234 | TP_ARGS(foo, bar, lst, string, mask), | ||
235 | |||
236 | TP_STRUCT__entry( | ||
237 | __array( char, foo, 10 ) | ||
238 | __field( int, bar ) | ||
239 | __dynamic_array(int, list, __length_of(lst)) | ||
240 | __string( str, string ) | ||
241 | __bitmask( cpus, num_possible_cpus() ) | ||
242 | ), | ||
243 | |||
244 | TP_fast_assign( | ||
245 | strlcpy(__entry->foo, foo, 10); | ||
246 | __entry->bar = bar; | ||
247 | memcpy(__get_dynamic_array(list), lst, | ||
248 | __length_of(lst) * sizeof(int)); | ||
249 | __assign_str(str, string); | ||
250 | __assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus()); | ||
251 | ), | ||
252 | |||
253 | TP_printk("foo %s %d %s %s %s %s (%s)", __entry->foo, __entry->bar, | ||
254 | |||
255 | /* | ||
256 | * Notice here the use of some helper functions. This includes: | ||
257 | * | ||
258 | * __print_symbolic( variable, { value, "string" }, ... ), | ||
259 | * | ||
260 | * The variable is tested against each value of the { } pair. If | ||
261 | * the variable matches one of the values, then it will print the | ||
262 | * string in that pair. If none are matched, it returns a string | ||
263 | * version of the number (if __entry->bar == 7 then "7" is returned). | ||
264 | */ | ||
265 | __print_symbolic(__entry->bar, | ||
266 | { 0, "zero" }, | ||
267 | { TRACE_SAMPLE_FOO, "TWO" }, | ||
268 | { TRACE_SAMPLE_BAR, "FOUR" }, | ||
269 | { TRACE_SAMPLE_ZOO, "EIGHT" }, | ||
270 | { 10, "TEN" } | ||
271 | ), | ||
272 | |||
273 | /* | ||
274 | * __print_flags( variable, "delim", { value, "flag" }, ... ), | ||
275 | * | ||
276 | * This is similar to __print_symbolic, except that it tests the bits | ||
277 | * of the value. If ((FLAG & variable) == FLAG) then the string is | ||
278 | * printed. If more than one flag matches, then each one that does is | ||
279 | * also printed with delim in between them. | ||
280 | * If not all bits are accounted for, then the remaining bits will be | ||
281 | * added in hex format: with the flags below, 0x506 will show BIT2|BIT3|0x500 | ||
282 | */ | ||
283 | __print_flags(__entry->bar, "|", | ||
284 | { 1, "BIT1" }, | ||
285 | { 2, "BIT2" }, | ||
286 | { 4, "BIT3" }, | ||
287 | { 8, "BIT4" } | ||
288 | ), | ||
289 | /* | ||
290 | * __print_array( array, len, element_size ) | ||
291 | * | ||
292 | * This prints out the array that is defined by __array in a nice format. | ||
293 | */ | ||
294 | __print_array(__get_dynamic_array(list), | ||
295 | __get_dynamic_array_len(list) / sizeof(int), | ||
296 | sizeof(int)), | ||
297 | __get_str(str), __get_bitmask(cpus)) | ||
298 | ); | ||
299 | |||
300 | /* | ||
301 | * There may be a case where a tracepoint should only be called if | ||
302 | * some condition is set. Otherwise the tracepoint should not be called. | ||
303 | * But doing something like: | ||
304 | * | ||
305 | * if (cond) | ||
306 | * trace_foo(); | ||
307 | * | ||
308 | * would cause a little overhead when tracing is not enabled, and that | ||
309 | * overhead, even if small, is not something we want. As tracepoints | ||
310 | * use static branch (aka jump_labels), where no branch is taken to | ||
311 | * skip the tracepoint when not enabled, and a jmp is placed to jump | ||
312 | * to the tracepoint code when it is enabled, having an if statement | ||
313 | * nullifies that optimization. It would be nice to place that | ||
314 | * condition within the static branch. This is where TRACE_EVENT_CONDITION | ||
315 | * comes in. | ||
316 | * | ||
317 | * TRACE_EVENT_CONDITION() is just like TRACE_EVENT, except it adds another | ||
318 | * parameter just after args. Where TRACE_EVENT has: | ||
319 | * | ||
320 | * TRACE_EVENT(name, proto, args, struct, assign, printk) | ||
321 | * | ||
322 | * the CONDITION version has: | ||
323 | * | ||
324 | * TRACE_EVENT_CONDITION(name, proto, args, cond, struct, assign, printk) | ||
325 | * | ||
326 | * Everything is the same as TRACE_EVENT except for the new cond. Think | ||
327 | * of the cond variable as: | ||
328 | * | ||
329 | * if (cond) | ||
330 | * trace_foo_bar_with_cond(); | ||
331 | * | ||
332 | * Except that the logic for the if branch is placed after the static branch. | ||
333 | * That is, the if statement that processes the condition will not be | ||
334 | * executed unless that tracepoint is enabled. Otherwise it still remains | ||
335 | * a nop. | ||
336 | */ | ||
337 | TRACE_EVENT_CONDITION(foo_bar_with_cond, | ||
338 | |||
339 | TP_PROTO(const char *foo, int bar), | ||
340 | |||
341 | TP_ARGS(foo, bar), | ||
342 | |||
343 | TP_CONDITION(!(bar % 10)), | ||
344 | |||
345 | TP_STRUCT__entry( | ||
346 | __string( foo, foo ) | ||
347 | __field( int, bar ) | ||
348 | ), | ||
349 | |||
350 | TP_fast_assign( | ||
351 | __assign_str(foo, foo); | ||
352 | __entry->bar = bar; | ||
353 | ), | ||
354 | |||
355 | TP_printk("foo %s %d", __get_str(foo), __entry->bar) | ||
356 | ); | ||
357 | |||
358 | int foo_bar_reg(void); | ||
359 | void foo_bar_unreg(void); | ||
360 | |||
361 | /* | ||
362 | * Now in the case that some function needs to be called when the | ||
363 | * tracepoint is enabled and/or when it is disabled, the | ||
364 | * TRACE_EVENT_FN() serves this purpose. This is just like TRACE_EVENT() | ||
365 | * but adds two more parameters at the end: | ||
366 | * | ||
367 | * TRACE_EVENT_FN( name, proto, args, struct, assign, printk, reg, unreg) | ||
368 | * | ||
369 | * reg and unreg are functions with the prototype of: | ||
370 | * | ||
371 | * void reg(void) | ||
372 | * | ||
373 | * The reg function gets called before the tracepoint is enabled, and | ||
374 | * the unreg function gets called after the tracepoint is disabled. | ||
375 | * | ||
376 | * Note, reg and unreg are allowed to be NULL. If you only need to | ||
377 | * call a function before enabling, or after disabling, just set one | ||
378 | * function and pass in NULL for the other parameter. | ||
379 | */ | ||
380 | TRACE_EVENT_FN(foo_bar_with_fn, | ||
381 | |||
382 | TP_PROTO(const char *foo, int bar), | ||
383 | |||
384 | TP_ARGS(foo, bar), | ||
385 | |||
386 | TP_STRUCT__entry( | ||
387 | __string( foo, foo ) | ||
388 | __field( int, bar ) | ||
389 | ), | ||
390 | |||
391 | TP_fast_assign( | ||
392 | __assign_str(foo, foo); | ||
393 | __entry->bar = bar; | ||
394 | ), | ||
395 | |||
396 | TP_printk("foo %s %d", __get_str(foo), __entry->bar), | ||
397 | |||
398 | foo_bar_reg, foo_bar_unreg | ||
399 | ); | ||
400 | |||
401 | /* | ||
402 | * Each TRACE_EVENT macro creates several helper functions to produce | ||
403 | * the code to add the tracepoint, create the files in the trace | ||
404 | * directory, hook it to perf, assign the values and to print out | ||
405 | * the raw data from the ring buffer. To prevent too much bloat, | ||
406 | * if more than one tracepoint uses the same format | ||
407 | * for the proto, args, struct, assign and printk, and only the name | ||
408 | * is different, it is highly recommended to use the DECLARE_EVENT_CLASS | ||
409 | * | ||
410 | * DECLARE_EVENT_CLASS() macro creates most of the functions for the | ||
411 | * tracepoint. Then DEFINE_EVENT() is used to hook a tracepoint to those | ||
412 | * functions. This DEFINE_EVENT() is an instance of the class and can | ||
413 | * be enabled and disabled separately from other events (either TRACE_EVENT | ||
414 | * or other DEFINE_EVENT()s). | ||
415 | * | ||
416 | * Note, TRACE_EVENT() itself is simply defined as: | ||
417 | * | ||
418 | * #define TRACE_EVENT(name, proto, args, tstruct, assign, printk) \ | ||
419 | * DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, printk); \ | ||
420 | * DEFINE_EVENT(name, name, proto, args) | ||
421 | * | ||
422 | * The DEFINE_EVENT() also can be declared with conditions and reg functions: | ||
423 | * | ||
424 | * DEFINE_EVENT_CONDITION(template, name, proto, args, cond); | ||
425 | * DEFINE_EVENT_FN(template, name, proto, args, reg, unreg); | ||
426 | */ | ||
427 | DECLARE_EVENT_CLASS(foo_template, | ||
428 | |||
429 | TP_PROTO(const char *foo, int bar), | ||
430 | |||
431 | TP_ARGS(foo, bar), | ||
432 | |||
433 | TP_STRUCT__entry( | ||
434 | __string( foo, foo ) | ||
435 | __field( int, bar ) | ||
436 | ), | ||
437 | |||
438 | TP_fast_assign( | ||
439 | __assign_str(foo, foo); | ||
440 | __entry->bar = bar; | ||
441 | ), | ||
442 | |||
443 | TP_printk("foo %s %d", __get_str(foo), __entry->bar) | ||
444 | ); | ||
445 | |||
446 | /* | ||
447 | * Here's a better way for the previous samples (except, the first | ||
448 | * example had more fields and could not be used here). | ||
449 | */ | ||
450 | DEFINE_EVENT(foo_template, foo_with_template_simple, | ||
451 | TP_PROTO(const char *foo, int bar), | ||
452 | TP_ARGS(foo, bar)); | ||
453 | |||
454 | DEFINE_EVENT_CONDITION(foo_template, foo_with_template_cond, | ||
455 | TP_PROTO(const char *foo, int bar), | ||
456 | TP_ARGS(foo, bar), | ||
457 | TP_CONDITION(!(bar % 8))); | ||
458 | |||
459 | |||
460 | DEFINE_EVENT_FN(foo_template, foo_with_template_fn, | ||
461 | TP_PROTO(const char *foo, int bar), | ||
462 | TP_ARGS(foo, bar), | ||
463 | foo_bar_reg, foo_bar_unreg); | ||
464 | |||
465 | /* | ||
466 | * Anytime two events share basically the same values and have | ||
467 | * the same output, use the DECLARE_EVENT_CLASS() and DEFINE_EVENT() | ||
468 | * whenever possible. | ||
469 | */ | ||
470 | |||
471 | /* | ||
472 | * If the event is similar to the DECLARE_EVENT_CLASS, but you need | ||
473 | * to have a different output, then use DEFINE_EVENT_PRINT() which | ||
474 | * lets you override the TP_printk() of the class. | ||
475 | */ | ||
476 | |||
477 | DEFINE_EVENT_PRINT(foo_template, foo_with_template_print, | ||
478 | TP_PROTO(const char *foo, int bar), | ||
479 | TP_ARGS(foo, bar), | ||
480 | TP_printk("bar %s %d", __get_str(foo), __entry->bar)); | ||
481 | |||
482 | #endif | ||
483 | |||
484 | /***** NOTICE! The #if protection ends here. *****/ | ||
485 | |||
486 | |||
487 | /* | ||
488 | * There are several ways I could have done this. If I left out the | ||
489 | * TRACE_INCLUDE_PATH, then it would default to the kernel source | ||
490 | * include/trace/events directory. | ||
491 | * | ||
492 | * I could specify a path from the define_trace.h file back to this | ||
493 | * file. | ||
494 | * | ||
495 | * #define TRACE_INCLUDE_PATH ../../samples/trace_events | ||
496 | * | ||
497 | * But the safest and easiest way to simply make it use the directory | ||
498 | * that the file is in is to add in the Makefile: | ||
499 | * | ||
500 | * CFLAGS_trace-events-sample.o := -I$(src) | ||
501 | * | ||
502 | * This will make sure the current path is part of the include | ||
503 | * structure for our file so that define_trace.h can find it. | ||
504 | * | ||
505 | * I could have made only the top level directory the include: | ||
506 | * | ||
507 | * CFLAGS_trace-events-sample.o := -I$(PWD) | ||
508 | * | ||
509 | * And then let the path to this directory be the TRACE_INCLUDE_PATH: | ||
510 | * | ||
511 | * #define TRACE_INCLUDE_PATH samples/trace_events | ||
512 | * | ||
513 | * But then if something defines "samples" or "trace_events" as a macro | ||
514 | * then we could risk that being converted too, and give us an unexpected | ||
515 | * result. | ||
516 | */ | ||
517 | #undef TRACE_INCLUDE_PATH | ||
518 | #undef TRACE_INCLUDE_FILE | ||
519 | #define TRACE_INCLUDE_PATH . | ||
520 | /* | ||
521 | * TRACE_INCLUDE_FILE is not needed if the filename and TRACE_SYSTEM are equal | ||
522 | */ | ||
523 | #define TRACE_INCLUDE_FILE trace-events-sample | ||
524 | #include <trace/define_trace.h> | ||
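
Pulling the pieces documented above together, a trace header for one's own subsystem reduces to a fairly small skeleton. The sketch below uses hypothetical names (my_subsys, my_event, my-subsys-trace.h); as described above, it still needs one .c file that defines CREATE_TRACE_POINTS before including it, plus the CFLAGS_<file>.o := -I$(src) line in the Makefile.

/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM my_subsys

#if !defined(_MY_SUBSYS_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _MY_SUBSYS_TRACE_H

#include <linux/tracepoint.h>

TRACE_EVENT(my_event,

        TP_PROTO(const char *name, int value),

        TP_ARGS(name, value),

        TP_STRUCT__entry(
                __string(name, name)
                __field(int, value)
        ),

        TP_fast_assign(
                __assign_str(name, name);
                __entry->value = value;
        ),

        TP_printk("name=%s value=%d", __get_str(name), __entry->value)
);

#endif /* _MY_SUBSYS_TRACE_H */

/* this part must stay outside the #if protection, as explained above */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE my-subsys-trace
#include <trace/define_trace.h>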
diff --git a/samples/trace_printk/Makefile b/samples/trace_printk/Makefile new file mode 100644 index 000000000..c0df36167 --- /dev/null +++ b/samples/trace_printk/Makefile | |||
@@ -0,0 +1,7 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | # builds a module that calls various trace_printk routines | ||
3 | # then to use one (as root): insmod <module_name.ko> | ||
4 | |||
5 | # This module can also be used to test the trace_printk code. | ||
6 | |||
7 | obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace-printk.o | ||
diff --git a/samples/trace_printk/trace-printk.c b/samples/trace_printk/trace-printk.c new file mode 100644 index 000000000..cfc159580 --- /dev/null +++ b/samples/trace_printk/trace-printk.c | |||
@@ -0,0 +1,58 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/kthread.h> | ||
4 | #include <linux/irq_work.h> | ||
5 | |||
6 | /* Must not be static to force gcc to consider these non constant */ | ||
7 | char *trace_printk_test_global_str = | ||
8 | "This is a dynamic string that will use trace_puts\n"; | ||
9 | |||
10 | char *trace_printk_test_global_str_irq = | ||
11 | "(irq) This is a dynamic string that will use trace_puts\n"; | ||
12 | |||
13 | char *trace_printk_test_global_str_fmt = | ||
14 | "%sThis is a %s that will use trace_printk\n"; | ||
15 | |||
16 | static struct irq_work irqwork; | ||
17 | |||
18 | static void trace_printk_irq_work(struct irq_work *work) | ||
19 | { | ||
20 | trace_printk("(irq) This is a static string that will use trace_bputs\n"); | ||
21 | trace_printk(trace_printk_test_global_str_irq); | ||
22 | |||
23 | trace_printk("(irq) This is a %s that will use trace_bprintk()\n", | ||
24 | "static string"); | ||
25 | |||
26 | trace_printk(trace_printk_test_global_str_fmt, | ||
27 | "(irq) ", "dynamic string"); | ||
28 | } | ||
29 | |||
30 | static int __init trace_printk_init(void) | ||
31 | { | ||
32 | init_irq_work(&irqwork, trace_printk_irq_work); | ||
33 | |||
34 | trace_printk("This is a static string that will use trace_bputs\n"); | ||
35 | trace_printk(trace_printk_test_global_str); | ||
36 | |||
37 | /* Kick off printing in irq context */ | ||
38 | irq_work_queue(&irqwork); | ||
39 | irq_work_sync(&irqwork); | ||
40 | |||
41 | trace_printk("This is a %s that will use trace_bprintk()\n", | ||
42 | "static string"); | ||
43 | |||
44 | trace_printk(trace_printk_test_global_str_fmt, "", "dynamic string"); | ||
45 | |||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | static void __exit trace_printk_exit(void) | ||
50 | { | ||
51 | } | ||
52 | |||
53 | module_init(trace_printk_init); | ||
54 | module_exit(trace_printk_exit); | ||
55 | |||
56 | MODULE_AUTHOR("Steven Rostedt"); | ||
57 | MODULE_DESCRIPTION("trace-printk"); | ||
58 | MODULE_LICENSE("GPL"); | ||
diff --git a/samples/uhid/.gitignore b/samples/uhid/.gitignore new file mode 100644 index 000000000..0e0a5a929 --- /dev/null +++ b/samples/uhid/.gitignore | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | /uhid-example | ||
diff --git a/samples/uhid/Makefile b/samples/uhid/Makefile new file mode 100644 index 000000000..0aa424ec4 --- /dev/null +++ b/samples/uhid/Makefile | |||
@@ -0,0 +1,4 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | userprogs-always-y += uhid-example | ||
3 | |||
4 | userccflags += -I usr/include | ||
diff --git a/samples/uhid/uhid-example.c b/samples/uhid/uhid-example.c new file mode 100644 index 000000000..015cb06a2 --- /dev/null +++ b/samples/uhid/uhid-example.c | |||
@@ -0,0 +1,465 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * UHID Example | ||
4 | * | ||
5 | * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> | ||
6 | * | ||
7 | * The code may be used by anyone for any purpose, | ||
8 | * and can serve as a starting point for developing | ||
9 | * applications using uhid. | ||
10 | */ | ||
11 | |||
12 | /* | ||
13 | * UHID Example | ||
14 | * This example emulates a basic 3-button mouse with wheel over UHID. Run this | ||
15 | * program as root and then use the following keys to control the mouse: | ||
16 | * q: Quit the application | ||
17 | * 1: Toggle left button (down, up, ...) | ||
18 | * 2: Toggle right button | ||
19 | * 3: Toggle middle button | ||
20 | * a: Move mouse left | ||
21 | * d: Move mouse right | ||
22 | * w: Move mouse up | ||
23 | * s: Move mouse down | ||
24 | * r: Move wheel up | ||
25 | * f: Move wheel down | ||
26 | * | ||
27 | * In addition to the 3-button mouse, 3 keyboard LEDs are also supported (LED_NUML, | ||
28 | * LED_CAPSL and LED_SCROLLL). The device doesn't generate any related keyboard | ||
29 | * events, though. You need to manually write the EV_LED/LED_XY/1 activation | ||
30 | * input event to the evdev device to see it being sent to this device. | ||
31 | * | ||
32 | * If uhid is not available as /dev/uhid, then you can pass a different path as | ||
33 | * first argument. | ||
34 | * If <linux/uhid.h> is not installed in /usr, then compile this with: | ||
35 | * gcc -o ./uhid_test -Wall -I./include ./samples/uhid/uhid-example.c | ||
36 | * And ignore the warning about kernel headers. However, it is recommended to | ||
37 | * use the installed uhid.h if available. | ||
38 | */ | ||
39 | |||
40 | #include <errno.h> | ||
41 | #include <fcntl.h> | ||
42 | #include <poll.h> | ||
43 | #include <stdbool.h> | ||
44 | #include <stdio.h> | ||
45 | #include <stdlib.h> | ||
46 | #include <string.h> | ||
47 | #include <termios.h> | ||
48 | #include <unistd.h> | ||
49 | #include <linux/uhid.h> | ||
50 | |||
51 | /* | ||
52 | * HID Report Descriptor | ||
53 | * We emulate a basic 3 button mouse with wheel and 3 keyboard LEDs. This is | ||
54 | * the report-descriptor as the kernel will parse it: | ||
55 | * | ||
56 | * INPUT(1)[INPUT] | ||
57 | * Field(0) | ||
58 | * Physical(GenericDesktop.Pointer) | ||
59 | * Application(GenericDesktop.Mouse) | ||
60 | * Usage(3) | ||
61 | * Button.0001 | ||
62 | * Button.0002 | ||
63 | * Button.0003 | ||
64 | * Logical Minimum(0) | ||
65 | * Logical Maximum(1) | ||
66 | * Report Size(1) | ||
67 | * Report Count(3) | ||
68 | * Report Offset(0) | ||
69 | * Flags( Variable Absolute ) | ||
70 | * Field(1) | ||
71 | * Physical(GenericDesktop.Pointer) | ||
72 | * Application(GenericDesktop.Mouse) | ||
73 | * Usage(3) | ||
74 | * GenericDesktop.X | ||
75 | * GenericDesktop.Y | ||
76 | * GenericDesktop.Wheel | ||
77 | * Logical Minimum(-128) | ||
78 | * Logical Maximum(127) | ||
79 | * Report Size(8) | ||
80 | * Report Count(3) | ||
81 | * Report Offset(8) | ||
82 | * Flags( Variable Relative ) | ||
83 | * OUTPUT(2)[OUTPUT] | ||
84 | * Field(0) | ||
85 | * Application(GenericDesktop.Keyboard) | ||
86 | * Usage(3) | ||
87 | * LED.NumLock | ||
88 | * LED.CapsLock | ||
89 | * LED.ScrollLock | ||
90 | * Logical Minimum(0) | ||
91 | * Logical Maximum(1) | ||
92 | * Report Size(1) | ||
93 | * Report Count(3) | ||
94 | * Report Offset(0) | ||
95 | * Flags( Variable Absolute ) | ||
96 | * | ||
97 | * This is the mapping that we expect: | ||
98 | * Button.0001 ---> Key.LeftBtn | ||
99 | * Button.0002 ---> Key.RightBtn | ||
100 | * Button.0003 ---> Key.MiddleBtn | ||
101 | * GenericDesktop.X ---> Relative.X | ||
102 | * GenericDesktop.Y ---> Relative.Y | ||
103 | * GenericDesktop.Wheel ---> Relative.Wheel | ||
104 | * LED.NumLock ---> LED.NumLock | ||
105 | * LED.CapsLock ---> LED.CapsLock | ||
106 | * LED.ScrollLock ---> LED.ScrollLock | ||
107 | * | ||
108 | * This information can be verified by reading /sys/kernel/debug/hid/<dev>/rdesc | ||
109 | * This file should print the same information as shown above. | ||
110 | */ | ||
111 | |||
112 | static unsigned char rdesc[] = { | ||
113 | 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ | ||
114 | 0x09, 0x02, /* USAGE (Mouse) */ | ||
115 | 0xa1, 0x01, /* COLLECTION (Application) */ | ||
116 | 0x09, 0x01, /* USAGE (Pointer) */ | ||
117 | 0xa1, 0x00, /* COLLECTION (Physical) */ | ||
118 | 0x85, 0x01, /* REPORT_ID (1) */ | ||
119 | 0x05, 0x09, /* USAGE_PAGE (Button) */ | ||
120 | 0x19, 0x01, /* USAGE_MINIMUM (Button 1) */ | ||
121 | 0x29, 0x03, /* USAGE_MAXIMUM (Button 3) */ | ||
122 | 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ | ||
123 | 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ | ||
124 | 0x95, 0x03, /* REPORT_COUNT (3) */ | ||
125 | 0x75, 0x01, /* REPORT_SIZE (1) */ | ||
126 | 0x81, 0x02, /* INPUT (Data,Var,Abs) */ | ||
127 | 0x95, 0x01, /* REPORT_COUNT (1) */ | ||
128 | 0x75, 0x05, /* REPORT_SIZE (5) */ | ||
129 | 0x81, 0x01, /* INPUT (Cnst,Var,Abs) */ | ||
130 | 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ | ||
131 | 0x09, 0x30, /* USAGE (X) */ | ||
132 | 0x09, 0x31, /* USAGE (Y) */ | ||
133 | 0x09, 0x38, /* USAGE (WHEEL) */ | ||
134 | 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */ | ||
135 | 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */ | ||
136 | 0x75, 0x08, /* REPORT_SIZE (8) */ | ||
137 | 0x95, 0x03, /* REPORT_COUNT (3) */ | ||
138 | 0x81, 0x06, /* INPUT (Data,Var,Rel) */ | ||
139 | 0xc0, /* END_COLLECTION */ | ||
140 | 0xc0, /* END_COLLECTION */ | ||
141 | 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ | ||
142 | 0x09, 0x06, /* USAGE (Keyboard) */ | ||
143 | 0xa1, 0x01, /* COLLECTION (Application) */ | ||
144 | 0x85, 0x02, /* REPORT_ID (2) */ | ||
145 | 0x05, 0x08, /* USAGE_PAGE (Led) */ | ||
146 | 0x19, 0x01, /* USAGE_MINIMUM (1) */ | ||
147 | 0x29, 0x03, /* USAGE_MAXIMUM (3) */ | ||
148 | 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ | ||
149 | 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ | ||
150 | 0x95, 0x03, /* REPORT_COUNT (3) */ | ||
151 | 0x75, 0x01, /* REPORT_SIZE (1) */ | ||
152 | 0x91, 0x02, /* Output (Data,Var,Abs) */ | ||
153 | 0x95, 0x01, /* REPORT_COUNT (1) */ | ||
154 | 0x75, 0x05, /* REPORT_SIZE (5) */ | ||
155 | 0x91, 0x01, /* Output (Cnst,Var,Abs) */ | ||
156 | 0xc0, /* END_COLLECTION */ | ||
157 | }; | ||
158 | |||
159 | static int uhid_write(int fd, const struct uhid_event *ev) | ||
160 | { | ||
161 | ssize_t ret; | ||
162 | |||
163 | ret = write(fd, ev, sizeof(*ev)); | ||
164 | if (ret < 0) { | ||
165 | fprintf(stderr, "Cannot write to uhid: %m\n"); | ||
166 | return -errno; | ||
167 | } else if (ret != sizeof(*ev)) { | ||
168 | fprintf(stderr, "Wrong size written to uhid: %zd != %zu\n", | ||
169 | ret, sizeof(*ev)); | ||
170 | return -EFAULT; | ||
171 | } else { | ||
172 | return 0; | ||
173 | } | ||
174 | } | ||
175 | |||
176 | static int create(int fd) | ||
177 | { | ||
178 | struct uhid_event ev; | ||
179 | |||
180 | memset(&ev, 0, sizeof(ev)); | ||
181 | ev.type = UHID_CREATE; | ||
182 | strcpy((char*)ev.u.create.name, "test-uhid-device"); | ||
183 | ev.u.create.rd_data = rdesc; | ||
184 | ev.u.create.rd_size = sizeof(rdesc); | ||
185 | ev.u.create.bus = BUS_USB; | ||
186 | ev.u.create.vendor = 0x15d9; | ||
187 | ev.u.create.product = 0x0a37; | ||
188 | ev.u.create.version = 0; | ||
189 | ev.u.create.country = 0; | ||
190 | |||
191 | return uhid_write(fd, &ev); | ||
192 | } | ||
193 | |||
194 | static void destroy(int fd) | ||
195 | { | ||
196 | struct uhid_event ev; | ||
197 | |||
198 | memset(&ev, 0, sizeof(ev)); | ||
199 | ev.type = UHID_DESTROY; | ||
200 | |||
201 | uhid_write(fd, &ev); | ||
202 | } | ||
203 | |||
204 | /* This parses raw output reports sent by the kernel to the device. A normal | ||
205 | * uhid program shouldn't do this but instead just forward the raw report. | ||
206 | * However, for documentation purposes, we try to detect LED events here and | ||
207 | * print debug messages for it. */ | ||
208 | static void handle_output(struct uhid_event *ev) | ||
209 | { | ||
210 | /* LED messages are advertised via OUTPUT reports; ignore the rest */ | ||
211 | if (ev->u.output.rtype != UHID_OUTPUT_REPORT) | ||
212 | return; | ||
213 | /* LED reports have length 2 bytes */ | ||
214 | if (ev->u.output.size != 2) | ||
215 | return; | ||
216 | /* first byte is report-id which is 0x02 for LEDs in our rdesc */ | ||
217 | if (ev->u.output.data[0] != 0x2) | ||
218 | return; | ||
219 | |||
220 | /* print flags payload */ | ||
221 | fprintf(stderr, "LED output report received with flags %x\n", | ||
222 | ev->u.output.data[1]); | ||
223 | } | ||
224 | |||
225 | static int event(int fd) | ||
226 | { | ||
227 | struct uhid_event ev; | ||
228 | ssize_t ret; | ||
229 | |||
230 | memset(&ev, 0, sizeof(ev)); | ||
231 | ret = read(fd, &ev, sizeof(ev)); | ||
232 | if (ret == 0) { | ||
233 | fprintf(stderr, "Read HUP on uhid-cdev\n"); | ||
234 | return -EFAULT; | ||
235 | } else if (ret < 0) { | ||
236 | fprintf(stderr, "Cannot read uhid-cdev: %m\n"); | ||
237 | return -errno; | ||
238 | } else if (ret != sizeof(ev)) { | ||
239 | fprintf(stderr, "Invalid size read from uhid-dev: %zd != %zu\n", | ||
240 | ret, sizeof(ev)); | ||
241 | return -EFAULT; | ||
242 | } | ||
243 | |||
244 | switch (ev.type) { | ||
245 | case UHID_START: | ||
246 | fprintf(stderr, "UHID_START from uhid-dev\n"); | ||
247 | break; | ||
248 | case UHID_STOP: | ||
249 | fprintf(stderr, "UHID_STOP from uhid-dev\n"); | ||
250 | break; | ||
251 | case UHID_OPEN: | ||
252 | fprintf(stderr, "UHID_OPEN from uhid-dev\n"); | ||
253 | break; | ||
254 | case UHID_CLOSE: | ||
255 | fprintf(stderr, "UHID_CLOSE from uhid-dev\n"); | ||
256 | break; | ||
257 | case UHID_OUTPUT: | ||
258 | fprintf(stderr, "UHID_OUTPUT from uhid-dev\n"); | ||
259 | handle_output(&ev); | ||
260 | break; | ||
261 | case UHID_OUTPUT_EV: | ||
262 | fprintf(stderr, "UHID_OUTPUT_EV from uhid-dev\n"); | ||
263 | break; | ||
264 | default: | ||
265 | fprintf(stderr, "Invalid event from uhid-dev: %u\n", ev.type); | ||
266 | } | ||
267 | |||
268 | return 0; | ||
269 | } | ||
270 | |||
271 | static bool btn1_down; | ||
272 | static bool btn2_down; | ||
273 | static bool btn3_down; | ||
274 | static signed char abs_hor; | ||
275 | static signed char abs_ver; | ||
276 | static signed char wheel; | ||
277 | |||
278 | static int send_event(int fd) | ||
279 | { | ||
280 | struct uhid_event ev; | ||
281 | |||
282 | memset(&ev, 0, sizeof(ev)); | ||
283 | ev.type = UHID_INPUT; | ||
284 | ev.u.input.size = 5; | ||
285 | |||
286 | ev.u.input.data[0] = 0x1; | ||
287 | if (btn1_down) | ||
288 | ev.u.input.data[1] |= 0x1; | ||
289 | if (btn2_down) | ||
290 | ev.u.input.data[1] |= 0x2; | ||
291 | if (btn3_down) | ||
292 | ev.u.input.data[1] |= 0x4; | ||
293 | |||
294 | ev.u.input.data[2] = abs_hor; | ||
295 | ev.u.input.data[3] = abs_ver; | ||
296 | ev.u.input.data[4] = wheel; | ||
297 | |||
298 | return uhid_write(fd, &ev); | ||
299 | } | ||
300 | |||
301 | static int keyboard(int fd) | ||
302 | { | ||
303 | char buf[128]; | ||
304 | ssize_t ret, i; | ||
305 | |||
306 | ret = read(STDIN_FILENO, buf, sizeof(buf)); | ||
307 | if (ret == 0) { | ||
308 | fprintf(stderr, "Read HUP on stdin\n"); | ||
309 | return -EFAULT; | ||
310 | } else if (ret < 0) { | ||
311 | fprintf(stderr, "Cannot read stdin: %m\n"); | ||
312 | return -errno; | ||
313 | } | ||
314 | |||
315 | for (i = 0; i < ret; ++i) { | ||
316 | switch (buf[i]) { | ||
317 | case '1': | ||
318 | btn1_down = !btn1_down; | ||
319 | ret = send_event(fd); | ||
320 | if (ret) | ||
321 | return ret; | ||
322 | break; | ||
323 | case '2': | ||
324 | btn2_down = !btn2_down; | ||
325 | ret = send_event(fd); | ||
326 | if (ret) | ||
327 | return ret; | ||
328 | break; | ||
329 | case '3': | ||
330 | btn3_down = !btn3_down; | ||
331 | ret = send_event(fd); | ||
332 | if (ret) | ||
333 | return ret; | ||
334 | break; | ||
335 | case 'a': | ||
336 | abs_hor = -20; | ||
337 | ret = send_event(fd); | ||
338 | abs_hor = 0; | ||
339 | if (ret) | ||
340 | return ret; | ||
341 | break; | ||
342 | case 'd': | ||
343 | abs_hor = 20; | ||
344 | ret = send_event(fd); | ||
345 | abs_hor = 0; | ||
346 | if (ret) | ||
347 | return ret; | ||
348 | break; | ||
349 | case 'w': | ||
350 | abs_ver = -20; | ||
351 | ret = send_event(fd); | ||
352 | abs_ver = 0; | ||
353 | if (ret) | ||
354 | return ret; | ||
355 | break; | ||
356 | case 's': | ||
357 | abs_ver = 20; | ||
358 | ret = send_event(fd); | ||
359 | abs_ver = 0; | ||
360 | if (ret) | ||
361 | return ret; | ||
362 | break; | ||
363 | case 'r': | ||
364 | wheel = 1; | ||
365 | ret = send_event(fd); | ||
366 | wheel = 0; | ||
367 | if (ret) | ||
368 | return ret; | ||
369 | break; | ||
370 | case 'f': | ||
371 | wheel = -1; | ||
372 | ret = send_event(fd); | ||
373 | wheel = 0; | ||
374 | if (ret) | ||
375 | return ret; | ||
376 | break; | ||
377 | case 'q': | ||
378 | return -ECANCELED; | ||
379 | default: | ||
380 | fprintf(stderr, "Invalid input: %c\n", buf[i]); | ||
381 | } | ||
382 | } | ||
383 | |||
384 | return 0; | ||
385 | } | ||
386 | |||
387 | int main(int argc, char **argv) | ||
388 | { | ||
389 | int fd; | ||
390 | const char *path = "/dev/uhid"; | ||
391 | struct pollfd pfds[2]; | ||
392 | int ret; | ||
393 | struct termios state; | ||
394 | |||
395 | ret = tcgetattr(STDIN_FILENO, &state); | ||
396 | if (ret) { | ||
397 | fprintf(stderr, "Cannot get tty state\n"); | ||
398 | } else { | ||
399 | state.c_lflag &= ~ICANON; | ||
400 | state.c_cc[VMIN] = 1; | ||
401 | ret = tcsetattr(STDIN_FILENO, TCSANOW, &state); | ||
402 | if (ret) | ||
403 | fprintf(stderr, "Cannot set tty state\n"); | ||
404 | } | ||
405 | |||
406 | if (argc >= 2) { | ||
407 | if (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) { | ||
408 | fprintf(stderr, "Usage: %s [%s]\n", argv[0], path); | ||
409 | return EXIT_SUCCESS; | ||
410 | } else { | ||
411 | path = argv[1]; | ||
412 | } | ||
413 | } | ||
414 | |||
415 | fprintf(stderr, "Open uhid-cdev %s\n", path); | ||
416 | fd = open(path, O_RDWR | O_CLOEXEC); | ||
417 | if (fd < 0) { | ||
418 | fprintf(stderr, "Cannot open uhid-cdev %s: %m\n", path); | ||
419 | return EXIT_FAILURE; | ||
420 | } | ||
421 | |||
422 | fprintf(stderr, "Create uhid device\n"); | ||
423 | ret = create(fd); | ||
424 | if (ret) { | ||
425 | close(fd); | ||
426 | return EXIT_FAILURE; | ||
427 | } | ||
428 | |||
429 | pfds[0].fd = STDIN_FILENO; | ||
430 | pfds[0].events = POLLIN; | ||
431 | pfds[1].fd = fd; | ||
432 | pfds[1].events = POLLIN; | ||
433 | |||
434 | fprintf(stderr, "Press 'q' to quit...\n"); | ||
435 | while (1) { | ||
436 | ret = poll(pfds, 2, -1); | ||
437 | if (ret < 0) { | ||
438 | fprintf(stderr, "Cannot poll for fds: %m\n"); | ||
439 | break; | ||
440 | } | ||
441 | if (pfds[0].revents & POLLHUP) { | ||
442 | fprintf(stderr, "Received HUP on stdin\n"); | ||
443 | break; | ||
444 | } | ||
445 | if (pfds[1].revents & POLLHUP) { | ||
446 | fprintf(stderr, "Received HUP on uhid-cdev\n"); | ||
447 | break; | ||
448 | } | ||
449 | |||
450 | if (pfds[0].revents & POLLIN) { | ||
451 | ret = keyboard(fd); | ||
452 | if (ret) | ||
453 | break; | ||
454 | } | ||
455 | if (pfds[1].revents & POLLIN) { | ||
456 | ret = event(fd); | ||
457 | if (ret) | ||
458 | break; | ||
459 | } | ||
460 | } | ||
461 | |||
462 | fprintf(stderr, "Destroy uhid device\n"); | ||
463 | destroy(fd); | ||
464 | return EXIT_SUCCESS; | ||
465 | } | ||
diff --git a/samples/v4l/Makefile b/samples/v4l/Makefile new file mode 100644 index 000000000..f86ab1245 --- /dev/null +++ b/samples/v4l/Makefile | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | obj-$(CONFIG_VIDEO_PCI_SKELETON) := v4l2-pci-skeleton.o | ||
diff --git a/samples/v4l/v4l2-pci-skeleton.c b/samples/v4l/v4l2-pci-skeleton.c new file mode 100644 index 000000000..3fa6582b4 --- /dev/null +++ b/samples/v4l/v4l2-pci-skeleton.c | |||
@@ -0,0 +1,915 @@ | |||
1 | /* | ||
2 | * This is a V4L2 PCI Skeleton Driver. It gives an initial skeleton source | ||
3 | * for use with other PCI drivers. | ||
4 | * | ||
5 | * This skeleton PCI driver assumes that the card has an S-Video connector as | ||
6 | * input 0 and an HDMI connector as input 1. | ||
7 | * | ||
8 | * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you may redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License as published by | ||
12 | * the Free Software Foundation; version 2 of the License. | ||
13 | * | ||
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
15 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
16 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
17 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
18 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
19 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
20 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
21 | * SOFTWARE. | ||
22 | */ | ||
23 | |||
24 | #include <linux/types.h> | ||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/module.h> | ||
27 | #include <linux/init.h> | ||
28 | #include <linux/kmod.h> | ||
29 | #include <linux/mutex.h> | ||
30 | #include <linux/pci.h> | ||
31 | #include <linux/interrupt.h> | ||
32 | #include <linux/videodev2.h> | ||
33 | #include <linux/v4l2-dv-timings.h> | ||
34 | #include <media/v4l2-device.h> | ||
35 | #include <media/v4l2-dev.h> | ||
36 | #include <media/v4l2-ioctl.h> | ||
37 | #include <media/v4l2-dv-timings.h> | ||
38 | #include <media/v4l2-ctrls.h> | ||
39 | #include <media/v4l2-event.h> | ||
40 | #include <media/videobuf2-v4l2.h> | ||
41 | #include <media/videobuf2-dma-contig.h> | ||
42 | |||
43 | MODULE_DESCRIPTION("V4L2 PCI Skeleton Driver"); | ||
44 | MODULE_AUTHOR("Hans Verkuil"); | ||
45 | MODULE_LICENSE("GPL v2"); | ||
46 | |||
47 | /** | ||
48 | * struct skeleton - All internal data for one instance of device | ||
49 | * @pdev: PCI device | ||
50 | * @v4l2_dev: top-level v4l2 device struct | ||
51 | * @vdev: video node structure | ||
52 | * @ctrl_handler: control handler structure | ||
53 | * @lock: ioctl serialization mutex | ||
54 | * @std: current SDTV standard | ||
55 | * @timings: current HDTV timings | ||
56 | * @format: current pix format | ||
57 | * @input: current video input (0 = SDTV, 1 = HDTV) | ||
58 | * @queue: vb2 video capture queue | ||
59 | * @qlock: spinlock controlling access to buf_list and sequence | ||
60 | * @buf_list: list of buffers queued for DMA | ||
61 | * @field: the field (TOP/BOTTOM/other) of the current buffer | ||
62 | * @sequence: frame sequence counter | ||
63 | */ | ||
64 | struct skeleton { | ||
65 | struct pci_dev *pdev; | ||
66 | struct v4l2_device v4l2_dev; | ||
67 | struct video_device vdev; | ||
68 | struct v4l2_ctrl_handler ctrl_handler; | ||
69 | struct mutex lock; | ||
70 | v4l2_std_id std; | ||
71 | struct v4l2_dv_timings timings; | ||
72 | struct v4l2_pix_format format; | ||
73 | unsigned input; | ||
74 | |||
75 | struct vb2_queue queue; | ||
76 | |||
77 | spinlock_t qlock; | ||
78 | struct list_head buf_list; | ||
79 | unsigned field; | ||
80 | unsigned sequence; | ||
81 | }; | ||
82 | |||
83 | struct skel_buffer { | ||
84 | struct vb2_v4l2_buffer vb; | ||
85 | struct list_head list; | ||
86 | }; | ||
87 | |||
88 | static inline struct skel_buffer *to_skel_buffer(struct vb2_v4l2_buffer *vbuf) | ||
89 | { | ||
90 | return container_of(vbuf, struct skel_buffer, vb); | ||
91 | } | ||
92 | |||
93 | static const struct pci_device_id skeleton_pci_tbl[] = { | ||
94 | /* { PCI_DEVICE(PCI_VENDOR_ID_, PCI_DEVICE_ID_) }, */ | ||
95 | { 0, } | ||
96 | }; | ||
97 | MODULE_DEVICE_TABLE(pci, skeleton_pci_tbl); | ||
98 | |||
99 | /* | ||
100 | * HDTV: this structure has the capabilities of the HDTV receiver. | ||
101 | * It is used to constrain the huge list of possible formats based | ||
102 | * upon the hardware capabilities. | ||
103 | */ | ||
104 | static const struct v4l2_dv_timings_cap skel_timings_cap = { | ||
105 | .type = V4L2_DV_BT_656_1120, | ||
106 | /* keep this initialization for compatibility with GCC < 4.4.6 */ | ||
107 | .reserved = { 0 }, | ||
108 | V4L2_INIT_BT_TIMINGS( | ||
109 | 720, 1920, /* min/max width */ | ||
110 | 480, 1080, /* min/max height */ | ||
111 | 27000000, 74250000, /* min/max pixelclock */ | ||
112 | V4L2_DV_BT_STD_CEA861, /* Supported standards */ | ||
113 | /* capabilities */ | ||
114 | V4L2_DV_BT_CAP_INTERLACED | V4L2_DV_BT_CAP_PROGRESSIVE | ||
115 | ) | ||
116 | }; | ||
117 | |||
118 | /* | ||
119 | * Supported SDTV standards. This does the same job as skel_timings_cap, but | ||
120 | * for standard TV formats. | ||
121 | */ | ||
122 | #define SKEL_TVNORMS V4L2_STD_ALL | ||
123 | |||
124 | /* | ||
125 | * Interrupt handler: typically interrupts happen after a new frame has been | ||
126 | * captured. It is the job of the handler to remove the new frame from the | ||
127 | * internal list and give it back to the vb2 framework, updating the sequence | ||
128 | * counter, field and timestamp at the same time. | ||
129 | */ | ||
130 | static irqreturn_t skeleton_irq(int irq, void *dev_id) | ||
131 | { | ||
132 | #ifdef TODO | ||
133 | struct skeleton *skel = dev_id; | ||
134 | |||
135 | /* handle interrupt */ | ||
136 | |||
137 | /* Once a new frame has been captured, mark it as done like this: */ | ||
138 | if (captured_new_frame) { | ||
139 | ... | ||
140 | spin_lock(&skel->qlock); | ||
141 | list_del(&new_buf->list); | ||
142 | spin_unlock(&skel->qlock); | ||
143 | new_buf->vb.vb2_buf.timestamp = ktime_get_ns(); | ||
144 | new_buf->vb.sequence = skel->sequence++; | ||
145 | new_buf->vb.field = skel->field; | ||
146 | if (skel->format.field == V4L2_FIELD_ALTERNATE) { | ||
147 | if (skel->field == V4L2_FIELD_BOTTOM) | ||
148 | skel->field = V4L2_FIELD_TOP; | ||
149 | else if (skel->field == V4L2_FIELD_TOP) | ||
150 | skel->field = V4L2_FIELD_BOTTOM; | ||
151 | } | ||
152 | vb2_buffer_done(&new_buf->vb.vb2_buf, VB2_BUF_STATE_DONE); | ||
153 | } | ||
154 | #endif | ||
155 | return IRQ_HANDLED; | ||
156 | } | ||
157 | |||
158 | /* | ||
159 | * Setup the constraints of the queue: besides setting the number of planes | ||
160 | * per buffer and the size and allocation context of each plane, it also | ||
161 | * checks if sufficient buffers have been allocated. Usually 3 is a good | ||
162 | * minimum number: many DMA engines need a minimum of 2 buffers in the | ||
163 | * queue and you need to have another available for userspace processing. | ||
164 | */ | ||
165 | static int queue_setup(struct vb2_queue *vq, | ||
166 | unsigned int *nbuffers, unsigned int *nplanes, | ||
167 | unsigned int sizes[], struct device *alloc_devs[]) | ||
168 | { | ||
169 | struct skeleton *skel = vb2_get_drv_priv(vq); | ||
170 | |||
171 | skel->field = skel->format.field; | ||
172 | if (skel->field == V4L2_FIELD_ALTERNATE) { | ||
173 | /* | ||
174 | * You cannot use read() with FIELD_ALTERNATE since the field | ||
175 | * information (TOP/BOTTOM) cannot be passed back to the user. | ||
176 | */ | ||
177 | if (vb2_fileio_is_active(vq)) | ||
178 | return -EINVAL; | ||
179 | skel->field = V4L2_FIELD_TOP; | ||
180 | } | ||
181 | |||
182 | if (vq->num_buffers + *nbuffers < 3) | ||
183 | *nbuffers = 3 - vq->num_buffers; | ||
184 | |||
185 | if (*nplanes) | ||
186 | return sizes[0] < skel->format.sizeimage ? -EINVAL : 0; | ||
187 | *nplanes = 1; | ||
188 | sizes[0] = skel->format.sizeimage; | ||
189 | return 0; | ||
190 | } | ||
191 | |||
192 | /* | ||
193 | * Prepare the buffer for queueing to the DMA engine: check and set the | ||
194 | * payload size. | ||
195 | */ | ||
196 | static int buffer_prepare(struct vb2_buffer *vb) | ||
197 | { | ||
198 | struct skeleton *skel = vb2_get_drv_priv(vb->vb2_queue); | ||
199 | unsigned long size = skel->format.sizeimage; | ||
200 | |||
201 | if (vb2_plane_size(vb, 0) < size) { | ||
202 | dev_err(&skel->pdev->dev, "buffer too small (%lu < %lu)\n", | ||
203 | vb2_plane_size(vb, 0), size); | ||
204 | return -EINVAL; | ||
205 | } | ||
206 | |||
207 | vb2_set_plane_payload(vb, 0, size); | ||
208 | return 0; | ||
209 | } | ||
210 | |||
211 | /* | ||
212 | * Queue this buffer to the DMA engine. | ||
213 | */ | ||
214 | static void buffer_queue(struct vb2_buffer *vb) | ||
215 | { | ||
216 | struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); | ||
217 | struct skeleton *skel = vb2_get_drv_priv(vb->vb2_queue); | ||
218 | struct skel_buffer *buf = to_skel_buffer(vbuf); | ||
219 | unsigned long flags; | ||
220 | |||
221 | spin_lock_irqsave(&skel->qlock, flags); | ||
222 | list_add_tail(&buf->list, &skel->buf_list); | ||
223 | |||
224 | /* TODO: Update any DMA pointers if necessary */ | ||
225 | |||
226 | spin_unlock_irqrestore(&skel->qlock, flags); | ||
227 | } | ||
228 | |||
229 | static void return_all_buffers(struct skeleton *skel, | ||
230 | enum vb2_buffer_state state) | ||
231 | { | ||
232 | struct skel_buffer *buf, *node; | ||
233 | unsigned long flags; | ||
234 | |||
235 | spin_lock_irqsave(&skel->qlock, flags); | ||
236 | list_for_each_entry_safe(buf, node, &skel->buf_list, list) { | ||
237 | vb2_buffer_done(&buf->vb.vb2_buf, state); | ||
238 | list_del(&buf->list); | ||
239 | } | ||
240 | spin_unlock_irqrestore(&skel->qlock, flags); | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * Start streaming. First check if the minimum number of buffers have been | ||
245 | * queued. If not, then return -ENOBUFS and the vb2 framework will call | ||
246 | * this function again the next time a buffer has been queued until enough | ||
247 | * buffers are available to actually start the DMA engine. | ||
248 | */ | ||
249 | static int start_streaming(struct vb2_queue *vq, unsigned int count) | ||
250 | { | ||
251 | struct skeleton *skel = vb2_get_drv_priv(vq); | ||
252 | int ret = 0; | ||
253 | |||
254 | skel->sequence = 0; | ||
255 | |||
256 | /* TODO: start DMA */ | ||
257 | |||
258 | if (ret) { | ||
259 | /* | ||
260 | * In case of an error, return all active buffers to the | ||
261 | * QUEUED state | ||
262 | */ | ||
263 | return_all_buffers(skel, VB2_BUF_STATE_QUEUED); | ||
264 | } | ||
265 | return ret; | ||
266 | } | ||
267 | |||
268 | /* | ||
269 | * Stop the DMA engine. Any remaining buffers in the DMA queue are dequeued | ||
270 | * and passed on to the vb2 framework marked as STATE_ERROR. | ||
271 | */ | ||
272 | static void stop_streaming(struct vb2_queue *vq) | ||
273 | { | ||
274 | struct skeleton *skel = vb2_get_drv_priv(vq); | ||
275 | |||
276 | /* TODO: stop DMA */ | ||
277 | |||
278 | /* Release all active buffers */ | ||
279 | return_all_buffers(skel, VB2_BUF_STATE_ERROR); | ||
280 | } | ||
281 | |||
282 | /* | ||
283 | * The vb2 queue ops. Note that since q->lock is set we can use the standard | ||
284 | * vb2_ops_wait_prepare/finish helper functions. If q->lock would be NULL, | ||
285 | * then this driver would have to provide these ops. | ||
286 | */ | ||
287 | static const struct vb2_ops skel_qops = { | ||
288 | .queue_setup = queue_setup, | ||
289 | .buf_prepare = buffer_prepare, | ||
290 | .buf_queue = buffer_queue, | ||
291 | .start_streaming = start_streaming, | ||
292 | .stop_streaming = stop_streaming, | ||
293 | .wait_prepare = vb2_ops_wait_prepare, | ||
294 | .wait_finish = vb2_ops_wait_finish, | ||
295 | }; | ||
296 | |||
297 | /* | ||
298 | * Required ioctl querycap. Note that the version field is prefilled with | ||
299 | * the version of the kernel. | ||
300 | */ | ||
301 | static int skeleton_querycap(struct file *file, void *priv, | ||
302 | struct v4l2_capability *cap) | ||
303 | { | ||
304 | struct skeleton *skel = video_drvdata(file); | ||
305 | |||
306 | strlcpy(cap->driver, KBUILD_MODNAME, sizeof(cap->driver)); | ||
307 | strlcpy(cap->card, "V4L2 PCI Skeleton", sizeof(cap->card)); | ||
308 | snprintf(cap->bus_info, sizeof(cap->bus_info), "PCI:%s", | ||
309 | pci_name(skel->pdev)); | ||
310 | return 0; | ||
311 | } | ||
312 | |||
313 | /* | ||
314 | * Helper function to check and correct struct v4l2_pix_format. It's used | ||
315 | * not only in VIDIOC_TRY/S_FMT, but also elsewhere if changes to the SDTV | ||
316 | * standard, HDTV timings or the video input would require updating the | ||
317 | * current format. | ||
318 | */ | ||
319 | static void skeleton_fill_pix_format(struct skeleton *skel, | ||
320 | struct v4l2_pix_format *pix) | ||
321 | { | ||
322 | pix->pixelformat = V4L2_PIX_FMT_YUYV; | ||
323 | if (skel->input == 0) { | ||
324 | /* S-Video input */ | ||
325 | pix->width = 720; | ||
326 | pix->height = (skel->std & V4L2_STD_525_60) ? 480 : 576; | ||
327 | pix->field = V4L2_FIELD_INTERLACED; | ||
328 | pix->colorspace = V4L2_COLORSPACE_SMPTE170M; | ||
329 | } else { | ||
330 | /* HDMI input */ | ||
331 | pix->width = skel->timings.bt.width; | ||
332 | pix->height = skel->timings.bt.height; | ||
333 | if (skel->timings.bt.interlaced) { | ||
334 | pix->field = V4L2_FIELD_ALTERNATE; | ||
335 | pix->height /= 2; | ||
336 | } else { | ||
337 | pix->field = V4L2_FIELD_NONE; | ||
338 | } | ||
339 | pix->colorspace = V4L2_COLORSPACE_REC709; | ||
340 | } | ||
341 | |||
342 | /* | ||
343 | * The YUYV format is four bytes for every two pixels, so bytesperline | ||
344 | * is width * 2. | ||
345 | */ | ||
346 | pix->bytesperline = pix->width * 2; | ||
347 | pix->sizeimage = pix->bytesperline * pix->height; | ||
348 | pix->priv = 0; | ||
349 | } | ||
350 | |||
351 | static int skeleton_try_fmt_vid_cap(struct file *file, void *priv, | ||
352 | struct v4l2_format *f) | ||
353 | { | ||
354 | struct skeleton *skel = video_drvdata(file); | ||
355 | struct v4l2_pix_format *pix = &f->fmt.pix; | ||
356 | |||
357 | /* | ||
358 | * Due to historical reasons providing try_fmt with an unsupported | ||
359 | * pixelformat will return -EINVAL for video receivers. Webcam drivers, | ||
360 | * however, will silently correct the pixelformat. Some video capture | ||
361 | * applications rely on this behavior... | ||
362 | */ | ||
363 | if (pix->pixelformat != V4L2_PIX_FMT_YUYV) | ||
364 | return -EINVAL; | ||
365 | skeleton_fill_pix_format(skel, pix); | ||
366 | return 0; | ||
367 | } | ||
368 | |||
369 | static int skeleton_s_fmt_vid_cap(struct file *file, void *priv, | ||
370 | struct v4l2_format *f) | ||
371 | { | ||
372 | struct skeleton *skel = video_drvdata(file); | ||
373 | int ret; | ||
374 | |||
375 | ret = skeleton_try_fmt_vid_cap(file, priv, f); | ||
376 | if (ret) | ||
377 | return ret; | ||
378 | |||
379 | /* | ||
380 | * It is not allowed to change the format while buffers for use with | ||
381 | * streaming have already been allocated. | ||
382 | */ | ||
383 | if (vb2_is_busy(&skel->queue)) | ||
384 | return -EBUSY; | ||
385 | |||
386 | /* TODO: change format */ | ||
387 | skel->format = f->fmt.pix; | ||
388 | return 0; | ||
389 | } | ||
390 | |||
391 | static int skeleton_g_fmt_vid_cap(struct file *file, void *priv, | ||
392 | struct v4l2_format *f) | ||
393 | { | ||
394 | struct skeleton *skel = video_drvdata(file); | ||
395 | |||
396 | f->fmt.pix = skel->format; | ||
397 | return 0; | ||
398 | } | ||
399 | |||
400 | static int skeleton_enum_fmt_vid_cap(struct file *file, void *priv, | ||
401 | struct v4l2_fmtdesc *f) | ||
402 | { | ||
403 | if (f->index != 0) | ||
404 | return -EINVAL; | ||
405 | |||
406 | f->pixelformat = V4L2_PIX_FMT_YUYV; | ||
407 | return 0; | ||
408 | } | ||
409 | |||
410 | static int skeleton_s_std(struct file *file, void *priv, v4l2_std_id std) | ||
411 | { | ||
412 | struct skeleton *skel = video_drvdata(file); | ||
413 | |||
414 | /* S_STD is not supported on the HDMI input */ | ||
415 | if (skel->input) | ||
416 | return -ENODATA; | ||
417 | |||
418 | /* | ||
419 | * No change, so just return. Some applications call S_STD again after | ||
420 | * the buffers for streaming have been set up, so we have to allow for | ||
421 | * this behavior. | ||
422 | */ | ||
423 | if (std == skel->std) | ||
424 | return 0; | ||
425 | |||
426 | /* | ||
427 | * Changing the standard implies a format change, which is not allowed | ||
428 | * while buffers for use with streaming have already been allocated. | ||
429 | */ | ||
430 | if (vb2_is_busy(&skel->queue)) | ||
431 | return -EBUSY; | ||
432 | |||
433 | /* TODO: handle changing std */ | ||
434 | |||
435 | skel->std = std; | ||
436 | |||
437 | /* Update the internal format */ | ||
438 | skeleton_fill_pix_format(skel, &skel->format); | ||
439 | return 0; | ||
440 | } | ||
441 | |||
442 | static int skeleton_g_std(struct file *file, void *priv, v4l2_std_id *std) | ||
443 | { | ||
444 | struct skeleton *skel = video_drvdata(file); | ||
445 | |||
446 | /* G_STD is not supported on the HDMI input */ | ||
447 | if (skel->input) | ||
448 | return -ENODATA; | ||
449 | |||
450 | *std = skel->std; | ||
451 | return 0; | ||
452 | } | ||
453 | |||
454 | /* | ||
455 | * Query the current standard as seen by the hardware. This function shall | ||
456 | * never actually change the standard, it just detects and reports. | ||
457 | * The framework will initially set *std to tvnorms (i.e. the set of | ||
458 | * supported standards by this input), and this function should just AND | ||
459 | * this value. If there is no signal, then *std should be set to 0. | ||
460 | */ | ||
461 | static int skeleton_querystd(struct file *file, void *priv, v4l2_std_id *std) | ||
462 | { | ||
463 | struct skeleton *skel = video_drvdata(file); | ||
464 | |||
465 | /* QUERY_STD is not supported on the HDMI input */ | ||
466 | if (skel->input) | ||
467 | return -ENODATA; | ||
468 | |||
469 | #ifdef TODO | ||
470 | /* | ||
471 | * Query currently seen standard. Initial value of *std is | ||
472 | * V4L2_STD_ALL. This function should look something like this: | ||
473 | */ | ||
474 | get_signal_info(); | ||
475 | if (no_signal) { | ||
476 | *std = 0; | ||
477 | return 0; | ||
478 | } | ||
479 | /* Use signal information to reduce the number of possible standards */ | ||
480 | if (signal_has_525_lines) | ||
481 | *std &= V4L2_STD_525_60; | ||
482 | else | ||
483 | *std &= V4L2_STD_625_50; | ||
484 | #endif | ||
485 | return 0; | ||
486 | } | ||
487 | |||
488 | static int skeleton_s_dv_timings(struct file *file, void *_fh, | ||
489 | struct v4l2_dv_timings *timings) | ||
490 | { | ||
491 | struct skeleton *skel = video_drvdata(file); | ||
492 | |||
493 | /* S_DV_TIMINGS is not supported on the S-Video input */ | ||
494 | if (skel->input == 0) | ||
495 | return -ENODATA; | ||
496 | |||
497 | /* Quick sanity check */ | ||
498 | if (!v4l2_valid_dv_timings(timings, &skel_timings_cap, NULL, NULL)) | ||
499 | return -EINVAL; | ||
500 | |||
501 | /* Check if the timings are part of the CEA-861 timings. */ | ||
502 | if (!v4l2_find_dv_timings_cap(timings, &skel_timings_cap, | ||
503 | 0, NULL, NULL)) | ||
504 | return -EINVAL; | ||
505 | |||
506 | /* Return 0 if the new timings are the same as the current timings. */ | ||
507 | if (v4l2_match_dv_timings(timings, &skel->timings, 0, false)) | ||
508 | return 0; | ||
509 | |||
510 | /* | ||
511 | * Changing the timings implies a format change, which is not allowed | ||
512 | * while buffers for use with streaming have already been allocated. | ||
513 | */ | ||
514 | if (vb2_is_busy(&skel->queue)) | ||
515 | return -EBUSY; | ||
516 | |||
517 | /* TODO: Configure new timings */ | ||
518 | |||
519 | /* Save timings */ | ||
520 | skel->timings = *timings; | ||
521 | |||
522 | /* Update the internal format */ | ||
523 | skeleton_fill_pix_format(skel, &skel->format); | ||
524 | return 0; | ||
525 | } | ||
526 | |||
527 | static int skeleton_g_dv_timings(struct file *file, void *_fh, | ||
528 | struct v4l2_dv_timings *timings) | ||
529 | { | ||
530 | struct skeleton *skel = video_drvdata(file); | ||
531 | |||
532 | /* G_DV_TIMINGS is not supported on the S-Video input */ | ||
533 | if (skel->input == 0) | ||
534 | return -ENODATA; | ||
535 | |||
536 | *timings = skel->timings; | ||
537 | return 0; | ||
538 | } | ||
539 | |||
540 | static int skeleton_enum_dv_timings(struct file *file, void *_fh, | ||
541 | struct v4l2_enum_dv_timings *timings) | ||
542 | { | ||
543 | struct skeleton *skel = video_drvdata(file); | ||
544 | |||
545 | /* ENUM_DV_TIMINGS is not supported on the S-Video input */ | ||
546 | if (skel->input == 0) | ||
547 | return -ENODATA; | ||
548 | |||
549 | return v4l2_enum_dv_timings_cap(timings, &skel_timings_cap, | ||
550 | NULL, NULL); | ||
551 | } | ||
552 | |||
553 | /* | ||
554 | * Query the current timings as seen by the hardware. This function shall | ||
555 | * never actually change the timings, it just detects and reports. | ||
556 | * If no signal is detected, then return -ENOLINK. If the hardware cannot | ||
557 | * lock to the signal, then return -ENOLCK. If the signal is out of range | ||
558 | * of the capabilities of the system (e.g., it is possible that the receiver | ||
559 | * can lock but that the DMA engine it is connected to cannot handle | ||
560 | * pixelclocks above a certain frequency), then -ERANGE is returned. | ||
561 | */ | ||
562 | static int skeleton_query_dv_timings(struct file *file, void *_fh, | ||
563 | struct v4l2_dv_timings *timings) | ||
564 | { | ||
565 | struct skeleton *skel = video_drvdata(file); | ||
566 | |||
567 | /* QUERY_DV_TIMINGS is not supported on the S-Video input */ | ||
568 | if (skel->input == 0) | ||
569 | return -ENODATA; | ||
570 | |||
571 | #ifdef TODO | ||
572 | /* | ||
573 | * Query currently seen timings. This function should look | ||
574 | * something like this: | ||
575 | */ | ||
576 | detect_timings(); | ||
577 | if (no_signal) | ||
578 | return -ENOLINK; | ||
579 | if (cannot_lock_to_signal) | ||
580 | return -ENOLCK; | ||
581 | if (signal_out_of_range_of_capabilities) | ||
582 | return -ERANGE; | ||
583 | |||
584 | /* Useful for debugging */ | ||
585 | v4l2_print_dv_timings(skel->v4l2_dev.name, "query_dv_timings:", | ||
586 | timings, true); | ||
587 | #endif | ||
588 | return 0; | ||
589 | } | ||
590 | |||
591 | static int skeleton_dv_timings_cap(struct file *file, void *fh, | ||
592 | struct v4l2_dv_timings_cap *cap) | ||
593 | { | ||
594 | struct skeleton *skel = video_drvdata(file); | ||
595 | |||
596 | /* DV_TIMINGS_CAP is not supported on the S-Video input */ | ||
597 | if (skel->input == 0) | ||
598 | return -ENODATA; | ||
599 | *cap = skel_timings_cap; | ||
600 | return 0; | ||
601 | } | ||
602 | |||
603 | static int skeleton_enum_input(struct file *file, void *priv, | ||
604 | struct v4l2_input *i) | ||
605 | { | ||
606 | if (i->index > 1) | ||
607 | return -EINVAL; | ||
608 | |||
609 | i->type = V4L2_INPUT_TYPE_CAMERA; | ||
610 | if (i->index == 0) { | ||
611 | i->std = SKEL_TVNORMS; | ||
612 | strlcpy(i->name, "S-Video", sizeof(i->name)); | ||
613 | i->capabilities = V4L2_IN_CAP_STD; | ||
614 | } else { | ||
615 | i->std = 0; | ||
616 | strlcpy(i->name, "HDMI", sizeof(i->name)); | ||
617 | i->capabilities = V4L2_IN_CAP_DV_TIMINGS; | ||
618 | } | ||
619 | return 0; | ||
620 | } | ||
621 | |||
622 | static int skeleton_s_input(struct file *file, void *priv, unsigned int i) | ||
623 | { | ||
624 | struct skeleton *skel = video_drvdata(file); | ||
625 | |||
626 | if (i > 1) | ||
627 | return -EINVAL; | ||
628 | |||
629 | /* | ||
630 | * Changing the input implies a format change, which is not allowed | ||
631 | * while buffers for use with streaming have already been allocated. | ||
632 | */ | ||
633 | if (vb2_is_busy(&skel->queue)) | ||
634 | return -EBUSY; | ||
635 | |||
636 | skel->input = i; | ||
637 | /* | ||
638 | * Update tvnorms. The tvnorms value is used by the core to implement | ||
639 | * VIDIOC_ENUMSTD so it has to be correct. If tvnorms == 0, then | ||
640 | * ENUMSTD will return -ENODATA. | ||
641 | */ | ||
642 | skel->vdev.tvnorms = i ? 0 : SKEL_TVNORMS; | ||
643 | |||
644 | /* Update the internal format */ | ||
645 | skeleton_fill_pix_format(skel, &skel->format); | ||
646 | return 0; | ||
647 | } | ||
648 | |||
649 | static int skeleton_g_input(struct file *file, void *priv, unsigned int *i) | ||
650 | { | ||
651 | struct skeleton *skel = video_drvdata(file); | ||
652 | |||
653 | *i = skel->input; | ||
654 | return 0; | ||
655 | } | ||
656 | |||
657 | /* The control handler. */ | ||
658 | static int skeleton_s_ctrl(struct v4l2_ctrl *ctrl) | ||
659 | { | ||
660 | /*struct skeleton *skel = | ||
661 | container_of(ctrl->handler, struct skeleton, ctrl_handler);*/ | ||
662 | |||
663 | switch (ctrl->id) { | ||
664 | case V4L2_CID_BRIGHTNESS: | ||
665 | /* TODO: set brightness to ctrl->val */ | ||
666 | break; | ||
667 | case V4L2_CID_CONTRAST: | ||
668 | /* TODO: set contrast to ctrl->val */ | ||
669 | break; | ||
670 | case V4L2_CID_SATURATION: | ||
671 | /* TODO: set saturation to ctrl->val */ | ||
672 | break; | ||
673 | case V4L2_CID_HUE: | ||
674 | /* TODO: set hue to ctrl->val */ | ||
675 | break; | ||
676 | default: | ||
677 | return -EINVAL; | ||
678 | } | ||
679 | return 0; | ||
680 | } | ||
681 | |||
682 | /* ------------------------------------------------------------------ | ||
683 | File operations for the device | ||
684 | ------------------------------------------------------------------*/ | ||
685 | |||
686 | static const struct v4l2_ctrl_ops skel_ctrl_ops = { | ||
687 | .s_ctrl = skeleton_s_ctrl, | ||
688 | }; | ||
689 | |||
690 | /* | ||
691 | * The set of all supported ioctls. Note that all the streaming ioctls | ||
692 | * use the vb2 helper functions that take care of all the locking and | ||
693 | * that also do ownership tracking (i.e. only the filehandle that requested | ||
694 | * the buffers can call the streaming ioctls, all other filehandles will | ||
695 | * receive -EBUSY if they attempt to call the same streaming ioctls). | ||
696 | * | ||
697 | * The last three ioctls also use standard helper functions: these implement | ||
698 | * standard behavior for drivers with controls. | ||
699 | */ | ||
700 | static const struct v4l2_ioctl_ops skel_ioctl_ops = { | ||
701 | .vidioc_querycap = skeleton_querycap, | ||
702 | .vidioc_try_fmt_vid_cap = skeleton_try_fmt_vid_cap, | ||
703 | .vidioc_s_fmt_vid_cap = skeleton_s_fmt_vid_cap, | ||
704 | .vidioc_g_fmt_vid_cap = skeleton_g_fmt_vid_cap, | ||
705 | .vidioc_enum_fmt_vid_cap = skeleton_enum_fmt_vid_cap, | ||
706 | |||
707 | .vidioc_g_std = skeleton_g_std, | ||
708 | .vidioc_s_std = skeleton_s_std, | ||
709 | .vidioc_querystd = skeleton_querystd, | ||
710 | |||
711 | .vidioc_s_dv_timings = skeleton_s_dv_timings, | ||
712 | .vidioc_g_dv_timings = skeleton_g_dv_timings, | ||
713 | .vidioc_enum_dv_timings = skeleton_enum_dv_timings, | ||
714 | .vidioc_query_dv_timings = skeleton_query_dv_timings, | ||
715 | .vidioc_dv_timings_cap = skeleton_dv_timings_cap, | ||
716 | |||
717 | .vidioc_enum_input = skeleton_enum_input, | ||
718 | .vidioc_g_input = skeleton_g_input, | ||
719 | .vidioc_s_input = skeleton_s_input, | ||
720 | |||
721 | .vidioc_reqbufs = vb2_ioctl_reqbufs, | ||
722 | .vidioc_create_bufs = vb2_ioctl_create_bufs, | ||
723 | .vidioc_querybuf = vb2_ioctl_querybuf, | ||
724 | .vidioc_qbuf = vb2_ioctl_qbuf, | ||
725 | .vidioc_dqbuf = vb2_ioctl_dqbuf, | ||
726 | .vidioc_expbuf = vb2_ioctl_expbuf, | ||
727 | .vidioc_streamon = vb2_ioctl_streamon, | ||
728 | .vidioc_streamoff = vb2_ioctl_streamoff, | ||
729 | |||
730 | .vidioc_log_status = v4l2_ctrl_log_status, | ||
731 | .vidioc_subscribe_event = v4l2_ctrl_subscribe_event, | ||
732 | .vidioc_unsubscribe_event = v4l2_event_unsubscribe, | ||
733 | }; | ||
734 | |||
735 | /* | ||
736 | * The set of file operations. Note that all these ops are standard core | ||
737 | * helper functions. | ||
738 | */ | ||
739 | static const struct v4l2_file_operations skel_fops = { | ||
740 | .owner = THIS_MODULE, | ||
741 | .open = v4l2_fh_open, | ||
742 | .release = vb2_fop_release, | ||
743 | .unlocked_ioctl = video_ioctl2, | ||
744 | .read = vb2_fop_read, | ||
745 | .mmap = vb2_fop_mmap, | ||
746 | .poll = vb2_fop_poll, | ||
747 | }; | ||
748 | |||
749 | /* | ||
750 | * The initial setup of this device instance. Note that the initial state of | ||
751 | * the driver should be complete. So the initial format, standard, timings | ||
752 | * and video input should all be initialized to some reasonable value. | ||
753 | */ | ||
754 | static int skeleton_probe(struct pci_dev *pdev, const struct pci_device_id *ent) | ||
755 | { | ||
756 | /* The initial timings are chosen to be 720p60. */ | ||
757 | static const struct v4l2_dv_timings timings_def = | ||
758 | V4L2_DV_BT_CEA_1280X720P60; | ||
759 | struct skeleton *skel; | ||
760 | struct video_device *vdev; | ||
761 | struct v4l2_ctrl_handler *hdl; | ||
762 | struct vb2_queue *q; | ||
763 | int ret; | ||
764 | |||
765 | /* Enable PCI */ | ||
766 | ret = pci_enable_device(pdev); | ||
767 | if (ret) | ||
768 | return ret; | ||
769 | ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); | ||
770 | if (ret) { | ||
771 | dev_err(&pdev->dev, "no suitable DMA available.\n"); | ||
772 | goto disable_pci; | ||
773 | } | ||
774 | |||
775 | /* Allocate a new instance */ | ||
776 | skel = devm_kzalloc(&pdev->dev, sizeof(struct skeleton), GFP_KERNEL); | ||
777 | if (!skel) { | ||
778 | ret = -ENOMEM; | ||
779 | goto disable_pci; | ||
780 | } | ||
781 | |||
782 | /* Allocate the interrupt */ | ||
783 | ret = devm_request_irq(&pdev->dev, pdev->irq, | ||
784 | skeleton_irq, 0, KBUILD_MODNAME, skel); | ||
785 | if (ret) { | ||
786 | dev_err(&pdev->dev, "request_irq failed\n"); | ||
787 | goto disable_pci; | ||
788 | } | ||
789 | skel->pdev = pdev; | ||
790 | |||
791 | /* Fill in the initial format-related settings */ | ||
792 | skel->timings = timings_def; | ||
793 | skel->std = V4L2_STD_625_50; | ||
794 | skeleton_fill_pix_format(skel, &skel->format); | ||
795 | |||
796 | /* Initialize the top-level structure */ | ||
797 | ret = v4l2_device_register(&pdev->dev, &skel->v4l2_dev); | ||
798 | if (ret) | ||
799 | goto disable_pci; | ||
800 | |||
801 | mutex_init(&skel->lock); | ||
802 | |||
803 | /* Add the controls */ | ||
804 | hdl = &skel->ctrl_handler; | ||
805 | v4l2_ctrl_handler_init(hdl, 4); | ||
806 | v4l2_ctrl_new_std(hdl, &skel_ctrl_ops, | ||
807 | V4L2_CID_BRIGHTNESS, 0, 255, 1, 127); | ||
808 | v4l2_ctrl_new_std(hdl, &skel_ctrl_ops, | ||
809 | V4L2_CID_CONTRAST, 0, 255, 1, 16); | ||
810 | v4l2_ctrl_new_std(hdl, &skel_ctrl_ops, | ||
811 | V4L2_CID_SATURATION, 0, 255, 1, 127); | ||
812 | v4l2_ctrl_new_std(hdl, &skel_ctrl_ops, | ||
813 | V4L2_CID_HUE, -128, 127, 1, 0); | ||
814 | if (hdl->error) { | ||
815 | ret = hdl->error; | ||
816 | goto free_hdl; | ||
817 | } | ||
818 | skel->v4l2_dev.ctrl_handler = hdl; | ||
819 | |||
820 | /* Initialize the vb2 queue */ | ||
821 | q = &skel->queue; | ||
822 | q->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; | ||
823 | q->io_modes = VB2_MMAP | VB2_DMABUF | VB2_READ; | ||
824 | q->dev = &pdev->dev; | ||
825 | q->drv_priv = skel; | ||
826 | q->buf_struct_size = sizeof(struct skel_buffer); | ||
827 | q->ops = &skel_qops; | ||
828 | q->mem_ops = &vb2_dma_contig_memops; | ||
829 | q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; | ||
830 | /* | ||
831 | * Assume that this DMA engine needs to have at least two buffers | ||
832 | * available before it can be started. The start_streaming() op | ||
833 | * won't be called until at least this many buffers are queued up. | ||
834 | */ | ||
835 | q->min_buffers_needed = 2; | ||
836 | /* | ||
837 | * The serialization lock for the streaming ioctls. This is the same | ||
838 | * as the main serialization lock, but if some of the non-streaming | ||
839 | * ioctls could take a long time to execute, then you might want to | ||
840 | * have a different lock here to prevent VIDIOC_DQBUF from being | ||
841 | * blocked while waiting for another action to finish. This is | ||
842 | * generally not needed for PCI devices, but USB devices usually do | ||
843 | * want a separate lock here. | ||
844 | */ | ||
845 | q->lock = &skel->lock; | ||
846 | /* | ||
847 | * Since this driver can only do 32-bit DMA we must make sure that | ||
848 | * the vb2 core will allocate the buffers in 32-bit DMA memory. | ||
849 | */ | ||
850 | q->gfp_flags = GFP_DMA32; | ||
851 | ret = vb2_queue_init(q); | ||
852 | if (ret) | ||
853 | goto free_hdl; | ||
854 | |||
855 | INIT_LIST_HEAD(&skel->buf_list); | ||
856 | spin_lock_init(&skel->qlock); | ||
857 | |||
858 | /* Initialize the video_device structure */ | ||
859 | vdev = &skel->vdev; | ||
860 | strlcpy(vdev->name, KBUILD_MODNAME, sizeof(vdev->name)); | ||
861 | /* | ||
862 | * There is nothing to clean up, so release is set to an empty release | ||
863 | * function. The release callback must be non-NULL. | ||
864 | */ | ||
865 | vdev->release = video_device_release_empty; | ||
866 | vdev->fops = &skel_fops; | ||
867 | vdev->ioctl_ops = &skel_ioctl_ops; | ||
868 | vdev->device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_READWRITE | | ||
869 | V4L2_CAP_STREAMING; | ||
870 | /* | ||
871 | * The main serialization lock. All ioctls are serialized by this | ||
872 | * lock. Exception: if q->lock is set, then the streaming ioctls | ||
873 | * are serialized by that separate lock. | ||
874 | */ | ||
875 | vdev->lock = &skel->lock; | ||
876 | vdev->queue = q; | ||
877 | vdev->v4l2_dev = &skel->v4l2_dev; | ||
878 | /* Supported SDTV standards, if any */ | ||
879 | vdev->tvnorms = SKEL_TVNORMS; | ||
880 | video_set_drvdata(vdev, skel); | ||
881 | |||
882 | ret = video_register_device(vdev, VFL_TYPE_VIDEO, -1); | ||
883 | if (ret) | ||
884 | goto free_hdl; | ||
885 | |||
886 | dev_info(&pdev->dev, "V4L2 PCI Skeleton Driver loaded\n"); | ||
887 | return 0; | ||
888 | |||
889 | free_hdl: | ||
890 | v4l2_ctrl_handler_free(&skel->ctrl_handler); | ||
891 | v4l2_device_unregister(&skel->v4l2_dev); | ||
892 | disable_pci: | ||
893 | pci_disable_device(pdev); | ||
894 | return ret; | ||
895 | } | ||
896 | |||
897 | static void skeleton_remove(struct pci_dev *pdev) | ||
898 | { | ||
899 | struct v4l2_device *v4l2_dev = pci_get_drvdata(pdev); | ||
900 | struct skeleton *skel = container_of(v4l2_dev, struct skeleton, v4l2_dev); | ||
901 | |||
902 | video_unregister_device(&skel->vdev); | ||
903 | v4l2_ctrl_handler_free(&skel->ctrl_handler); | ||
904 | v4l2_device_unregister(&skel->v4l2_dev); | ||
905 | pci_disable_device(skel->pdev); | ||
906 | } | ||
907 | |||
908 | static struct pci_driver skeleton_driver = { | ||
909 | .name = KBUILD_MODNAME, | ||
910 | .probe = skeleton_probe, | ||
911 | .remove = skeleton_remove, | ||
912 | .id_table = skeleton_pci_tbl, | ||
913 | }; | ||
914 | |||
915 | module_pci_driver(skeleton_driver); | ||
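The ioctl table above maps directly onto the standard V4L2 capture sequence a user-space client would run against this driver. Below is a minimal sketch of such a client, not part of the sample itself: it assumes the skeleton registered as /dev/video0, requests three MMAP buffers (the minimum that queue_setup() accepts), and omits all error checking for brevity.

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <unistd.h>
	#include <linux/videodev2.h>

	int main(void)
	{
		struct v4l2_format fmt = { .type = V4L2_BUF_TYPE_VIDEO_CAPTURE };
		struct v4l2_requestbuffers req = {
			.count = 3,
			.type = V4L2_BUF_TYPE_VIDEO_CAPTURE,
			.memory = V4L2_MEMORY_MMAP,
		};
		enum v4l2_buf_type type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
		struct v4l2_buffer buf;
		void *maps[8];
		unsigned int i;
		int fd;

		fd = open("/dev/video0", O_RDWR);	/* device path is an assumption */

		/* Let the driver report its current format, then request YUYV. */
		ioctl(fd, VIDIOC_G_FMT, &fmt);
		fmt.fmt.pix.pixelformat = V4L2_PIX_FMT_YUYV;
		ioctl(fd, VIDIOC_S_FMT, &fmt);

		/* vb2 allocates the buffers; queue_setup() enforces at least three. */
		ioctl(fd, VIDIOC_REQBUFS, &req);
		for (i = 0; i < req.count && i < 8; i++) {
			memset(&buf, 0, sizeof(buf));
			buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
			buf.memory = V4L2_MEMORY_MMAP;
			buf.index = i;
			ioctl(fd, VIDIOC_QUERYBUF, &buf);
			maps[i] = mmap(NULL, buf.length, PROT_READ | PROT_WRITE,
				       MAP_SHARED, fd, buf.m.offset);
			ioctl(fd, VIDIOC_QBUF, &buf);
		}

		ioctl(fd, VIDIOC_STREAMON, &type);
		for (i = 0; i < 10; i++) {
			memset(&buf, 0, sizeof(buf));
			buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
			buf.memory = V4L2_MEMORY_MMAP;
			ioctl(fd, VIDIOC_DQBUF, &buf);	/* blocks until a frame is done */
			printf("frame %u at %p, %u bytes\n",
			       buf.sequence, maps[buf.index], buf.bytesused);
			ioctl(fd, VIDIOC_QBUF, &buf);	/* hand the buffer back */
		}
		ioctl(fd, VIDIOC_STREAMOFF, &type);

		close(fd);
		return 0;
	}

A real client would open the node non-blocking and poll() it before VIDIOC_DQBUF, which is exactly what the vb2_fop_poll helper wired into skel_fops supports.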
diff --git a/samples/vfio-mdev/Makefile b/samples/vfio-mdev/Makefile new file mode 100644 index 000000000..10d179c4f --- /dev/null +++ b/samples/vfio-mdev/Makefile | |||
@@ -0,0 +1,5 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | obj-$(CONFIG_SAMPLE_VFIO_MDEV_MTTY) += mtty.o | ||
3 | obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY) += mdpy.o | ||
4 | obj-$(CONFIG_SAMPLE_VFIO_MDEV_MDPY_FB) += mdpy-fb.o | ||
5 | obj-$(CONFIG_SAMPLE_VFIO_MDEV_MBOCHS) += mbochs.o | ||
diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c new file mode 100644 index 000000000..e03068917 --- /dev/null +++ b/samples/vfio-mdev/mbochs.c | |||
@@ -0,0 +1,1485 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Mediated virtual PCI display host device driver | ||
4 | * | ||
5 | * Emulate enough of qemu stdvga to make bochs-drm.ko happy. That is | ||
6 | * basically the vram memory bar and the bochs dispi interface vbe | ||
7 | * registers in the mmio register bar. Specifically it does *not* | ||
8 | * include any legacy vga stuff. Device looks a lot like "qemu -device | ||
9 | * secondary-vga". | ||
10 | * | ||
11 | * (c) Gerd Hoffmann <kraxel@redhat.com> | ||
12 | * | ||
13 | * based on mtty driver which is: | ||
14 | * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. | ||
15 | * Author: Neo Jia <cjia@nvidia.com> | ||
16 | * Kirti Wankhede <kwankhede@nvidia.com> | ||
17 | * | ||
18 | * This program is free software; you can redistribute it and/or modify | ||
19 | * it under the terms of the GNU General Public License version 2 as | ||
20 | * published by the Free Software Foundation. | ||
21 | */ | ||
22 | #include <linux/init.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/device.h> | ||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/slab.h> | ||
27 | #include <linux/vmalloc.h> | ||
28 | #include <linux/cdev.h> | ||
29 | #include <linux/vfio.h> | ||
30 | #include <linux/iommu.h> | ||
31 | #include <linux/sysfs.h> | ||
32 | #include <linux/mdev.h> | ||
33 | #include <linux/pci.h> | ||
34 | #include <linux/dma-buf.h> | ||
35 | #include <linux/highmem.h> | ||
36 | #include <drm/drm_fourcc.h> | ||
37 | #include <drm/drm_rect.h> | ||
38 | #include <drm/drm_modeset_lock.h> | ||
39 | #include <drm/drm_property.h> | ||
40 | #include <drm/drm_plane.h> | ||
41 | |||
42 | |||
43 | #define VBE_DISPI_INDEX_ID 0x0 | ||
44 | #define VBE_DISPI_INDEX_XRES 0x1 | ||
45 | #define VBE_DISPI_INDEX_YRES 0x2 | ||
46 | #define VBE_DISPI_INDEX_BPP 0x3 | ||
47 | #define VBE_DISPI_INDEX_ENABLE 0x4 | ||
48 | #define VBE_DISPI_INDEX_BANK 0x5 | ||
49 | #define VBE_DISPI_INDEX_VIRT_WIDTH 0x6 | ||
50 | #define VBE_DISPI_INDEX_VIRT_HEIGHT 0x7 | ||
51 | #define VBE_DISPI_INDEX_X_OFFSET 0x8 | ||
52 | #define VBE_DISPI_INDEX_Y_OFFSET 0x9 | ||
53 | #define VBE_DISPI_INDEX_VIDEO_MEMORY_64K 0xa | ||
54 | #define VBE_DISPI_INDEX_COUNT 0xb | ||
55 | |||
56 | #define VBE_DISPI_ID0 0xB0C0 | ||
57 | #define VBE_DISPI_ID1 0xB0C1 | ||
58 | #define VBE_DISPI_ID2 0xB0C2 | ||
59 | #define VBE_DISPI_ID3 0xB0C3 | ||
60 | #define VBE_DISPI_ID4 0xB0C4 | ||
61 | #define VBE_DISPI_ID5 0xB0C5 | ||
62 | |||
63 | #define VBE_DISPI_DISABLED 0x00 | ||
64 | #define VBE_DISPI_ENABLED 0x01 | ||
65 | #define VBE_DISPI_GETCAPS 0x02 | ||
66 | #define VBE_DISPI_8BIT_DAC 0x20 | ||
67 | #define VBE_DISPI_LFB_ENABLED 0x40 | ||
68 | #define VBE_DISPI_NOCLEARMEM 0x80 | ||
69 | |||
70 | |||
71 | #define MBOCHS_NAME "mbochs" | ||
72 | #define MBOCHS_CLASS_NAME "mbochs" | ||
73 | |||
74 | #define MBOCHS_EDID_REGION_INDEX VFIO_PCI_NUM_REGIONS | ||
75 | #define MBOCHS_NUM_REGIONS (MBOCHS_EDID_REGION_INDEX+1) | ||
76 | |||
77 | #define MBOCHS_CONFIG_SPACE_SIZE 0xff | ||
78 | #define MBOCHS_MMIO_BAR_OFFSET PAGE_SIZE | ||
79 | #define MBOCHS_MMIO_BAR_SIZE PAGE_SIZE | ||
80 | #define MBOCHS_EDID_OFFSET (MBOCHS_MMIO_BAR_OFFSET + \ | ||
81 | MBOCHS_MMIO_BAR_SIZE) | ||
82 | #define MBOCHS_EDID_SIZE PAGE_SIZE | ||
83 | #define MBOCHS_MEMORY_BAR_OFFSET (MBOCHS_EDID_OFFSET + \ | ||
84 | MBOCHS_EDID_SIZE) | ||
85 | |||
86 | #define MBOCHS_EDID_BLOB_OFFSET (MBOCHS_EDID_SIZE/2) | ||
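/*
 * Region layout used by mdev_access() below (offsets within the mdev file):
 * PCI config space at offset 0 (MBOCHS_CONFIG_SPACE_SIZE bytes), the MMIO
 * register BAR at MBOCHS_MMIO_BAR_OFFSET, the EDID region at
 * MBOCHS_EDID_OFFSET with the vfio_region_gfx_edid registers in its first
 * half and the EDID blob starting at MBOCHS_EDID_BLOB_OFFSET, and the
 * framebuffer memory starting at MBOCHS_MEMORY_BAR_OFFSET.
 */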
87 | |||
88 | #define STORE_LE16(addr, val) (*(u16 *)addr = val) | ||
89 | #define STORE_LE32(addr, val) (*(u32 *)addr = val) | ||
90 | |||
91 | |||
92 | MODULE_LICENSE("GPL v2"); | ||
93 | |||
94 | static int max_mbytes = 256; | ||
95 | module_param_named(count, max_mbytes, int, 0444); | ||
96 | MODULE_PARM_DESC(count, "megabytes available to " MBOCHS_NAME " devices"); | ||
97 | |||
98 | |||
99 | #define MBOCHS_TYPE_1 "small" | ||
100 | #define MBOCHS_TYPE_2 "medium" | ||
101 | #define MBOCHS_TYPE_3 "large" | ||
102 | |||
103 | static const struct mbochs_type { | ||
104 | const char *name; | ||
105 | u32 mbytes; | ||
106 | u32 max_x; | ||
107 | u32 max_y; | ||
108 | } mbochs_types[] = { | ||
109 | { | ||
110 | .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1, | ||
111 | .mbytes = 4, | ||
112 | .max_x = 800, | ||
113 | .max_y = 600, | ||
114 | }, { | ||
115 | .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2, | ||
116 | .mbytes = 16, | ||
117 | .max_x = 1920, | ||
118 | .max_y = 1440, | ||
119 | }, { | ||
120 | .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3, | ||
121 | .mbytes = 64, | ||
122 | .max_x = 0, | ||
123 | .max_y = 0, | ||
124 | }, | ||
125 | }; | ||
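/*
 * Rough sizing for the table above: a full-screen XRGB8888 frame at the
 * "medium" 1920x1440 maximum takes 1920 * 1440 * 4 bytes (about 10.5 MB)
 * of its 16 MB, and the "small" 800x600 maximum takes just under 2 MB of
 * its 4 MB.
 */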
126 | |||
127 | |||
128 | static dev_t mbochs_devt; | ||
129 | static struct class *mbochs_class; | ||
130 | static struct cdev mbochs_cdev; | ||
131 | static struct device mbochs_dev; | ||
132 | static int mbochs_used_mbytes; | ||
133 | |||
134 | struct vfio_region_info_ext { | ||
135 | struct vfio_region_info base; | ||
136 | struct vfio_region_info_cap_type type; | ||
137 | }; | ||
138 | |||
139 | struct mbochs_mode { | ||
140 | u32 drm_format; | ||
141 | u32 bytepp; | ||
142 | u32 width; | ||
143 | u32 height; | ||
144 | u32 stride; | ||
145 | u32 __pad; | ||
146 | u64 offset; | ||
147 | u64 size; | ||
148 | }; | ||
149 | |||
150 | struct mbochs_dmabuf { | ||
151 | struct mbochs_mode mode; | ||
152 | u32 id; | ||
153 | struct page **pages; | ||
154 | pgoff_t pagecount; | ||
155 | struct dma_buf *buf; | ||
156 | struct mdev_state *mdev_state; | ||
157 | struct list_head next; | ||
158 | bool unlinked; | ||
159 | }; | ||
160 | |||
161 | /* State of each mdev device */ | ||
162 | struct mdev_state { | ||
163 | u8 *vconfig; | ||
164 | u64 bar_mask[3]; | ||
165 | u32 memory_bar_mask; | ||
166 | struct mutex ops_lock; | ||
167 | struct mdev_device *mdev; | ||
168 | |||
169 | const struct mbochs_type *type; | ||
170 | u16 vbe[VBE_DISPI_INDEX_COUNT]; | ||
171 | u64 memsize; | ||
172 | struct page **pages; | ||
173 | pgoff_t pagecount; | ||
174 | struct vfio_region_gfx_edid edid_regs; | ||
175 | u8 edid_blob[0x400]; | ||
176 | |||
177 | struct list_head dmabufs; | ||
178 | u32 active_id; | ||
179 | u32 next_id; | ||
180 | }; | ||
181 | |||
182 | static const char *vbe_name_list[VBE_DISPI_INDEX_COUNT] = { | ||
183 | [VBE_DISPI_INDEX_ID] = "id", | ||
184 | [VBE_DISPI_INDEX_XRES] = "xres", | ||
185 | [VBE_DISPI_INDEX_YRES] = "yres", | ||
186 | [VBE_DISPI_INDEX_BPP] = "bpp", | ||
187 | [VBE_DISPI_INDEX_ENABLE] = "enable", | ||
188 | [VBE_DISPI_INDEX_BANK] = "bank", | ||
189 | [VBE_DISPI_INDEX_VIRT_WIDTH] = "virt-width", | ||
190 | [VBE_DISPI_INDEX_VIRT_HEIGHT] = "virt-height", | ||
191 | [VBE_DISPI_INDEX_X_OFFSET] = "x-offset", | ||
192 | [VBE_DISPI_INDEX_Y_OFFSET] = "y-offset", | ||
193 | [VBE_DISPI_INDEX_VIDEO_MEMORY_64K] = "video-mem", | ||
194 | }; | ||
195 | |||
196 | static const char *vbe_name(u32 index) | ||
197 | { | ||
198 | if (index < ARRAY_SIZE(vbe_name_list)) | ||
199 | return vbe_name_list[index]; | ||
200 | return "(invalid)"; | ||
201 | } | ||
202 | |||
203 | static struct page *__mbochs_get_page(struct mdev_state *mdev_state, | ||
204 | pgoff_t pgoff); | ||
205 | static struct page *mbochs_get_page(struct mdev_state *mdev_state, | ||
206 | pgoff_t pgoff); | ||
207 | |||
208 | static const struct mbochs_type *mbochs_find_type(struct kobject *kobj) | ||
209 | { | ||
210 | int i; | ||
211 | |||
212 | for (i = 0; i < ARRAY_SIZE(mbochs_types); i++) | ||
213 | if (strcmp(mbochs_types[i].name, kobj->name) == 0) | ||
214 | return mbochs_types + i; | ||
215 | return NULL; | ||
216 | } | ||
217 | |||
218 | static void mbochs_create_config_space(struct mdev_state *mdev_state) | ||
219 | { | ||
220 | STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID], | ||
221 | 0x1234); | ||
222 | STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID], | ||
223 | 0x1111); | ||
224 | STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID], | ||
225 | PCI_SUBVENDOR_ID_REDHAT_QUMRANET); | ||
226 | STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID], | ||
227 | PCI_SUBDEVICE_ID_QEMU); | ||
228 | |||
229 | STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND], | ||
230 | PCI_COMMAND_IO | PCI_COMMAND_MEMORY); | ||
231 | STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE], | ||
232 | PCI_CLASS_DISPLAY_OTHER); | ||
233 | mdev_state->vconfig[PCI_CLASS_REVISION] = 0x01; | ||
234 | |||
235 | STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0], | ||
236 | PCI_BASE_ADDRESS_SPACE_MEMORY | | ||
237 | PCI_BASE_ADDRESS_MEM_TYPE_32 | | ||
238 | PCI_BASE_ADDRESS_MEM_PREFETCH); | ||
239 | mdev_state->bar_mask[0] = ~(mdev_state->memsize) + 1; | ||
240 | |||
241 | STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_2], | ||
242 | PCI_BASE_ADDRESS_SPACE_MEMORY | | ||
243 | PCI_BASE_ADDRESS_MEM_TYPE_32); | ||
244 | mdev_state->bar_mask[2] = ~(MBOCHS_MMIO_BAR_SIZE) + 1; | ||
245 | } | ||
246 | |||
247 | static int mbochs_check_framebuffer(struct mdev_state *mdev_state, | ||
248 | struct mbochs_mode *mode) | ||
249 | { | ||
250 | struct device *dev = mdev_dev(mdev_state->mdev); | ||
251 | u16 *vbe = mdev_state->vbe; | ||
252 | u32 virt_width; | ||
253 | |||
254 | WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); | ||
255 | |||
256 | if (!(vbe[VBE_DISPI_INDEX_ENABLE] & VBE_DISPI_ENABLED)) | ||
257 | goto nofb; | ||
258 | |||
259 | memset(mode, 0, sizeof(*mode)); | ||
260 | switch (vbe[VBE_DISPI_INDEX_BPP]) { | ||
261 | case 32: | ||
262 | mode->drm_format = DRM_FORMAT_XRGB8888; | ||
263 | mode->bytepp = 4; | ||
264 | break; | ||
265 | default: | ||
266 | dev_info_ratelimited(dev, "%s: bpp %d not supported\n", | ||
267 | __func__, vbe[VBE_DISPI_INDEX_BPP]); | ||
268 | goto nofb; | ||
269 | } | ||
270 | |||
271 | mode->width = vbe[VBE_DISPI_INDEX_XRES]; | ||
272 | mode->height = vbe[VBE_DISPI_INDEX_YRES]; | ||
273 | virt_width = vbe[VBE_DISPI_INDEX_VIRT_WIDTH]; | ||
274 | if (virt_width < mode->width) | ||
275 | virt_width = mode->width; | ||
276 | mode->stride = virt_width * mode->bytepp; | ||
277 | mode->size = (u64)mode->stride * mode->height; | ||
278 | mode->offset = ((u64)vbe[VBE_DISPI_INDEX_X_OFFSET] * mode->bytepp + | ||
279 | (u64)vbe[VBE_DISPI_INDEX_Y_OFFSET] * mode->stride); | ||
280 | |||
281 | if (mode->width < 64 || mode->height < 64) { | ||
282 | dev_info_ratelimited(dev, "%s: invalid resolution %dx%d\n", | ||
283 | __func__, mode->width, mode->height); | ||
284 | goto nofb; | ||
285 | } | ||
286 | if (mode->offset + mode->size > mdev_state->memsize) { | ||
287 | dev_info_ratelimited(dev, "%s: framebuffer memory overflow\n", | ||
288 | __func__); | ||
289 | goto nofb; | ||
290 | } | ||
291 | |||
292 | return 0; | ||
293 | |||
294 | nofb: | ||
295 | memset(mode, 0, sizeof(*mode)); | ||
296 | return -EINVAL; | ||
297 | } | ||
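/*
 * Worked example for the checks above: XRES=1024, YRES=768, BPP=32 and
 * VIRT_WIDTH <= 1024 give stride = 1024 * 4 = 4096 bytes and
 * size = 4096 * 768 = 3 MB; setting Y_OFFSET=768 describes a second
 * (page-flipped) framebuffer at offset 768 * 4096 = 3 MB, so offset + size
 * (6 MB) only passes the memsize check on the 16 MB and 64 MB types.
 */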
298 | |||
299 | static bool mbochs_modes_equal(struct mbochs_mode *mode1, | ||
300 | struct mbochs_mode *mode2) | ||
301 | { | ||
302 | return memcmp(mode1, mode2, sizeof(struct mbochs_mode)) == 0; | ||
303 | } | ||
304 | |||
305 | static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset, | ||
306 | char *buf, u32 count) | ||
307 | { | ||
308 | struct device *dev = mdev_dev(mdev_state->mdev); | ||
309 | int index = (offset - PCI_BASE_ADDRESS_0) / 0x04; | ||
310 | u32 cfg_addr; | ||
311 | |||
312 | switch (offset) { | ||
313 | case PCI_BASE_ADDRESS_0: | ||
314 | case PCI_BASE_ADDRESS_2: | ||
315 | cfg_addr = *(u32 *)buf; | ||
316 | |||
317 | if (cfg_addr == 0xffffffff) { | ||
318 | cfg_addr = (cfg_addr & mdev_state->bar_mask[index]); | ||
319 | } else { | ||
320 | cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK; | ||
321 | if (cfg_addr) | ||
322 | dev_info(dev, "BAR #%d @ 0x%x\n", | ||
323 | index, cfg_addr); | ||
324 | } | ||
325 | |||
326 | cfg_addr |= (mdev_state->vconfig[offset] & | ||
327 | ~PCI_BASE_ADDRESS_MEM_MASK); | ||
328 | STORE_LE32(&mdev_state->vconfig[offset], cfg_addr); | ||
329 | break; | ||
330 | } | ||
331 | } | ||
332 | |||
333 | static void handle_mmio_write(struct mdev_state *mdev_state, u16 offset, | ||
334 | char *buf, u32 count) | ||
335 | { | ||
336 | struct device *dev = mdev_dev(mdev_state->mdev); | ||
337 | int index; | ||
338 | u16 reg16; | ||
339 | |||
340 | switch (offset) { | ||
341 | case 0x400 ... 0x41f: /* vga ioports remapped */ | ||
342 | goto unhandled; | ||
343 | case 0x500 ... 0x515: /* bochs dispi interface */ | ||
344 | if (count != 2) | ||
345 | goto unhandled; | ||
346 | index = (offset - 0x500) / 2; | ||
347 | reg16 = *(u16 *)buf; | ||
348 | if (index < ARRAY_SIZE(mdev_state->vbe)) | ||
349 | mdev_state->vbe[index] = reg16; | ||
350 | dev_dbg(dev, "%s: vbe write %d = %d (%s)\n", | ||
351 | __func__, index, reg16, vbe_name(index)); | ||
352 | break; | ||
353 | case 0x600 ... 0x607: /* qemu extended regs */ | ||
354 | goto unhandled; | ||
355 | default: | ||
356 | unhandled: | ||
357 | dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n", | ||
358 | __func__, offset, count); | ||
359 | break; | ||
360 | } | ||
361 | } | ||
362 | |||
363 | static void handle_mmio_read(struct mdev_state *mdev_state, u16 offset, | ||
364 | char *buf, u32 count) | ||
365 | { | ||
366 | struct device *dev = mdev_dev(mdev_state->mdev); | ||
367 | struct vfio_region_gfx_edid *edid; | ||
368 | u16 reg16 = 0; | ||
369 | int index; | ||
370 | |||
371 | switch (offset) { | ||
372 | case 0x000 ... 0x3ff: /* edid block */ | ||
373 | edid = &mdev_state->edid_regs; | ||
374 | if (edid->link_state != VFIO_DEVICE_GFX_LINK_STATE_UP || | ||
375 | offset >= edid->edid_size) { | ||
376 | memset(buf, 0, count); | ||
377 | break; | ||
378 | } | ||
379 | memcpy(buf, mdev_state->edid_blob + offset, count); | ||
380 | break; | ||
381 | case 0x500 ... 0x515: /* bochs dispi interface */ | ||
382 | if (count != 2) | ||
383 | goto unhandled; | ||
384 | index = (offset - 0x500) / 2; | ||
385 | if (index < ARRAY_SIZE(mdev_state->vbe)) | ||
386 | reg16 = mdev_state->vbe[index]; | ||
387 | dev_dbg(dev, "%s: vbe read %d = %d (%s)\n", | ||
388 | __func__, index, reg16, vbe_name(index)); | ||
389 | *(u16 *)buf = reg16; | ||
390 | break; | ||
391 | default: | ||
392 | unhandled: | ||
393 | dev_dbg(dev, "%s: @0x%03x, count %d (unhandled)\n", | ||
394 | __func__, offset, count); | ||
395 | memset(buf, 0, count); | ||
396 | break; | ||
397 | } | ||
398 | } | ||
399 | |||
400 | static void handle_edid_regs(struct mdev_state *mdev_state, u16 offset, | ||
401 | char *buf, u32 count, bool is_write) | ||
402 | { | ||
403 | char *regs = (void *)&mdev_state->edid_regs; | ||
404 | |||
405 | if (offset + count > sizeof(mdev_state->edid_regs)) | ||
406 | return; | ||
407 | if (count != 4) | ||
408 | return; | ||
409 | if (offset % 4) | ||
410 | return; | ||
411 | |||
412 | if (is_write) { | ||
413 | switch (offset) { | ||
414 | case offsetof(struct vfio_region_gfx_edid, link_state): | ||
415 | case offsetof(struct vfio_region_gfx_edid, edid_size): | ||
416 | memcpy(regs + offset, buf, count); | ||
417 | break; | ||
418 | default: | ||
419 | /* read-only regs */ | ||
420 | break; | ||
421 | } | ||
422 | } else { | ||
423 | memcpy(buf, regs + offset, count); | ||
424 | } | ||
425 | } | ||
426 | |||
427 | static void handle_edid_blob(struct mdev_state *mdev_state, u16 offset, | ||
428 | char *buf, u32 count, bool is_write) | ||
429 | { | ||
430 | if (offset + count > mdev_state->edid_regs.edid_max_size) | ||
431 | return; | ||
432 | if (is_write) | ||
433 | memcpy(mdev_state->edid_blob + offset, buf, count); | ||
434 | else | ||
435 | memcpy(buf, mdev_state->edid_blob + offset, count); | ||
436 | } | ||
437 | |||
438 | static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, | ||
439 | loff_t pos, bool is_write) | ||
440 | { | ||
441 | struct mdev_state *mdev_state = mdev_get_drvdata(mdev); | ||
442 | struct device *dev = mdev_dev(mdev); | ||
443 | struct page *pg; | ||
444 | loff_t poff; | ||
445 | char *map; | ||
446 | int ret = 0; | ||
447 | |||
448 | mutex_lock(&mdev_state->ops_lock); | ||
449 | |||
450 | if (pos < MBOCHS_CONFIG_SPACE_SIZE) { | ||
451 | if (is_write) | ||
452 | handle_pci_cfg_write(mdev_state, pos, buf, count); | ||
453 | else | ||
454 | memcpy(buf, (mdev_state->vconfig + pos), count); | ||
455 | |||
456 | } else if (pos >= MBOCHS_MMIO_BAR_OFFSET && | ||
457 | pos + count <= (MBOCHS_MMIO_BAR_OFFSET + | ||
458 | MBOCHS_MMIO_BAR_SIZE)) { | ||
459 | pos -= MBOCHS_MMIO_BAR_OFFSET; | ||
460 | if (is_write) | ||
461 | handle_mmio_write(mdev_state, pos, buf, count); | ||
462 | else | ||
463 | handle_mmio_read(mdev_state, pos, buf, count); | ||
464 | |||
465 | } else if (pos >= MBOCHS_EDID_OFFSET && | ||
466 | pos + count <= (MBOCHS_EDID_OFFSET + | ||
467 | MBOCHS_EDID_SIZE)) { | ||
468 | pos -= MBOCHS_EDID_OFFSET; | ||
469 | if (pos < MBOCHS_EDID_BLOB_OFFSET) { | ||
470 | handle_edid_regs(mdev_state, pos, buf, count, is_write); | ||
471 | } else { | ||
472 | pos -= MBOCHS_EDID_BLOB_OFFSET; | ||
473 | handle_edid_blob(mdev_state, pos, buf, count, is_write); | ||
474 | } | ||
475 | |||
476 | } else if (pos >= MBOCHS_MEMORY_BAR_OFFSET && | ||
477 | pos + count <= | ||
478 | MBOCHS_MEMORY_BAR_OFFSET + mdev_state->memsize) { | ||
479 | pos -= MBOCHS_MMIO_BAR_OFFSET; | ||
480 | poff = pos & ~PAGE_MASK; | ||
481 | pg = __mbochs_get_page(mdev_state, pos >> PAGE_SHIFT); | ||
482 | map = kmap(pg); | ||
483 | if (is_write) | ||
484 | memcpy(map + poff, buf, count); | ||
485 | else | ||
486 | memcpy(buf, map + poff, count); | ||
487 | kunmap(pg); | ||
488 | put_page(pg); | ||
489 | |||
490 | } else { | ||
491 | dev_dbg(dev, "%s: %s @0x%llx (unhandled)\n", | ||
492 | __func__, is_write ? "WR" : "RD", pos); | ||
493 | ret = -1; | ||
494 | goto accessfailed; | ||
495 | } | ||
496 | |||
497 | ret = count; | ||
498 | |||
499 | |||
500 | accessfailed: | ||
501 | mutex_unlock(&mdev_state->ops_lock); | ||
502 | |||
503 | return ret; | ||
504 | } | ||
505 | |||
506 | static int mbochs_reset(struct mdev_device *mdev) | ||
507 | { | ||
508 | struct mdev_state *mdev_state = mdev_get_drvdata(mdev); | ||
509 | u32 size64k = mdev_state->memsize / (64 * 1024); | ||
510 | int i; | ||
511 | |||
512 | for (i = 0; i < ARRAY_SIZE(mdev_state->vbe); i++) | ||
513 | mdev_state->vbe[i] = 0; | ||
514 | mdev_state->vbe[VBE_DISPI_INDEX_ID] = VBE_DISPI_ID5; | ||
515 | mdev_state->vbe[VBE_DISPI_INDEX_VIDEO_MEMORY_64K] = size64k; | ||
516 | return 0; | ||
517 | } | ||
518 | |||
519 | static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev) | ||
520 | { | ||
521 | const struct mbochs_type *type = mbochs_find_type(kobj); | ||
522 | struct device *dev = mdev_dev(mdev); | ||
523 | struct mdev_state *mdev_state; | ||
524 | |||
525 | if (!type) | ||
526 | type = &mbochs_types[0]; | ||
527 | if (type->mbytes + mbochs_used_mbytes > max_mbytes) | ||
528 | return -ENOMEM; | ||
529 | |||
530 | mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); | ||
531 | if (mdev_state == NULL) | ||
532 | return -ENOMEM; | ||
533 | |||
534 | mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL); | ||
535 | if (mdev_state->vconfig == NULL) | ||
536 | goto err_mem; | ||
537 | |||
538 | mdev_state->memsize = type->mbytes * 1024 * 1024; | ||
539 | mdev_state->pagecount = mdev_state->memsize >> PAGE_SHIFT; | ||
540 | mdev_state->pages = kcalloc(mdev_state->pagecount, | ||
541 | sizeof(struct page *), | ||
542 | GFP_KERNEL); | ||
543 | if (!mdev_state->pages) | ||
544 | goto err_mem; | ||
545 | |||
546 | dev_info(dev, "%s: %s, %d MB, %ld pages\n", __func__, | ||
547 | kobj->name, type->mbytes, mdev_state->pagecount); | ||
548 | |||
549 | mutex_init(&mdev_state->ops_lock); | ||
550 | mdev_state->mdev = mdev; | ||
551 | mdev_set_drvdata(mdev, mdev_state); | ||
552 | INIT_LIST_HEAD(&mdev_state->dmabufs); | ||
553 | mdev_state->next_id = 1; | ||
554 | |||
555 | mdev_state->type = type; | ||
556 | mdev_state->edid_regs.max_xres = type->max_x; | ||
557 | mdev_state->edid_regs.max_yres = type->max_y; | ||
558 | mdev_state->edid_regs.edid_offset = MBOCHS_EDID_BLOB_OFFSET; | ||
559 | mdev_state->edid_regs.edid_max_size = sizeof(mdev_state->edid_blob); | ||
560 | mbochs_create_config_space(mdev_state); | ||
561 | mbochs_reset(mdev); | ||
562 | |||
563 | mbochs_used_mbytes += type->mbytes; | ||
564 | return 0; | ||
565 | |||
566 | err_mem: | ||
567 | kfree(mdev_state->vconfig); | ||
568 | kfree(mdev_state); | ||
569 | return -ENOMEM; | ||
570 | } | ||
571 | |||
572 | static int mbochs_remove(struct mdev_device *mdev) | ||
573 | { | ||
574 | struct mdev_state *mdev_state = mdev_get_drvdata(mdev); | ||
575 | |||
576 | mbochs_used_mbytes -= mdev_state->type->mbytes; | ||
577 | mdev_set_drvdata(mdev, NULL); | ||
578 | kfree(mdev_state->pages); | ||
579 | kfree(mdev_state->vconfig); | ||
580 | kfree(mdev_state); | ||
581 | return 0; | ||
582 | } | ||
583 | |||
584 | static ssize_t mbochs_read(struct mdev_device *mdev, char __user *buf, | ||
585 | size_t count, loff_t *ppos) | ||
586 | { | ||
587 | unsigned int done = 0; | ||
588 | int ret; | ||
589 | |||
590 | while (count) { | ||
591 | size_t filled; | ||
592 | |||
593 | if (count >= 4 && !(*ppos % 4)) { | ||
594 | u32 val; | ||
595 | |||
596 | ret = mdev_access(mdev, (char *)&val, sizeof(val), | ||
597 | *ppos, false); | ||
598 | if (ret <= 0) | ||
599 | goto read_err; | ||
600 | |||
601 | if (copy_to_user(buf, &val, sizeof(val))) | ||
602 | goto read_err; | ||
603 | |||
604 | filled = 4; | ||
605 | } else if (count >= 2 && !(*ppos % 2)) { | ||
606 | u16 val; | ||
607 | |||
608 | ret = mdev_access(mdev, (char *)&val, sizeof(val), | ||
609 | *ppos, false); | ||
610 | if (ret <= 0) | ||
611 | goto read_err; | ||
612 | |||
613 | if (copy_to_user(buf, &val, sizeof(val))) | ||
614 | goto read_err; | ||
615 | |||
616 | filled = 2; | ||
617 | } else { | ||
618 | u8 val; | ||
619 | |||
620 | ret = mdev_access(mdev, (char *)&val, sizeof(val), | ||
621 | *ppos, false); | ||
622 | if (ret <= 0) | ||
623 | goto read_err; | ||
624 | |||
625 | if (copy_to_user(buf, &val, sizeof(val))) | ||
626 | goto read_err; | ||
627 | |||
628 | filled = 1; | ||
629 | } | ||
630 | |||
631 | count -= filled; | ||
632 | done += filled; | ||
633 | *ppos += filled; | ||
634 | buf += filled; | ||
635 | } | ||
636 | |||
637 | return done; | ||
638 | |||
639 | read_err: | ||
640 | return -EFAULT; | ||
641 | } | ||
642 | |||
643 | static ssize_t mbochs_write(struct mdev_device *mdev, const char __user *buf, | ||
644 | size_t count, loff_t *ppos) | ||
645 | { | ||
646 | unsigned int done = 0; | ||
647 | int ret; | ||
648 | |||
649 | while (count) { | ||
650 | size_t filled; | ||
651 | |||
652 | if (count >= 4 && !(*ppos % 4)) { | ||
653 | u32 val; | ||
654 | |||
655 | if (copy_from_user(&val, buf, sizeof(val))) | ||
656 | goto write_err; | ||
657 | |||
658 | ret = mdev_access(mdev, (char *)&val, sizeof(val), | ||
659 | *ppos, true); | ||
660 | if (ret <= 0) | ||
661 | goto write_err; | ||
662 | |||
663 | filled = 4; | ||
664 | } else if (count >= 2 && !(*ppos % 2)) { | ||
665 | u16 val; | ||
666 | |||
667 | if (copy_from_user(&val, buf, sizeof(val))) | ||
668 | goto write_err; | ||
669 | |||
670 | ret = mdev_access(mdev, (char *)&val, sizeof(val), | ||
671 | *ppos, true); | ||
672 | if (ret <= 0) | ||
673 | goto write_err; | ||
674 | |||
675 | filled = 2; | ||
676 | } else { | ||
677 | u8 val; | ||
678 | |||
679 | if (copy_from_user(&val, buf, sizeof(val))) | ||
680 | goto write_err; | ||
681 | |||
682 | ret = mdev_access(mdev, (char *)&val, sizeof(val), | ||
683 | *ppos, true); | ||
684 | if (ret <= 0) | ||
685 | goto write_err; | ||
686 | |||
687 | filled = 1; | ||
688 | } | ||
689 | count -= filled; | ||
690 | done += filled; | ||
691 | *ppos += filled; | ||
692 | buf += filled; | ||
693 | } | ||
694 | |||
695 | return done; | ||
696 | write_err: | ||
697 | return -EFAULT; | ||
698 | } | ||
699 | |||
700 | static struct page *__mbochs_get_page(struct mdev_state *mdev_state, | ||
701 | pgoff_t pgoff) | ||
702 | { | ||
703 | WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); | ||
704 | |||
705 | if (!mdev_state->pages[pgoff]) { | ||
706 | mdev_state->pages[pgoff] = | ||
707 | alloc_pages(GFP_HIGHUSER | __GFP_ZERO, 0); | ||
708 | if (!mdev_state->pages[pgoff]) | ||
709 | return NULL; | ||
710 | } | ||
711 | |||
712 | get_page(mdev_state->pages[pgoff]); | ||
713 | return mdev_state->pages[pgoff]; | ||
714 | } | ||
715 | |||
716 | static struct page *mbochs_get_page(struct mdev_state *mdev_state, | ||
717 | pgoff_t pgoff) | ||
718 | { | ||
719 | struct page *page; | ||
720 | |||
721 | if (WARN_ON(pgoff >= mdev_state->pagecount)) | ||
722 | return NULL; | ||
723 | |||
724 | mutex_lock(&mdev_state->ops_lock); | ||
725 | page = __mbochs_get_page(mdev_state, pgoff); | ||
726 | mutex_unlock(&mdev_state->ops_lock); | ||
727 | |||
728 | return page; | ||
729 | } | ||
730 | |||
731 | static void mbochs_put_pages(struct mdev_state *mdev_state) | ||
732 | { | ||
733 | struct device *dev = mdev_dev(mdev_state->mdev); | ||
734 | int i, count = 0; | ||
735 | |||
736 | WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); | ||
737 | |||
738 | for (i = 0; i < mdev_state->pagecount; i++) { | ||
739 | if (!mdev_state->pages[i]) | ||
740 | continue; | ||
741 | put_page(mdev_state->pages[i]); | ||
742 | mdev_state->pages[i] = NULL; | ||
743 | count++; | ||
744 | } | ||
745 | dev_dbg(dev, "%s: %d pages released\n", __func__, count); | ||
746 | } | ||
747 | |||
748 | static vm_fault_t mbochs_region_vm_fault(struct vm_fault *vmf) | ||
749 | { | ||
750 | struct vm_area_struct *vma = vmf->vma; | ||
751 | struct mdev_state *mdev_state = vma->vm_private_data; | ||
752 | pgoff_t page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; | ||
753 | |||
754 | if (page_offset >= mdev_state->pagecount) | ||
755 | return VM_FAULT_SIGBUS; | ||
756 | |||
757 | vmf->page = mbochs_get_page(mdev_state, page_offset); | ||
758 | if (!vmf->page) | ||
759 | return VM_FAULT_SIGBUS; | ||
760 | |||
761 | return 0; | ||
762 | } | ||
763 | |||
764 | static const struct vm_operations_struct mbochs_region_vm_ops = { | ||
765 | .fault = mbochs_region_vm_fault, | ||
766 | }; | ||
767 | |||
768 | static int mbochs_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) | ||
769 | { | ||
770 | struct mdev_state *mdev_state = mdev_get_drvdata(mdev); | ||
771 | |||
772 | if (vma->vm_pgoff != MBOCHS_MEMORY_BAR_OFFSET >> PAGE_SHIFT) | ||
773 | return -EINVAL; | ||
774 | if (vma->vm_end < vma->vm_start) | ||
775 | return -EINVAL; | ||
776 | if (vma->vm_end - vma->vm_start > mdev_state->memsize) | ||
777 | return -EINVAL; | ||
778 | if ((vma->vm_flags & VM_SHARED) == 0) | ||
779 | return -EINVAL; | ||
780 | |||
781 | vma->vm_ops = &mbochs_region_vm_ops; | ||
782 | vma->vm_private_data = mdev_state; | ||
783 | return 0; | ||
784 | } | ||
785 | |||
786 | static vm_fault_t mbochs_dmabuf_vm_fault(struct vm_fault *vmf) | ||
787 | { | ||
788 | struct vm_area_struct *vma = vmf->vma; | ||
789 | struct mbochs_dmabuf *dmabuf = vma->vm_private_data; | ||
790 | |||
791 | if (WARN_ON(vmf->pgoff >= dmabuf->pagecount)) | ||
792 | return VM_FAULT_SIGBUS; | ||
793 | |||
794 | vmf->page = dmabuf->pages[vmf->pgoff]; | ||
795 | get_page(vmf->page); | ||
796 | return 0; | ||
797 | } | ||
798 | |||
799 | static const struct vm_operations_struct mbochs_dmabuf_vm_ops = { | ||
800 | .fault = mbochs_dmabuf_vm_fault, | ||
801 | }; | ||
802 | |||
803 | static int mbochs_mmap_dmabuf(struct dma_buf *buf, struct vm_area_struct *vma) | ||
804 | { | ||
805 | struct mbochs_dmabuf *dmabuf = buf->priv; | ||
806 | struct device *dev = mdev_dev(dmabuf->mdev_state->mdev); | ||
807 | |||
808 | dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); | ||
809 | |||
810 | if ((vma->vm_flags & VM_SHARED) == 0) | ||
811 | return -EINVAL; | ||
812 | |||
813 | vma->vm_ops = &mbochs_dmabuf_vm_ops; | ||
814 | vma->vm_private_data = dmabuf; | ||
815 | return 0; | ||
816 | } | ||
817 | |||
818 | static void mbochs_print_dmabuf(struct mbochs_dmabuf *dmabuf, | ||
819 | const char *prefix) | ||
820 | { | ||
821 | struct device *dev = mdev_dev(dmabuf->mdev_state->mdev); | ||
822 | u32 fourcc = dmabuf->mode.drm_format; | ||
823 | |||
824 | dev_dbg(dev, "%s/%d: %c%c%c%c, %dx%d, stride %d, off 0x%llx, size 0x%llx, pages %ld\n", | ||
825 | prefix, dmabuf->id, | ||
826 | fourcc ? ((fourcc >> 0) & 0xff) : '-', | ||
827 | fourcc ? ((fourcc >> 8) & 0xff) : '-', | ||
828 | fourcc ? ((fourcc >> 16) & 0xff) : '-', | ||
829 | fourcc ? ((fourcc >> 24) & 0xff) : '-', | ||
830 | dmabuf->mode.width, dmabuf->mode.height, dmabuf->mode.stride, | ||
831 | dmabuf->mode.offset, dmabuf->mode.size, dmabuf->pagecount); | ||
832 | } | ||
833 | |||
834 | static struct sg_table *mbochs_map_dmabuf(struct dma_buf_attachment *at, | ||
835 | enum dma_data_direction direction) | ||
836 | { | ||
837 | struct mbochs_dmabuf *dmabuf = at->dmabuf->priv; | ||
838 | struct device *dev = mdev_dev(dmabuf->mdev_state->mdev); | ||
839 | struct sg_table *sg; | ||
840 | |||
841 | dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); | ||
842 | |||
843 | sg = kzalloc(sizeof(*sg), GFP_KERNEL); | ||
844 | if (!sg) | ||
845 | goto err1; | ||
846 | if (sg_alloc_table_from_pages(sg, dmabuf->pages, dmabuf->pagecount, | ||
847 | 0, dmabuf->mode.size, GFP_KERNEL) < 0) | ||
848 | goto err2; | ||
849 | if (dma_map_sgtable(at->dev, sg, direction, 0)) | ||
850 | goto err3; | ||
851 | |||
852 | return sg; | ||
853 | |||
854 | err3: | ||
855 | sg_free_table(sg); | ||
856 | err2: | ||
857 | kfree(sg); | ||
858 | err1: | ||
859 | return ERR_PTR(-ENOMEM); | ||
860 | } | ||
861 | |||
862 | static void mbochs_unmap_dmabuf(struct dma_buf_attachment *at, | ||
863 | struct sg_table *sg, | ||
864 | enum dma_data_direction direction) | ||
865 | { | ||
866 | struct mbochs_dmabuf *dmabuf = at->dmabuf->priv; | ||
867 | struct device *dev = mdev_dev(dmabuf->mdev_state->mdev); | ||
868 | |||
869 | dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); | ||
870 | |||
871 | dma_unmap_sgtable(at->dev, sg, direction, 0); | ||
872 | sg_free_table(sg); | ||
873 | kfree(sg); | ||
874 | } | ||
875 | |||
876 | static void mbochs_release_dmabuf(struct dma_buf *buf) | ||
877 | { | ||
878 | struct mbochs_dmabuf *dmabuf = buf->priv; | ||
879 | struct mdev_state *mdev_state = dmabuf->mdev_state; | ||
880 | struct device *dev = mdev_dev(mdev_state->mdev); | ||
881 | pgoff_t pg; | ||
882 | |||
883 | dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); | ||
884 | |||
885 | for (pg = 0; pg < dmabuf->pagecount; pg++) | ||
886 | put_page(dmabuf->pages[pg]); | ||
887 | |||
888 | mutex_lock(&mdev_state->ops_lock); | ||
889 | dmabuf->buf = NULL; | ||
890 | if (dmabuf->unlinked) | ||
891 | kfree(dmabuf); | ||
892 | mutex_unlock(&mdev_state->ops_lock); | ||
893 | } | ||
894 | |||
895 | static struct dma_buf_ops mbochs_dmabuf_ops = { | ||
896 | .map_dma_buf = mbochs_map_dmabuf, | ||
897 | .unmap_dma_buf = mbochs_unmap_dmabuf, | ||
898 | .release = mbochs_release_dmabuf, | ||
899 | .mmap = mbochs_mmap_dmabuf, | ||
900 | }; | ||
901 | |||
902 | static struct mbochs_dmabuf *mbochs_dmabuf_alloc(struct mdev_state *mdev_state, | ||
903 | struct mbochs_mode *mode) | ||
904 | { | ||
905 | struct mbochs_dmabuf *dmabuf; | ||
906 | pgoff_t page_offset, pg; | ||
907 | |||
908 | WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); | ||
909 | |||
910 | dmabuf = kzalloc(sizeof(struct mbochs_dmabuf), GFP_KERNEL); | ||
911 | if (!dmabuf) | ||
912 | return NULL; | ||
913 | |||
914 | dmabuf->mode = *mode; | ||
915 | dmabuf->id = mdev_state->next_id++; | ||
916 | dmabuf->pagecount = DIV_ROUND_UP(mode->size, PAGE_SIZE); | ||
917 | dmabuf->pages = kcalloc(dmabuf->pagecount, sizeof(struct page *), | ||
918 | GFP_KERNEL); | ||
919 | if (!dmabuf->pages) | ||
920 | goto err_free_dmabuf; | ||
921 | |||
922 | page_offset = dmabuf->mode.offset >> PAGE_SHIFT; | ||
923 | for (pg = 0; pg < dmabuf->pagecount; pg++) { | ||
924 | dmabuf->pages[pg] = __mbochs_get_page(mdev_state, | ||
925 | page_offset + pg); | ||
926 | if (!dmabuf->pages[pg]) | ||
927 | goto err_free_pages; | ||
928 | } | ||
929 | |||
930 | dmabuf->mdev_state = mdev_state; | ||
931 | list_add(&dmabuf->next, &mdev_state->dmabufs); | ||
932 | |||
933 | mbochs_print_dmabuf(dmabuf, __func__); | ||
934 | return dmabuf; | ||
935 | |||
936 | err_free_pages: | ||
937 | while (pg > 0) | ||
938 | put_page(dmabuf->pages[--pg]); | ||
939 | kfree(dmabuf->pages); | ||
940 | err_free_dmabuf: | ||
941 | kfree(dmabuf); | ||
942 | return NULL; | ||
943 | } | ||
944 | |||
945 | static struct mbochs_dmabuf * | ||
946 | mbochs_dmabuf_find_by_mode(struct mdev_state *mdev_state, | ||
947 | struct mbochs_mode *mode) | ||
948 | { | ||
949 | struct mbochs_dmabuf *dmabuf; | ||
950 | |||
951 | WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); | ||
952 | |||
953 | list_for_each_entry(dmabuf, &mdev_state->dmabufs, next) | ||
954 | if (mbochs_modes_equal(&dmabuf->mode, mode)) | ||
955 | return dmabuf; | ||
956 | |||
957 | return NULL; | ||
958 | } | ||
959 | |||
960 | static struct mbochs_dmabuf * | ||
961 | mbochs_dmabuf_find_by_id(struct mdev_state *mdev_state, u32 id) | ||
962 | { | ||
963 | struct mbochs_dmabuf *dmabuf; | ||
964 | |||
965 | WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); | ||
966 | |||
967 | list_for_each_entry(dmabuf, &mdev_state->dmabufs, next) | ||
968 | if (dmabuf->id == id) | ||
969 | return dmabuf; | ||
970 | |||
971 | return NULL; | ||
972 | } | ||
973 | |||
974 | static int mbochs_dmabuf_export(struct mbochs_dmabuf *dmabuf) | ||
975 | { | ||
976 | struct mdev_state *mdev_state = dmabuf->mdev_state; | ||
977 | struct device *dev = mdev_dev(mdev_state->mdev); | ||
978 | DEFINE_DMA_BUF_EXPORT_INFO(exp_info); | ||
979 | struct dma_buf *buf; | ||
980 | |||
981 | WARN_ON(!mutex_is_locked(&mdev_state->ops_lock)); | ||
982 | |||
983 | if (!IS_ALIGNED(dmabuf->mode.offset, PAGE_SIZE)) { | ||
984 | dev_info_ratelimited(dev, "%s: framebuffer not page-aligned\n", | ||
985 | __func__); | ||
986 | return -EINVAL; | ||
987 | } | ||
988 | |||
989 | exp_info.ops = &mbochs_dmabuf_ops; | ||
990 | exp_info.size = dmabuf->mode.size; | ||
991 | exp_info.priv = dmabuf; | ||
992 | |||
993 | buf = dma_buf_export(&exp_info); | ||
994 | if (IS_ERR(buf)) { | ||
995 | dev_info_ratelimited(dev, "%s: dma_buf_export failed: %ld\n", | ||
996 | __func__, PTR_ERR(buf)); | ||
997 | return PTR_ERR(buf); | ||
998 | } | ||
999 | |||
1000 | dmabuf->buf = buf; | ||
1001 | dev_dbg(dev, "%s: %d\n", __func__, dmabuf->id); | ||
1002 | return 0; | ||
1003 | } | ||
1004 | |||
1005 | static int mbochs_get_region_info(struct mdev_device *mdev, | ||
1006 | struct vfio_region_info_ext *ext) | ||
1007 | { | ||
1008 | struct vfio_region_info *region_info = &ext->base; | ||
1009 | struct mdev_state *mdev_state; | ||
1010 | |||
1011 | mdev_state = mdev_get_drvdata(mdev); | ||
1012 | if (!mdev_state) | ||
1013 | return -EINVAL; | ||
1014 | |||
1015 | if (region_info->index >= MBOCHS_NUM_REGIONS) | ||
1016 | return -EINVAL; | ||
1017 | |||
1018 | switch (region_info->index) { | ||
1019 | case VFIO_PCI_CONFIG_REGION_INDEX: | ||
1020 | region_info->offset = 0; | ||
1021 | region_info->size = MBOCHS_CONFIG_SPACE_SIZE; | ||
1022 | region_info->flags = (VFIO_REGION_INFO_FLAG_READ | | ||
1023 | VFIO_REGION_INFO_FLAG_WRITE); | ||
1024 | break; | ||
1025 | case VFIO_PCI_BAR0_REGION_INDEX: | ||
1026 | region_info->offset = MBOCHS_MEMORY_BAR_OFFSET; | ||
1027 | region_info->size = mdev_state->memsize; | ||
1028 | region_info->flags = (VFIO_REGION_INFO_FLAG_READ | | ||
1029 | VFIO_REGION_INFO_FLAG_WRITE | | ||
1030 | VFIO_REGION_INFO_FLAG_MMAP); | ||
1031 | break; | ||
1032 | case VFIO_PCI_BAR2_REGION_INDEX: | ||
1033 | region_info->offset = MBOCHS_MMIO_BAR_OFFSET; | ||
1034 | region_info->size = MBOCHS_MMIO_BAR_SIZE; | ||
1035 | region_info->flags = (VFIO_REGION_INFO_FLAG_READ | | ||
1036 | VFIO_REGION_INFO_FLAG_WRITE); | ||
1037 | break; | ||
1038 | case MBOCHS_EDID_REGION_INDEX: | ||
1039 | ext->base.argsz = sizeof(*ext); | ||
1040 | ext->base.offset = MBOCHS_EDID_OFFSET; | ||
1041 | ext->base.size = MBOCHS_EDID_SIZE; | ||
1042 | ext->base.flags = (VFIO_REGION_INFO_FLAG_READ | | ||
1043 | VFIO_REGION_INFO_FLAG_WRITE | | ||
1044 | VFIO_REGION_INFO_FLAG_CAPS); | ||
1045 | ext->base.cap_offset = offsetof(typeof(*ext), type); | ||
1046 | ext->type.header.id = VFIO_REGION_INFO_CAP_TYPE; | ||
1047 | ext->type.header.version = 1; | ||
1048 | ext->type.header.next = 0; | ||
1049 | ext->type.type = VFIO_REGION_TYPE_GFX; | ||
1050 | ext->type.subtype = VFIO_REGION_SUBTYPE_GFX_EDID; | ||
1051 | break; | ||
1052 | default: | ||
1053 | region_info->size = 0; | ||
1054 | region_info->offset = 0; | ||
1055 | region_info->flags = 0; | ||
1056 | } | ||
1057 | |||
1058 | return 0; | ||
1059 | } | ||
1060 | |||
1061 | static int mbochs_get_irq_info(struct mdev_device *mdev, | ||
1062 | struct vfio_irq_info *irq_info) | ||
1063 | { | ||
1064 | irq_info->count = 0; | ||
1065 | return 0; | ||
1066 | } | ||
1067 | |||
1068 | static int mbochs_get_device_info(struct mdev_device *mdev, | ||
1069 | struct vfio_device_info *dev_info) | ||
1070 | { | ||
1071 | dev_info->flags = VFIO_DEVICE_FLAGS_PCI; | ||
1072 | dev_info->num_regions = MBOCHS_NUM_REGIONS; | ||
1073 | dev_info->num_irqs = VFIO_PCI_NUM_IRQS; | ||
1074 | return 0; | ||
1075 | } | ||
1076 | |||
1077 | static int mbochs_query_gfx_plane(struct mdev_device *mdev, | ||
1078 | struct vfio_device_gfx_plane_info *plane) | ||
1079 | { | ||
1080 | struct mdev_state *mdev_state = mdev_get_drvdata(mdev); | ||
1081 | struct device *dev = mdev_dev(mdev); | ||
1082 | struct mbochs_dmabuf *dmabuf; | ||
1083 | struct mbochs_mode mode; | ||
1084 | int ret; | ||
1085 | |||
1086 | if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) { | ||
1087 | if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE | | ||
1088 | VFIO_GFX_PLANE_TYPE_DMABUF)) | ||
1089 | return 0; | ||
1090 | return -EINVAL; | ||
1091 | } | ||
1092 | |||
1093 | if (plane->flags != VFIO_GFX_PLANE_TYPE_DMABUF) | ||
1094 | return -EINVAL; | ||
1095 | |||
1096 | plane->drm_format_mod = 0; | ||
1097 | plane->x_pos = 0; | ||
1098 | plane->y_pos = 0; | ||
1099 | plane->x_hot = 0; | ||
1100 | plane->y_hot = 0; | ||
1101 | |||
1102 | mutex_lock(&mdev_state->ops_lock); | ||
1103 | |||
1104 | ret = -EINVAL; | ||
1105 | if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY) | ||
1106 | ret = mbochs_check_framebuffer(mdev_state, &mode); | ||
1107 | if (ret < 0) { | ||
1108 | plane->drm_format = 0; | ||
1109 | plane->width = 0; | ||
1110 | plane->height = 0; | ||
1111 | plane->stride = 0; | ||
1112 | plane->size = 0; | ||
1113 | plane->dmabuf_id = 0; | ||
1114 | goto done; | ||
1115 | } | ||
1116 | |||
1117 | dmabuf = mbochs_dmabuf_find_by_mode(mdev_state, &mode); | ||
1118 | if (!dmabuf) | ||
1119 | dmabuf = mbochs_dmabuf_alloc(mdev_state, &mode); | ||
1120 | if (!dmabuf) { | ||
1121 | mutex_unlock(&mdev_state->ops_lock); | ||
1122 | return -ENOMEM; | ||
1123 | } | ||
1124 | |||
1125 | plane->drm_format = dmabuf->mode.drm_format; | ||
1126 | plane->width = dmabuf->mode.width; | ||
1127 | plane->height = dmabuf->mode.height; | ||
1128 | plane->stride = dmabuf->mode.stride; | ||
1129 | plane->size = dmabuf->mode.size; | ||
1130 | plane->dmabuf_id = dmabuf->id; | ||
1131 | |||
1132 | done: | ||
1133 | if (plane->drm_plane_type == DRM_PLANE_TYPE_PRIMARY && | ||
1134 | mdev_state->active_id != plane->dmabuf_id) { | ||
1135 | dev_dbg(dev, "%s: primary: %d => %d\n", __func__, | ||
1136 | mdev_state->active_id, plane->dmabuf_id); | ||
1137 | mdev_state->active_id = plane->dmabuf_id; | ||
1138 | } | ||
1139 | mutex_unlock(&mdev_state->ops_lock); | ||
1140 | return 0; | ||
1141 | } | ||
1142 | |||
1143 | static int mbochs_get_gfx_dmabuf(struct mdev_device *mdev, | ||
1144 | u32 id) | ||
1145 | { | ||
1146 | struct mdev_state *mdev_state = mdev_get_drvdata(mdev); | ||
1147 | struct mbochs_dmabuf *dmabuf; | ||
1148 | |||
1149 | mutex_lock(&mdev_state->ops_lock); | ||
1150 | |||
1151 | dmabuf = mbochs_dmabuf_find_by_id(mdev_state, id); | ||
1152 | if (!dmabuf) { | ||
1153 | mutex_unlock(&mdev_state->ops_lock); | ||
1154 | return -ENOENT; | ||
1155 | } | ||
1156 | |||
1157 | if (!dmabuf->buf) | ||
1158 | mbochs_dmabuf_export(dmabuf); | ||
1159 | |||
1160 | mutex_unlock(&mdev_state->ops_lock); | ||
1161 | |||
1162 | if (!dmabuf->buf) | ||
1163 | return -EINVAL; | ||
1164 | |||
1165 | return dma_buf_fd(dmabuf->buf, 0); | ||
1166 | } | ||
1167 | |||
1168 | static long mbochs_ioctl(struct mdev_device *mdev, unsigned int cmd, | ||
1169 | unsigned long arg) | ||
1170 | { | ||
1171 | int ret = 0; | ||
1172 | unsigned long minsz, outsz; | ||
1173 | |||
1174 | switch (cmd) { | ||
1175 | case VFIO_DEVICE_GET_INFO: | ||
1176 | { | ||
1177 | struct vfio_device_info info; | ||
1178 | |||
1179 | minsz = offsetofend(struct vfio_device_info, num_irqs); | ||
1180 | |||
1181 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
1182 | return -EFAULT; | ||
1183 | |||
1184 | if (info.argsz < minsz) | ||
1185 | return -EINVAL; | ||
1186 | |||
1187 | ret = mbochs_get_device_info(mdev, &info); | ||
1188 | if (ret) | ||
1189 | return ret; | ||
1190 | |||
1191 | if (copy_to_user((void __user *)arg, &info, minsz)) | ||
1192 | return -EFAULT; | ||
1193 | |||
1194 | return 0; | ||
1195 | } | ||
1196 | case VFIO_DEVICE_GET_REGION_INFO: | ||
1197 | { | ||
1198 | struct vfio_region_info_ext info; | ||
1199 | |||
1200 | minsz = offsetofend(typeof(info), base.offset); | ||
1201 | |||
1202 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
1203 | return -EFAULT; | ||
1204 | |||
1205 | outsz = info.base.argsz; | ||
1206 | if (outsz < minsz) | ||
1207 | return -EINVAL; | ||
1208 | if (outsz > sizeof(info)) | ||
1209 | return -EINVAL; | ||
1210 | |||
1211 | ret = mbochs_get_region_info(mdev, &info); | ||
1212 | if (ret) | ||
1213 | return ret; | ||
1214 | |||
1215 | if (copy_to_user((void __user *)arg, &info, outsz)) | ||
1216 | return -EFAULT; | ||
1217 | |||
1218 | return 0; | ||
1219 | } | ||
1220 | |||
1221 | case VFIO_DEVICE_GET_IRQ_INFO: | ||
1222 | { | ||
1223 | struct vfio_irq_info info; | ||
1224 | |||
1225 | minsz = offsetofend(struct vfio_irq_info, count); | ||
1226 | |||
1227 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
1228 | return -EFAULT; | ||
1229 | |||
1230 | if ((info.argsz < minsz) || | ||
1231 | (info.index >= VFIO_PCI_NUM_IRQS)) | ||
1232 | return -EINVAL; | ||
1233 | |||
1234 | ret = mbochs_get_irq_info(mdev, &info); | ||
1235 | if (ret) | ||
1236 | return ret; | ||
1237 | |||
1238 | if (copy_to_user((void __user *)arg, &info, minsz)) | ||
1239 | return -EFAULT; | ||
1240 | |||
1241 | return 0; | ||
1242 | } | ||
1243 | |||
1244 | case VFIO_DEVICE_QUERY_GFX_PLANE: | ||
1245 | { | ||
1246 | struct vfio_device_gfx_plane_info plane; | ||
1247 | |||
1248 | minsz = offsetofend(struct vfio_device_gfx_plane_info, | ||
1249 | region_index); | ||
1250 | |||
1251 | if (copy_from_user(&plane, (void __user *)arg, minsz)) | ||
1252 | return -EFAULT; | ||
1253 | |||
1254 | if (plane.argsz < minsz) | ||
1255 | return -EINVAL; | ||
1256 | |||
1257 | ret = mbochs_query_gfx_plane(mdev, &plane); | ||
1258 | if (ret) | ||
1259 | return ret; | ||
1260 | |||
1261 | if (copy_to_user((void __user *)arg, &plane, minsz)) | ||
1262 | return -EFAULT; | ||
1263 | |||
1264 | return 0; | ||
1265 | } | ||
1266 | |||
1267 | case VFIO_DEVICE_GET_GFX_DMABUF: | ||
1268 | { | ||
1269 | u32 dmabuf_id; | ||
1270 | |||
1271 | if (get_user(dmabuf_id, (__u32 __user *)arg)) | ||
1272 | return -EFAULT; | ||
1273 | |||
1274 | return mbochs_get_gfx_dmabuf(mdev, dmabuf_id); | ||
1275 | } | ||
1276 | |||
1277 | case VFIO_DEVICE_SET_IRQS: | ||
1278 | return -EINVAL; | ||
1279 | |||
1280 | case VFIO_DEVICE_RESET: | ||
1281 | return mbochs_reset(mdev); | ||
1282 | } | ||
1283 | return -ENOTTY; | ||
1284 | } | ||
1285 | |||
1286 | static int mbochs_open(struct mdev_device *mdev) | ||
1287 | { | ||
1288 | if (!try_module_get(THIS_MODULE)) | ||
1289 | return -ENODEV; | ||
1290 | |||
1291 | return 0; | ||
1292 | } | ||
1293 | |||
1294 | static void mbochs_close(struct mdev_device *mdev) | ||
1295 | { | ||
1296 | struct mdev_state *mdev_state = mdev_get_drvdata(mdev); | ||
1297 | struct mbochs_dmabuf *dmabuf, *tmp; | ||
1298 | |||
1299 | mutex_lock(&mdev_state->ops_lock); | ||
1300 | |||
1301 | list_for_each_entry_safe(dmabuf, tmp, &mdev_state->dmabufs, next) { | ||
1302 | list_del(&dmabuf->next); | ||
1303 | if (dmabuf->buf) { | ||
1304 | /* free in mbochs_release_dmabuf() */ | ||
1305 | dmabuf->unlinked = true; | ||
1306 | } else { | ||
1307 | kfree(dmabuf); | ||
1308 | } | ||
1309 | } | ||
1310 | mbochs_put_pages(mdev_state); | ||
1311 | |||
1312 | mutex_unlock(&mdev_state->ops_lock); | ||
1313 | module_put(THIS_MODULE); | ||
1314 | } | ||
1315 | |||
1316 | static ssize_t | ||
1317 | memory_show(struct device *dev, struct device_attribute *attr, | ||
1318 | char *buf) | ||
1319 | { | ||
1320 | struct mdev_device *mdev = mdev_from_dev(dev); | ||
1321 | struct mdev_state *mdev_state = mdev_get_drvdata(mdev); | ||
1322 | |||
1323 | return sprintf(buf, "%d MB\n", mdev_state->type->mbytes); | ||
1324 | } | ||
1325 | static DEVICE_ATTR_RO(memory); | ||
1326 | |||
1327 | static struct attribute *mdev_dev_attrs[] = { | ||
1328 | &dev_attr_memory.attr, | ||
1329 | NULL, | ||
1330 | }; | ||
1331 | |||
1332 | static const struct attribute_group mdev_dev_group = { | ||
1333 | .name = "vendor", | ||
1334 | .attrs = mdev_dev_attrs, | ||
1335 | }; | ||
1336 | |||
1337 | const struct attribute_group *mdev_dev_groups[] = { | ||
1338 | &mdev_dev_group, | ||
1339 | NULL, | ||
1340 | }; | ||
1341 | |||
1342 | static ssize_t | ||
1343 | name_show(struct kobject *kobj, struct device *dev, char *buf) | ||
1344 | { | ||
1345 | return sprintf(buf, "%s\n", kobj->name); | ||
1346 | } | ||
1347 | MDEV_TYPE_ATTR_RO(name); | ||
1348 | |||
1349 | static ssize_t | ||
1350 | description_show(struct kobject *kobj, struct device *dev, char *buf) | ||
1351 | { | ||
1352 | const struct mbochs_type *type = mbochs_find_type(kobj); | ||
1353 | |||
1354 | return sprintf(buf, "virtual display, %d MB video memory\n", | ||
1355 | type ? type->mbytes : 0); | ||
1356 | } | ||
1357 | MDEV_TYPE_ATTR_RO(description); | ||
1358 | |||
1359 | static ssize_t | ||
1360 | available_instances_show(struct kobject *kobj, struct device *dev, char *buf) | ||
1361 | { | ||
1362 | const struct mbochs_type *type = mbochs_find_type(kobj); | ||
1363 | int count = (max_mbytes - mbochs_used_mbytes) / type->mbytes; | ||
1364 | |||
1365 | return sprintf(buf, "%d\n", count); | ||
1366 | } | ||
1367 | MDEV_TYPE_ATTR_RO(available_instances); | ||
1368 | |||
1369 | static ssize_t device_api_show(struct kobject *kobj, struct device *dev, | ||
1370 | char *buf) | ||
1371 | { | ||
1372 | return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); | ||
1373 | } | ||
1374 | MDEV_TYPE_ATTR_RO(device_api); | ||
1375 | |||
1376 | static struct attribute *mdev_types_attrs[] = { | ||
1377 | &mdev_type_attr_name.attr, | ||
1378 | &mdev_type_attr_description.attr, | ||
1379 | &mdev_type_attr_device_api.attr, | ||
1380 | &mdev_type_attr_available_instances.attr, | ||
1381 | NULL, | ||
1382 | }; | ||
1383 | |||
1384 | static struct attribute_group mdev_type_group1 = { | ||
1385 | .name = MBOCHS_TYPE_1, | ||
1386 | .attrs = mdev_types_attrs, | ||
1387 | }; | ||
1388 | |||
1389 | static struct attribute_group mdev_type_group2 = { | ||
1390 | .name = MBOCHS_TYPE_2, | ||
1391 | .attrs = mdev_types_attrs, | ||
1392 | }; | ||
1393 | |||
1394 | static struct attribute_group mdev_type_group3 = { | ||
1395 | .name = MBOCHS_TYPE_3, | ||
1396 | .attrs = mdev_types_attrs, | ||
1397 | }; | ||
1398 | |||
1399 | static struct attribute_group *mdev_type_groups[] = { | ||
1400 | &mdev_type_group1, | ||
1401 | &mdev_type_group2, | ||
1402 | &mdev_type_group3, | ||
1403 | NULL, | ||
1404 | }; | ||
1405 | |||
1406 | static const struct mdev_parent_ops mdev_fops = { | ||
1407 | .owner = THIS_MODULE, | ||
1408 | .mdev_attr_groups = mdev_dev_groups, | ||
1409 | .supported_type_groups = mdev_type_groups, | ||
1410 | .create = mbochs_create, | ||
1411 | .remove = mbochs_remove, | ||
1412 | .open = mbochs_open, | ||
1413 | .release = mbochs_close, | ||
1414 | .read = mbochs_read, | ||
1415 | .write = mbochs_write, | ||
1416 | .ioctl = mbochs_ioctl, | ||
1417 | .mmap = mbochs_mmap, | ||
1418 | }; | ||
1419 | |||
1420 | static const struct file_operations vd_fops = { | ||
1421 | .owner = THIS_MODULE, | ||
1422 | }; | ||
1423 | |||
1424 | static void mbochs_device_release(struct device *dev) | ||
1425 | { | ||
1426 | /* nothing */ | ||
1427 | } | ||
1428 | |||
1429 | static int __init mbochs_dev_init(void) | ||
1430 | { | ||
1431 | int ret = 0; | ||
1432 | |||
1433 | ret = alloc_chrdev_region(&mbochs_devt, 0, MINORMASK + 1, MBOCHS_NAME); | ||
1434 | if (ret < 0) { | ||
1435 | pr_err("Error: failed to register mbochs_dev, err: %d\n", ret); | ||
1436 | return ret; | ||
1437 | } | ||
1438 | cdev_init(&mbochs_cdev, &vd_fops); | ||
1439 | cdev_add(&mbochs_cdev, mbochs_devt, MINORMASK + 1); | ||
1440 | pr_info("%s: major %d\n", __func__, MAJOR(mbochs_devt)); | ||
1441 | |||
1442 | mbochs_class = class_create(THIS_MODULE, MBOCHS_CLASS_NAME); | ||
1443 | if (IS_ERR(mbochs_class)) { | ||
1444 | pr_err("Error: failed to register mbochs_dev class\n"); | ||
1445 | ret = PTR_ERR(mbochs_class); | ||
1446 | goto failed1; | ||
1447 | } | ||
1448 | mbochs_dev.class = mbochs_class; | ||
1449 | mbochs_dev.release = mbochs_device_release; | ||
1450 | dev_set_name(&mbochs_dev, "%s", MBOCHS_NAME); | ||
1451 | |||
1452 | ret = device_register(&mbochs_dev); | ||
1453 | if (ret) | ||
1454 | goto failed2; | ||
1455 | |||
1456 | ret = mdev_register_device(&mbochs_dev, &mdev_fops); | ||
1457 | if (ret) | ||
1458 | goto failed3; | ||
1459 | |||
1460 | return 0; | ||
1461 | |||
1462 | failed3: | ||
1463 | device_unregister(&mbochs_dev); | ||
1464 | failed2: | ||
1465 | class_destroy(mbochs_class); | ||
1466 | failed1: | ||
1467 | cdev_del(&mbochs_cdev); | ||
1468 | unregister_chrdev_region(mbochs_devt, MINORMASK + 1); | ||
1469 | return ret; | ||
1470 | } | ||
1471 | |||
1472 | static void __exit mbochs_dev_exit(void) | ||
1473 | { | ||
1474 | mbochs_dev.bus = NULL; | ||
1475 | mdev_unregister_device(&mbochs_dev); | ||
1476 | |||
1477 | device_unregister(&mbochs_dev); | ||
1478 | cdev_del(&mbochs_cdev); | ||
1479 | unregister_chrdev_region(mbochs_devt, MINORMASK + 1); | ||
1480 | class_destroy(mbochs_class); | ||
1481 | mbochs_class = NULL; | ||
1482 | } | ||
1483 | |||
1484 | module_init(mbochs_dev_init) | ||
1485 | module_exit(mbochs_dev_exit) | ||
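For context, the sketch below shows how a userspace display client (QEMU's VFIO display support works along these lines) would drive the ioctls implemented above. It is a minimal sketch, not part of the sample: device_fd is assumed to be an already-open VFIO device file descriptor for an mbochs instance, obtained through the usual VFIO group/container setup after the instance has been created by writing a UUID to the type's create attribute in sysfs; the literal 1 stands in for DRM_PLANE_TYPE_PRIMARY, which has no UAPI header.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Query the primary plane and turn its dma-buf id into a file descriptor. */
static int mbochs_query_primary(int device_fd)
{
	struct vfio_device_gfx_plane_info plane;
	__u32 dmabuf_id;

	memset(&plane, 0, sizeof(plane));
	plane.argsz = sizeof(plane);
	plane.flags = VFIO_GFX_PLANE_TYPE_DMABUF;
	plane.drm_plane_type = 1;	/* DRM_PLANE_TYPE_PRIMARY (assumed value) */

	/* handled by mbochs_query_gfx_plane() above */
	if (ioctl(device_fd, VFIO_DEVICE_QUERY_GFX_PLANE, &plane) < 0)
		return -1;

	printf("primary plane: %ux%u, stride %u, format 0x%x, dmabuf id %u\n",
	       plane.width, plane.height, plane.stride,
	       plane.drm_format, plane.dmabuf_id);

	/* handled by mbochs_get_gfx_dmabuf(); the ioctl returns the new fd */
	dmabuf_id = plane.dmabuf_id;
	return ioctl(device_fd, VFIO_DEVICE_GET_GFX_DMABUF, &dmabuf_id);
}

The fd returned by VFIO_DEVICE_GET_GFX_DMABUF refers to the dma-buf exported by mbochs_dmabuf_export() above; it can be mmap()ed or imported into a rendering API, and mbochs_mmap_dmabuf() backs such mappings with the same pages that hold the guest framebuffer.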
diff --git a/samples/vfio-mdev/mdpy-defs.h b/samples/vfio-mdev/mdpy-defs.h new file mode 100644 index 000000000..961c55ec3 --- /dev/null +++ b/samples/vfio-mdev/mdpy-defs.h | |||
@@ -0,0 +1,22 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | /* | ||
3 | * Simple pci display device. | ||
4 | * | ||
5 | * Framebuffer memory is pci bar 0. | ||
6 | * Configuration (read-only) is in pci config space. | ||
7 | * Format field uses drm fourcc codes. | ||
8 | * At the moment, only DRM_FORMAT_XRGB8888 is supported. | ||
9 | */ | ||
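/*
 * Illustrative sketch, not part of the original header: with
 * MDPY_VENDORCAP_OFFSET at 0x40, the format, width and height registers
 * land at config space offsets 0x44, 0x48 and 0x4c.  A guest driver
 * (assuming it holds a struct pci_dev *pdev for the device) reads them as:
 *
 *	u32 format, width, height;
 *
 *	pci_read_config_dword(pdev, MDPY_FORMAT_OFFSET, &format);
 *	pci_read_config_dword(pdev, MDPY_WIDTH_OFFSET,  &width);
 *	pci_read_config_dword(pdev, MDPY_HEIGHT_OFFSET, &height);
 *
 * mdpy-fb.c below follows exactly this pattern in its probe function.
 */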
10 | |||
11 | /* pci ids */ | ||
12 | #define MDPY_PCI_VENDOR_ID PCI_VENDOR_ID_REDHAT | ||
13 | #define MDPY_PCI_DEVICE_ID 0x000f | ||
14 | #define MDPY_PCI_SUBVENDOR_ID PCI_SUBVENDOR_ID_REDHAT_QUMRANET | ||
15 | #define MDPY_PCI_SUBDEVICE_ID PCI_SUBDEVICE_ID_QEMU | ||
16 | |||
17 | /* pci cfg space offsets for fb config (dword) */ | ||
18 | #define MDPY_VENDORCAP_OFFSET 0x40 | ||
19 | #define MDPY_VENDORCAP_SIZE 0x10 | ||
20 | #define MDPY_FORMAT_OFFSET (MDPY_VENDORCAP_OFFSET + 0x04) | ||
21 | #define MDPY_WIDTH_OFFSET (MDPY_VENDORCAP_OFFSET + 0x08) | ||
22 | #define MDPY_HEIGHT_OFFSET (MDPY_VENDORCAP_OFFSET + 0x0c) | ||
diff --git a/samples/vfio-mdev/mdpy-fb.c b/samples/vfio-mdev/mdpy-fb.c new file mode 100644 index 000000000..4eb7aa11c --- /dev/null +++ b/samples/vfio-mdev/mdpy-fb.c | |||
@@ -0,0 +1,243 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Framebuffer driver for mdpy (mediated virtual pci display device). | ||
4 | * | ||
5 | * See mdpy-defs.h for device specs | ||
6 | * | ||
7 | * (c) Gerd Hoffmann <kraxel@redhat.com> | ||
8 | * | ||
9 | * Using some code snippets from simplefb and cirrusfb. | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify it | ||
12 | * under the terms and conditions of the GNU General Public License, | ||
13 | * version 2, as published by the Free Software Foundation. | ||
14 | * | ||
15 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
16 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
17 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
18 | * more details. | ||
19 | */ | ||
20 | #include <linux/errno.h> | ||
21 | #include <linux/fb.h> | ||
22 | #include <linux/io.h> | ||
23 | #include <linux/pci.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <drm/drm_fourcc.h> | ||
26 | #include "mdpy-defs.h" | ||
27 | |||
28 | static const struct fb_fix_screeninfo mdpy_fb_fix = { | ||
29 | .id = "mdpy-fb", | ||
30 | .type = FB_TYPE_PACKED_PIXELS, | ||
31 | .visual = FB_VISUAL_TRUECOLOR, | ||
32 | .accel = FB_ACCEL_NONE, | ||
33 | }; | ||
34 | |||
35 | static const struct fb_var_screeninfo mdpy_fb_var = { | ||
36 | .height = -1, | ||
37 | .width = -1, | ||
38 | .activate = FB_ACTIVATE_NOW, | ||
39 | .vmode = FB_VMODE_NONINTERLACED, | ||
40 | |||
41 | .bits_per_pixel = 32, | ||
42 | .transp.offset = 24, | ||
43 | .red.offset = 16, | ||
44 | .green.offset = 8, | ||
45 | .blue.offset = 0, | ||
46 | .transp.length = 8, | ||
47 | .red.length = 8, | ||
48 | .green.length = 8, | ||
49 | .blue.length = 8, | ||
50 | }; | ||
51 | |||
52 | #define PSEUDO_PALETTE_SIZE 16 | ||
53 | |||
54 | struct mdpy_fb_par { | ||
55 | u32 palette[PSEUDO_PALETTE_SIZE]; | ||
56 | }; | ||
57 | |||
58 | static int mdpy_fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, | ||
59 | u_int transp, struct fb_info *info) | ||
60 | { | ||
61 | u32 *pal = info->pseudo_palette; | ||
62 | u32 cr = red >> (16 - info->var.red.length); | ||
63 | u32 cg = green >> (16 - info->var.green.length); | ||
64 | u32 cb = blue >> (16 - info->var.blue.length); | ||
65 | u32 value, mask; | ||
66 | |||
67 | if (regno >= PSEUDO_PALETTE_SIZE) | ||
68 | return -EINVAL; | ||
69 | |||
70 | value = (cr << info->var.red.offset) | | ||
71 | (cg << info->var.green.offset) | | ||
72 | (cb << info->var.blue.offset); | ||
73 | if (info->var.transp.length > 0) { | ||
74 | mask = (1 << info->var.transp.length) - 1; | ||
75 | mask <<= info->var.transp.offset; | ||
76 | value |= mask; | ||
77 | } | ||
78 | pal[regno] = value; | ||
79 | |||
80 | return 0; | ||
81 | } | ||
82 | |||
83 | static void mdpy_fb_destroy(struct fb_info *info) | ||
84 | { | ||
85 | if (info->screen_base) | ||
86 | iounmap(info->screen_base); | ||
87 | } | ||
88 | |||
89 | static const struct fb_ops mdpy_fb_ops = { | ||
90 | .owner = THIS_MODULE, | ||
91 | .fb_destroy = mdpy_fb_destroy, | ||
92 | .fb_setcolreg = mdpy_fb_setcolreg, | ||
93 | .fb_fillrect = cfb_fillrect, | ||
94 | .fb_copyarea = cfb_copyarea, | ||
95 | .fb_imageblit = cfb_imageblit, | ||
96 | }; | ||
97 | |||
98 | static int mdpy_fb_probe(struct pci_dev *pdev, | ||
99 | const struct pci_device_id *ent) | ||
100 | { | ||
101 | struct fb_info *info; | ||
102 | struct mdpy_fb_par *par; | ||
103 | u32 format, width, height; | ||
104 | int ret; | ||
105 | |||
106 | ret = pci_enable_device(pdev); | ||
107 | if (ret < 0) | ||
108 | return ret; | ||
109 | |||
110 | ret = pci_request_regions(pdev, "mdpy-fb"); | ||
111 | if (ret < 0) | ||
112 | goto err_disable_dev; | ||
113 | |||
114 | pci_read_config_dword(pdev, MDPY_FORMAT_OFFSET, &format); | ||
115 | pci_read_config_dword(pdev, MDPY_WIDTH_OFFSET, &width); | ||
116 | pci_read_config_dword(pdev, MDPY_HEIGHT_OFFSET, &height); | ||
117 | if (format != DRM_FORMAT_XRGB8888) { | ||
118 | pci_err(pdev, "format mismatch (0x%x != 0x%x)\n", | ||
119 | format, DRM_FORMAT_XRGB8888); | ||
120 | ret = -EINVAL; | ||
121 | goto err_release_regions; | ||
122 | } | ||
123 | if (width < 100 || width > 10000) { | ||
124 | pci_err(pdev, "width (%d) out of range\n", width); | ||
125 | ret = -EINVAL; | ||
126 | goto err_release_regions; | ||
127 | } | ||
128 | if (height < 100 || height > 10000) { | ||
129 | pci_err(pdev, "height (%d) out of range\n", height); | ||
130 | ret = -EINVAL; | ||
131 | goto err_release_regions; | ||
132 | } | ||
133 | pci_info(pdev, "mdpy found: %dx%d framebuffer\n", | ||
134 | width, height); | ||
135 | |||
136 | info = framebuffer_alloc(sizeof(struct mdpy_fb_par), &pdev->dev); | ||
137 | if (!info) { | ||
138 | ret = -ENOMEM; | ||
139 | goto err_release_regions; | ||
140 | } | ||
141 | pci_set_drvdata(pdev, info); | ||
142 | par = info->par; | ||
143 | |||
144 | info->fix = mdpy_fb_fix; | ||
145 | info->fix.smem_start = pci_resource_start(pdev, 0); | ||
146 | info->fix.smem_len = pci_resource_len(pdev, 0); | ||
147 | info->fix.line_length = width * 4; | ||
148 | |||
149 | info->var = mdpy_fb_var; | ||
150 | info->var.xres = width; | ||
151 | info->var.yres = height; | ||
152 | info->var.xres_virtual = width; | ||
153 | info->var.yres_virtual = height; | ||
154 | |||
155 | info->screen_size = info->fix.smem_len; | ||
156 | info->screen_base = ioremap(info->fix.smem_start, | ||
157 | info->screen_size); | ||
158 | if (!info->screen_base) { | ||
159 | pci_err(pdev, "ioremap(pcibar) failed\n"); | ||
160 | ret = -EIO; | ||
161 | goto err_release_fb; | ||
162 | } | ||
163 | |||
164 | info->apertures = alloc_apertures(1); | ||
165 | if (!info->apertures) { | ||
166 | ret = -ENOMEM; | ||
167 | goto err_unmap; | ||
168 | } | ||
169 | info->apertures->ranges[0].base = info->fix.smem_start; | ||
170 | info->apertures->ranges[0].size = info->fix.smem_len; | ||
171 | |||
172 | info->fbops = &mdpy_fb_ops; | ||
173 | info->flags = FBINFO_DEFAULT; | ||
174 | info->pseudo_palette = par->palette; | ||
175 | |||
176 | ret = register_framebuffer(info); | ||
177 | if (ret < 0) { | ||
178 | pci_err(pdev, "mdpy-fb device register failed: %d\n", ret); | ||
179 | goto err_unmap; | ||
180 | } | ||
181 | |||
182 | pci_info(pdev, "fb%d registered\n", info->node); | ||
183 | return 0; | ||
184 | |||
185 | err_unmap: | ||
186 | iounmap(info->screen_base); | ||
187 | |||
188 | err_release_fb: | ||
189 | framebuffer_release(info); | ||
190 | |||
191 | err_release_regions: | ||
192 | pci_release_regions(pdev); | ||
193 | |||
194 | err_disable_dev: | ||
195 | pci_disable_device(pdev); | ||
196 | |||
197 | return ret; | ||
198 | } | ||
199 | |||
200 | static void mdpy_fb_remove(struct pci_dev *pdev) | ||
201 | { | ||
202 | struct fb_info *info = pci_get_drvdata(pdev); | ||
203 | |||
204 | unregister_framebuffer(info); | ||
205 | iounmap(info->screen_base); | ||
206 | framebuffer_release(info); | ||
207 | pci_release_regions(pdev); | ||
208 | pci_disable_device(pdev); | ||
209 | } | ||
210 | |||
211 | static struct pci_device_id mdpy_fb_pci_table[] = { | ||
212 | { | ||
213 | .vendor = MDPY_PCI_VENDOR_ID, | ||
214 | .device = MDPY_PCI_DEVICE_ID, | ||
215 | .subvendor = MDPY_PCI_SUBVENDOR_ID, | ||
216 | .subdevice = MDPY_PCI_SUBDEVICE_ID, | ||
217 | }, { | ||
218 | /* end of list */ | ||
219 | } | ||
220 | }; | ||
221 | |||
222 | static struct pci_driver mdpy_fb_pci_driver = { | ||
223 | .name = "mdpy-fb", | ||
224 | .id_table = mdpy_fb_pci_table, | ||
225 | .probe = mdpy_fb_probe, | ||
226 | .remove = mdpy_fb_remove, | ||
227 | }; | ||
228 | |||
229 | static int __init mdpy_fb_init(void) | ||
230 | { | ||
231 | int ret; | ||
232 | |||
233 | ret = pci_register_driver(&mdpy_fb_pci_driver); | ||
234 | if (ret) | ||
235 | return ret; | ||
236 | |||
237 | return 0; | ||
238 | } | ||
239 | |||
240 | module_init(mdpy_fb_init); | ||
241 | |||
242 | MODULE_DEVICE_TABLE(pci, mdpy_fb_pci_table); | ||
243 | MODULE_LICENSE("GPL v2"); | ||
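Once mdpy-fb has registered the framebuffer inside the guest, the display can be exercised with plain fbdev calls. A minimal sketch, assuming the device came up as /dev/fb0 in the guest:

#include <fcntl.h>
#include <linux/fb.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	struct fb_var_screeninfo var;
	struct fb_fix_screeninfo fix;
	uint32_t *pixels;
	uint32_t x;
	int fd = open("/dev/fb0", O_RDWR);

	if (fd < 0)
		return 1;
	if (ioctl(fd, FBIOGET_VSCREENINFO, &var) < 0 ||
	    ioctl(fd, FBIOGET_FSCREENINFO, &fix) < 0)
		return 1;

	pixels = mmap(NULL, fix.smem_len, PROT_READ | PROT_WRITE,
		      MAP_SHARED, fd, 0);
	if (pixels == MAP_FAILED)
		return 1;

	/* XRGB8888: paint the first scanline white */
	for (x = 0; x < var.xres; x++)
		pixels[x] = 0x00ffffff;

	munmap(pixels, fix.smem_len);
	close(fd);
	return 0;
}

Pixels written this way land in BAR 0, i.e. in the vmalloc_user() buffer that the mdpy host driver (next file) exposes as its display region.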
diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c new file mode 100644 index 000000000..9894693f3 --- /dev/null +++ b/samples/vfio-mdev/mdpy.c | |||
@@ -0,0 +1,807 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Mediated virtual PCI display host device driver | ||
4 | * | ||
5 | * See mdpy-defs.h for device specs | ||
6 | * | ||
7 | * (c) Gerd Hoffmann <kraxel@redhat.com> | ||
8 | * | ||
9 | * based on mtty driver which is: | ||
10 | * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. | ||
11 | * Author: Neo Jia <cjia@nvidia.com> | ||
12 | * Kirti Wankhede <kwankhede@nvidia.com> | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or modify | ||
15 | * it under the terms of the GNU General Public License version 2 as | ||
16 | * published by the Free Software Foundation. | ||
17 | */ | ||
18 | #include <linux/init.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/device.h> | ||
21 | #include <linux/kernel.h> | ||
22 | #include <linux/slab.h> | ||
23 | #include <linux/vmalloc.h> | ||
24 | #include <linux/cdev.h> | ||
25 | #include <linux/vfio.h> | ||
26 | #include <linux/iommu.h> | ||
27 | #include <linux/sysfs.h> | ||
28 | #include <linux/mdev.h> | ||
29 | #include <linux/pci.h> | ||
30 | #include <drm/drm_fourcc.h> | ||
31 | #include "mdpy-defs.h" | ||
32 | |||
33 | #define MDPY_NAME "mdpy" | ||
34 | #define MDPY_CLASS_NAME "mdpy" | ||
35 | |||
36 | #define MDPY_CONFIG_SPACE_SIZE 0xff | ||
37 | #define MDPY_MEMORY_BAR_OFFSET PAGE_SIZE | ||
38 | #define MDPY_DISPLAY_REGION 16 | ||
39 | |||
40 | #define STORE_LE16(addr, val) (*(u16 *)addr = val) | ||
41 | #define STORE_LE32(addr, val) (*(u32 *)addr = val) | ||
42 | |||
43 | |||
44 | MODULE_LICENSE("GPL v2"); | ||
45 | |||
46 | static int max_devices = 4; | ||
47 | module_param_named(count, max_devices, int, 0444); | ||
48 | MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices"); | ||
49 | |||
50 | |||
51 | #define MDPY_TYPE_1 "vga" | ||
52 | #define MDPY_TYPE_2 "xga" | ||
53 | #define MDPY_TYPE_3 "hd" | ||
54 | |||
55 | static const struct mdpy_type { | ||
56 | const char *name; | ||
57 | u32 format; | ||
58 | u32 bytepp; | ||
59 | u32 width; | ||
60 | u32 height; | ||
61 | } mdpy_types[] = { | ||
62 | { | ||
63 | .name = MDPY_CLASS_NAME "-" MDPY_TYPE_1, | ||
64 | .format = DRM_FORMAT_XRGB8888, | ||
65 | .bytepp = 4, | ||
66 | .width = 640, | ||
67 | .height = 480, | ||
68 | }, { | ||
69 | .name = MDPY_CLASS_NAME "-" MDPY_TYPE_2, | ||
70 | .format = DRM_FORMAT_XRGB8888, | ||
71 | .bytepp = 4, | ||
72 | .width = 1024, | ||
73 | .height = 768, | ||
74 | }, { | ||
75 | .name = MDPY_CLASS_NAME "-" MDPY_TYPE_3, | ||
76 | .format = DRM_FORMAT_XRGB8888, | ||
77 | .bytepp = 4, | ||
78 | .width = 1920, | ||
79 | .height = 1080, | ||
80 | }, | ||
81 | }; | ||
82 | |||
83 | static dev_t mdpy_devt; | ||
84 | static struct class *mdpy_class; | ||
85 | static struct cdev mdpy_cdev; | ||
86 | static struct device mdpy_dev; | ||
87 | static u32 mdpy_count; | ||
88 | |||
89 | /* State of each mdev device */ | ||
90 | struct mdev_state { | ||
91 | u8 *vconfig; | ||
92 | u32 bar_mask; | ||
93 | struct mutex ops_lock; | ||
94 | struct mdev_device *mdev; | ||
95 | struct vfio_device_info dev_info; | ||
96 | |||
97 | const struct mdpy_type *type; | ||
98 | u32 memsize; | ||
99 | void *memblk; | ||
100 | }; | ||
101 | |||
102 | static const struct mdpy_type *mdpy_find_type(struct kobject *kobj) | ||
103 | { | ||
104 | int i; | ||
105 | |||
106 | for (i = 0; i < ARRAY_SIZE(mdpy_types); i++) | ||
107 | if (strcmp(mdpy_types[i].name, kobj->name) == 0) | ||
108 | return mdpy_types + i; | ||
109 | return NULL; | ||
110 | } | ||
111 | |||
112 | static void mdpy_create_config_space(struct mdev_state *mdev_state) | ||
113 | { | ||
114 | STORE_LE16((u16 *) &mdev_state->vconfig[PCI_VENDOR_ID], | ||
115 | MDPY_PCI_VENDOR_ID); | ||
116 | STORE_LE16((u16 *) &mdev_state->vconfig[PCI_DEVICE_ID], | ||
117 | MDPY_PCI_DEVICE_ID); | ||
118 | STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_VENDOR_ID], | ||
119 | MDPY_PCI_SUBVENDOR_ID); | ||
120 | STORE_LE16((u16 *) &mdev_state->vconfig[PCI_SUBSYSTEM_ID], | ||
121 | MDPY_PCI_SUBDEVICE_ID); | ||
122 | |||
123 | STORE_LE16((u16 *) &mdev_state->vconfig[PCI_COMMAND], | ||
124 | PCI_COMMAND_IO | PCI_COMMAND_MEMORY); | ||
125 | STORE_LE16((u16 *) &mdev_state->vconfig[PCI_STATUS], | ||
126 | PCI_STATUS_CAP_LIST); | ||
127 | STORE_LE16((u16 *) &mdev_state->vconfig[PCI_CLASS_DEVICE], | ||
128 | PCI_CLASS_DISPLAY_OTHER); | ||
129 | mdev_state->vconfig[PCI_CLASS_REVISION] = 0x01; | ||
130 | |||
131 | STORE_LE32((u32 *) &mdev_state->vconfig[PCI_BASE_ADDRESS_0], | ||
132 | PCI_BASE_ADDRESS_SPACE_MEMORY | | ||
133 | PCI_BASE_ADDRESS_MEM_TYPE_32 | | ||
134 | PCI_BASE_ADDRESS_MEM_PREFETCH); | ||
135 | mdev_state->bar_mask = ~(mdev_state->memsize) + 1; | ||
136 | |||
137 | /* vendor specific capability for the config registers */ | ||
138 | mdev_state->vconfig[PCI_CAPABILITY_LIST] = MDPY_VENDORCAP_OFFSET; | ||
139 | mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 0] = 0x09; /* vendor cap */ | ||
140 | mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 1] = 0x00; /* next ptr */ | ||
141 | mdev_state->vconfig[MDPY_VENDORCAP_OFFSET + 2] = MDPY_VENDORCAP_SIZE; | ||
142 | STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_FORMAT_OFFSET], | ||
143 | mdev_state->type->format); | ||
144 | STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_WIDTH_OFFSET], | ||
145 | mdev_state->type->width); | ||
146 | STORE_LE32((u32 *) &mdev_state->vconfig[MDPY_HEIGHT_OFFSET], | ||
147 | mdev_state->type->height); | ||
148 | } | ||
149 | |||
150 | static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset, | ||
151 | char *buf, u32 count) | ||
152 | { | ||
153 | struct device *dev = mdev_dev(mdev_state->mdev); | ||
154 | u32 cfg_addr; | ||
155 | |||
156 | switch (offset) { | ||
157 | case PCI_BASE_ADDRESS_0: | ||
158 | cfg_addr = *(u32 *)buf; | ||
159 | |||
160 | if (cfg_addr == 0xffffffff) { | ||
161 | cfg_addr = (cfg_addr & mdev_state->bar_mask); | ||
162 | } else { | ||
163 | cfg_addr &= PCI_BASE_ADDRESS_MEM_MASK; | ||
164 | if (cfg_addr) | ||
165 | dev_info(dev, "BAR0 @ 0x%x\n", cfg_addr); | ||
166 | } | ||
167 | |||
168 | cfg_addr |= (mdev_state->vconfig[offset] & | ||
169 | ~PCI_BASE_ADDRESS_MEM_MASK); | ||
170 | STORE_LE32(&mdev_state->vconfig[offset], cfg_addr); | ||
171 | break; | ||
172 | } | ||
173 | } | ||
174 | |||
175 | static ssize_t mdev_access(struct mdev_device *mdev, char *buf, size_t count, | ||
176 | loff_t pos, bool is_write) | ||
177 | { | ||
178 | struct mdev_state *mdev_state = mdev_get_drvdata(mdev); | ||
179 | struct device *dev = mdev_dev(mdev); | ||
180 | int ret = 0; | ||
181 | |||
182 | mutex_lock(&mdev_state->ops_lock); | ||
183 | |||
184 | if (pos < MDPY_CONFIG_SPACE_SIZE) { | ||
185 | if (is_write) | ||
186 | handle_pci_cfg_write(mdev_state, pos, buf, count); | ||
187 | else | ||
188 | memcpy(buf, (mdev_state->vconfig + pos), count); | ||
189 | |||
190 | } else if ((pos >= MDPY_MEMORY_BAR_OFFSET) && | ||
191 | (pos + count <= | ||
192 | MDPY_MEMORY_BAR_OFFSET + mdev_state->memsize)) { | ||
193 | pos -= MDPY_MEMORY_BAR_OFFSET; | ||
194 | if (is_write) | ||
195 | memcpy(mdev_state->memblk + pos, buf, count); | ||
196 | else | ||
197 | memcpy(buf, mdev_state->memblk + pos, count); | ||
198 | |||
199 | } else { | ||
200 | dev_info(dev, "%s: %s @0x%llx (unhandled)\n", | ||
201 | __func__, is_write ? "WR" : "RD", pos); | ||
202 | ret = -1; | ||
203 | goto accessfailed; | ||
204 | } | ||
205 | |||
206 | ret = count; | ||
207 | |||
208 | |||
209 | accessfailed: | ||
210 | mutex_unlock(&mdev_state->ops_lock); | ||
211 | |||
212 | return ret; | ||
213 | } | ||
214 | |||
215 | static int mdpy_reset(struct mdev_device *mdev) | ||
216 | { | ||
217 | struct mdev_state *mdev_state = mdev_get_drvdata(mdev); | ||
218 | u32 stride, i; | ||
219 | |||
220 | /* initialize with gray gradient */ | ||
221 | stride = mdev_state->type->width * mdev_state->type->bytepp; | ||
222 | for (i = 0; i < mdev_state->type->height; i++) | ||
223 | memset(mdev_state->memblk + i * stride, | ||
224 | i * 255 / mdev_state->type->height, | ||
225 | stride); | ||
226 | return 0; | ||
227 | } | ||
228 | |||
229 | static int mdpy_create(struct kobject *kobj, struct mdev_device *mdev) | ||
230 | { | ||
231 | const struct mdpy_type *type = mdpy_find_type(kobj); | ||
232 | struct device *dev = mdev_dev(mdev); | ||
233 | struct mdev_state *mdev_state; | ||
234 | u32 fbsize; | ||
235 | |||
236 | if (mdpy_count >= max_devices) | ||
237 | return -ENOMEM; | ||
238 | |||
239 | mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); | ||
240 | if (mdev_state == NULL) | ||
241 | return -ENOMEM; | ||
242 | |||
243 | mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL); | ||
244 | if (mdev_state->vconfig == NULL) { | ||
245 | kfree(mdev_state); | ||
246 | return -ENOMEM; | ||
247 | } | ||
248 | |||
249 | if (!type) | ||
250 | type = &mdpy_types[0]; | ||
251 | fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp); | ||
252 | |||
253 | mdev_state->memblk = vmalloc_user(fbsize); | ||
254 | if (!mdev_state->memblk) { | ||
255 | kfree(mdev_state->vconfig); | ||
256 | kfree(mdev_state); | ||
257 | return -ENOMEM; | ||
258 | } | ||
259 | dev_info(dev, "%s: %s (%dx%d)\n", | ||
260 | __func__, kobj->name, type->width, type->height); | ||
261 | |||
262 | mutex_init(&mdev_state->ops_lock); | ||
263 | mdev_state->mdev = mdev; | ||
264 | mdev_set_drvdata(mdev, mdev_state); | ||
265 | |||
266 | mdev_state->type = type; | ||
267 | mdev_state->memsize = fbsize; | ||
268 | mdpy_create_config_space(mdev_state); | ||
269 | mdpy_reset(mdev); | ||
270 | |||
271 | mdpy_count++; | ||
272 | return 0; | ||
273 | } | ||
274 | |||
275 | static int mdpy_remove(struct mdev_device *mdev) | ||
276 | { | ||
277 | struct mdev_state *mdev_state = mdev_get_drvdata(mdev); | ||
278 | struct device *dev = mdev_dev(mdev); | ||
279 | |||
280 | dev_info(dev, "%s\n", __func__); | ||
281 | |||
282 | mdev_set_drvdata(mdev, NULL); | ||
283 | vfree(mdev_state->memblk); | ||
284 | kfree(mdev_state->vconfig); | ||
285 | kfree(mdev_state); | ||
286 | |||
287 | mdpy_count--; | ||
288 | return 0; | ||
289 | } | ||
290 | |||
291 | static ssize_t mdpy_read(struct mdev_device *mdev, char __user *buf, | ||
292 | size_t count, loff_t *ppos) | ||
293 | { | ||
294 | unsigned int done = 0; | ||
295 | int ret; | ||
296 | |||
297 | while (count) { | ||
298 | size_t filled; | ||
299 | |||
300 | if (count >= 4 && !(*ppos % 4)) { | ||
301 | u32 val; | ||
302 | |||
303 | ret = mdev_access(mdev, (char *)&val, sizeof(val), | ||
304 | *ppos, false); | ||
305 | if (ret <= 0) | ||
306 | goto read_err; | ||
307 | |||
308 | if (copy_to_user(buf, &val, sizeof(val))) | ||
309 | goto read_err; | ||
310 | |||
311 | filled = 4; | ||
312 | } else if (count >= 2 && !(*ppos % 2)) { | ||
313 | u16 val; | ||
314 | |||
315 | ret = mdev_access(mdev, (char *)&val, sizeof(val), | ||
316 | *ppos, false); | ||
317 | if (ret <= 0) | ||
318 | goto read_err; | ||
319 | |||
320 | if (copy_to_user(buf, &val, sizeof(val))) | ||
321 | goto read_err; | ||
322 | |||
323 | filled = 2; | ||
324 | } else { | ||
325 | u8 val; | ||
326 | |||
327 | ret = mdev_access(mdev, (char *)&val, sizeof(val), | ||
328 | *ppos, false); | ||
329 | if (ret <= 0) | ||
330 | goto read_err; | ||
331 | |||
332 | if (copy_to_user(buf, &val, sizeof(val))) | ||
333 | goto read_err; | ||
334 | |||
335 | filled = 1; | ||
336 | } | ||
337 | |||
338 | count -= filled; | ||
339 | done += filled; | ||
340 | *ppos += filled; | ||
341 | buf += filled; | ||
342 | } | ||
343 | |||
344 | return done; | ||
345 | |||
346 | read_err: | ||
347 | return -EFAULT; | ||
348 | } | ||
349 | |||
350 | static ssize_t mdpy_write(struct mdev_device *mdev, const char __user *buf, | ||
351 | size_t count, loff_t *ppos) | ||
352 | { | ||
353 | unsigned int done = 0; | ||
354 | int ret; | ||
355 | |||
356 | while (count) { | ||
357 | size_t filled; | ||
358 | |||
359 | if (count >= 4 && !(*ppos % 4)) { | ||
360 | u32 val; | ||
361 | |||
362 | if (copy_from_user(&val, buf, sizeof(val))) | ||
363 | goto write_err; | ||
364 | |||
365 | ret = mdev_access(mdev, (char *)&val, sizeof(val), | ||
366 | *ppos, true); | ||
367 | if (ret <= 0) | ||
368 | goto write_err; | ||
369 | |||
370 | filled = 4; | ||
371 | } else if (count >= 2 && !(*ppos % 2)) { | ||
372 | u16 val; | ||
373 | |||
374 | if (copy_from_user(&val, buf, sizeof(val))) | ||
375 | goto write_err; | ||
376 | |||
377 | ret = mdev_access(mdev, (char *)&val, sizeof(val), | ||
378 | *ppos, true); | ||
379 | if (ret <= 0) | ||
380 | goto write_err; | ||
381 | |||
382 | filled = 2; | ||
383 | } else { | ||
384 | u8 val; | ||
385 | |||
386 | if (copy_from_user(&val, buf, sizeof(val))) | ||
387 | goto write_err; | ||
388 | |||
389 | ret = mdev_access(mdev, (char *)&val, sizeof(val), | ||
390 | *ppos, true); | ||
391 | if (ret <= 0) | ||
392 | goto write_err; | ||
393 | |||
394 | filled = 1; | ||
395 | } | ||
396 | count -= filled; | ||
397 | done += filled; | ||
398 | *ppos += filled; | ||
399 | buf += filled; | ||
400 | } | ||
401 | |||
402 | return done; | ||
403 | write_err: | ||
404 | return -EFAULT; | ||
405 | } | ||
406 | |||
407 | static int mdpy_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) | ||
408 | { | ||
409 | struct mdev_state *mdev_state = mdev_get_drvdata(mdev); | ||
410 | |||
411 | if (vma->vm_pgoff != MDPY_MEMORY_BAR_OFFSET >> PAGE_SHIFT) | ||
412 | return -EINVAL; | ||
413 | if (vma->vm_end < vma->vm_start) | ||
414 | return -EINVAL; | ||
415 | if (vma->vm_end - vma->vm_start > mdev_state->memsize) | ||
416 | return -EINVAL; | ||
417 | if ((vma->vm_flags & VM_SHARED) == 0) | ||
418 | return -EINVAL; | ||
419 | |||
420 | return remap_vmalloc_range_partial(vma, vma->vm_start, | ||
421 | mdev_state->memblk, 0, | ||
422 | vma->vm_end - vma->vm_start); | ||
423 | } | ||
424 | |||
425 | static int mdpy_get_region_info(struct mdev_device *mdev, | ||
426 | struct vfio_region_info *region_info, | ||
427 | u16 *cap_type_id, void **cap_type) | ||
428 | { | ||
429 | struct mdev_state *mdev_state; | ||
430 | |||
431 | mdev_state = mdev_get_drvdata(mdev); | ||
432 | if (!mdev_state) | ||
433 | return -EINVAL; | ||
434 | |||
435 | if (region_info->index >= VFIO_PCI_NUM_REGIONS && | ||
436 | region_info->index != MDPY_DISPLAY_REGION) | ||
437 | return -EINVAL; | ||
438 | |||
439 | switch (region_info->index) { | ||
440 | case VFIO_PCI_CONFIG_REGION_INDEX: | ||
441 | region_info->offset = 0; | ||
442 | region_info->size = MDPY_CONFIG_SPACE_SIZE; | ||
443 | region_info->flags = (VFIO_REGION_INFO_FLAG_READ | | ||
444 | VFIO_REGION_INFO_FLAG_WRITE); | ||
445 | break; | ||
446 | case VFIO_PCI_BAR0_REGION_INDEX: | ||
447 | case MDPY_DISPLAY_REGION: | ||
448 | region_info->offset = MDPY_MEMORY_BAR_OFFSET; | ||
449 | region_info->size = mdev_state->memsize; | ||
450 | region_info->flags = (VFIO_REGION_INFO_FLAG_READ | | ||
451 | VFIO_REGION_INFO_FLAG_WRITE | | ||
452 | VFIO_REGION_INFO_FLAG_MMAP); | ||
453 | break; | ||
454 | default: | ||
455 | region_info->size = 0; | ||
456 | region_info->offset = 0; | ||
457 | region_info->flags = 0; | ||
458 | } | ||
459 | |||
460 | return 0; | ||
461 | } | ||
462 | |||
463 | static int mdpy_get_irq_info(struct mdev_device *mdev, | ||
464 | struct vfio_irq_info *irq_info) | ||
465 | { | ||
466 | irq_info->count = 0; | ||
467 | return 0; | ||
468 | } | ||
469 | |||
470 | static int mdpy_get_device_info(struct mdev_device *mdev, | ||
471 | struct vfio_device_info *dev_info) | ||
472 | { | ||
473 | dev_info->flags = VFIO_DEVICE_FLAGS_PCI; | ||
474 | dev_info->num_regions = VFIO_PCI_NUM_REGIONS; | ||
475 | dev_info->num_irqs = VFIO_PCI_NUM_IRQS; | ||
476 | return 0; | ||
477 | } | ||
478 | |||
479 | static int mdpy_query_gfx_plane(struct mdev_device *mdev, | ||
480 | struct vfio_device_gfx_plane_info *plane) | ||
481 | { | ||
482 | struct mdev_state *mdev_state = mdev_get_drvdata(mdev); | ||
483 | |||
484 | if (plane->flags & VFIO_GFX_PLANE_TYPE_PROBE) { | ||
485 | if (plane->flags == (VFIO_GFX_PLANE_TYPE_PROBE | | ||
486 | VFIO_GFX_PLANE_TYPE_REGION)) | ||
487 | return 0; | ||
488 | return -EINVAL; | ||
489 | } | ||
490 | |||
491 | if (plane->flags != VFIO_GFX_PLANE_TYPE_REGION) | ||
492 | return -EINVAL; | ||
493 | |||
494 | plane->drm_format = mdev_state->type->format; | ||
495 | plane->width = mdev_state->type->width; | ||
496 | plane->height = mdev_state->type->height; | ||
497 | plane->stride = (mdev_state->type->width * | ||
498 | mdev_state->type->bytepp); | ||
499 | plane->size = mdev_state->memsize; | ||
500 | plane->region_index = MDPY_DISPLAY_REGION; | ||
501 | |||
502 | /* unused */ | ||
503 | plane->drm_format_mod = 0; | ||
504 | plane->x_pos = 0; | ||
505 | plane->y_pos = 0; | ||
506 | plane->x_hot = 0; | ||
507 | plane->y_hot = 0; | ||
508 | |||
509 | return 0; | ||
510 | } | ||
511 | |||
512 | static long mdpy_ioctl(struct mdev_device *mdev, unsigned int cmd, | ||
513 | unsigned long arg) | ||
514 | { | ||
515 | int ret = 0; | ||
516 | unsigned long minsz; | ||
517 | struct mdev_state *mdev_state; | ||
518 | |||
519 | mdev_state = mdev_get_drvdata(mdev); | ||
520 | |||
521 | switch (cmd) { | ||
522 | case VFIO_DEVICE_GET_INFO: | ||
523 | { | ||
524 | struct vfio_device_info info; | ||
525 | |||
526 | minsz = offsetofend(struct vfio_device_info, num_irqs); | ||
527 | |||
528 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
529 | return -EFAULT; | ||
530 | |||
531 | if (info.argsz < minsz) | ||
532 | return -EINVAL; | ||
533 | |||
534 | ret = mdpy_get_device_info(mdev, &info); | ||
535 | if (ret) | ||
536 | return ret; | ||
537 | |||
538 | memcpy(&mdev_state->dev_info, &info, sizeof(info)); | ||
539 | |||
540 | if (copy_to_user((void __user *)arg, &info, minsz)) | ||
541 | return -EFAULT; | ||
542 | |||
543 | return 0; | ||
544 | } | ||
545 | case VFIO_DEVICE_GET_REGION_INFO: | ||
546 | { | ||
547 | struct vfio_region_info info; | ||
548 | u16 cap_type_id = 0; | ||
549 | void *cap_type = NULL; | ||
550 | |||
551 | minsz = offsetofend(struct vfio_region_info, offset); | ||
552 | |||
553 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
554 | return -EFAULT; | ||
555 | |||
556 | if (info.argsz < minsz) | ||
557 | return -EINVAL; | ||
558 | |||
559 | ret = mdpy_get_region_info(mdev, &info, &cap_type_id, | ||
560 | &cap_type); | ||
561 | if (ret) | ||
562 | return ret; | ||
563 | |||
564 | if (copy_to_user((void __user *)arg, &info, minsz)) | ||
565 | return -EFAULT; | ||
566 | |||
567 | return 0; | ||
568 | } | ||
569 | |||
570 | case VFIO_DEVICE_GET_IRQ_INFO: | ||
571 | { | ||
572 | struct vfio_irq_info info; | ||
573 | |||
574 | minsz = offsetofend(struct vfio_irq_info, count); | ||
575 | |||
576 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
577 | return -EFAULT; | ||
578 | |||
579 | if ((info.argsz < minsz) || | ||
580 | (info.index >= mdev_state->dev_info.num_irqs)) | ||
581 | return -EINVAL; | ||
582 | |||
583 | ret = mdpy_get_irq_info(mdev, &info); | ||
584 | if (ret) | ||
585 | return ret; | ||
586 | |||
587 | if (copy_to_user((void __user *)arg, &info, minsz)) | ||
588 | return -EFAULT; | ||
589 | |||
590 | return 0; | ||
591 | } | ||
592 | |||
593 | case VFIO_DEVICE_QUERY_GFX_PLANE: | ||
594 | { | ||
595 | struct vfio_device_gfx_plane_info plane; | ||
596 | |||
597 | minsz = offsetofend(struct vfio_device_gfx_plane_info, | ||
598 | region_index); | ||
599 | |||
600 | if (copy_from_user(&plane, (void __user *)arg, minsz)) | ||
601 | return -EFAULT; | ||
602 | |||
603 | if (plane.argsz < minsz) | ||
604 | return -EINVAL; | ||
605 | |||
606 | ret = mdpy_query_gfx_plane(mdev, &plane); | ||
607 | if (ret) | ||
608 | return ret; | ||
609 | |||
610 | if (copy_to_user((void __user *)arg, &plane, minsz)) | ||
611 | return -EFAULT; | ||
612 | |||
613 | return 0; | ||
614 | } | ||
615 | |||
616 | case VFIO_DEVICE_SET_IRQS: | ||
617 | return -EINVAL; | ||
618 | |||
619 | case VFIO_DEVICE_RESET: | ||
620 | return mdpy_reset(mdev); | ||
621 | } | ||
622 | return -ENOTTY; | ||
623 | } | ||
624 | |||
625 | static int mdpy_open(struct mdev_device *mdev) | ||
626 | { | ||
627 | if (!try_module_get(THIS_MODULE)) | ||
628 | return -ENODEV; | ||
629 | |||
630 | return 0; | ||
631 | } | ||
632 | |||
633 | static void mdpy_close(struct mdev_device *mdev) | ||
634 | { | ||
635 | module_put(THIS_MODULE); | ||
636 | } | ||
637 | |||
638 | static ssize_t | ||
639 | resolution_show(struct device *dev, struct device_attribute *attr, | ||
640 | char *buf) | ||
641 | { | ||
642 | struct mdev_device *mdev = mdev_from_dev(dev); | ||
643 | struct mdev_state *mdev_state = mdev_get_drvdata(mdev); | ||
644 | |||
645 | return sprintf(buf, "%dx%d\n", | ||
646 | mdev_state->type->width, | ||
647 | mdev_state->type->height); | ||
648 | } | ||
649 | static DEVICE_ATTR_RO(resolution); | ||
650 | |||
651 | static struct attribute *mdev_dev_attrs[] = { | ||
652 | &dev_attr_resolution.attr, | ||
653 | NULL, | ||
654 | }; | ||
655 | |||
656 | static const struct attribute_group mdev_dev_group = { | ||
657 | .name = "vendor", | ||
658 | .attrs = mdev_dev_attrs, | ||
659 | }; | ||
660 | |||
661 | const struct attribute_group *mdev_dev_groups[] = { | ||
662 | &mdev_dev_group, | ||
663 | NULL, | ||
664 | }; | ||
665 | |||
666 | static ssize_t | ||
667 | name_show(struct kobject *kobj, struct device *dev, char *buf) | ||
668 | { | ||
669 | return sprintf(buf, "%s\n", kobj->name); | ||
670 | } | ||
671 | MDEV_TYPE_ATTR_RO(name); | ||
672 | |||
673 | static ssize_t | ||
674 | description_show(struct kobject *kobj, struct device *dev, char *buf) | ||
675 | { | ||
676 | const struct mdpy_type *type = mdpy_find_type(kobj); | ||
677 | |||
678 | return sprintf(buf, "virtual display, %dx%d framebuffer\n", | ||
679 | type ? type->width : 0, | ||
680 | type ? type->height : 0); | ||
681 | } | ||
682 | MDEV_TYPE_ATTR_RO(description); | ||
683 | |||
684 | static ssize_t | ||
685 | available_instances_show(struct kobject *kobj, struct device *dev, char *buf) | ||
686 | { | ||
687 | return sprintf(buf, "%d\n", max_devices - mdpy_count); | ||
688 | } | ||
689 | MDEV_TYPE_ATTR_RO(available_instances); | ||
690 | |||
691 | static ssize_t device_api_show(struct kobject *kobj, struct device *dev, | ||
692 | char *buf) | ||
693 | { | ||
694 | return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); | ||
695 | } | ||
696 | MDEV_TYPE_ATTR_RO(device_api); | ||
697 | |||
698 | static struct attribute *mdev_types_attrs[] = { | ||
699 | &mdev_type_attr_name.attr, | ||
700 | &mdev_type_attr_description.attr, | ||
701 | &mdev_type_attr_device_api.attr, | ||
702 | &mdev_type_attr_available_instances.attr, | ||
703 | NULL, | ||
704 | }; | ||
705 | |||
706 | static struct attribute_group mdev_type_group1 = { | ||
707 | .name = MDPY_TYPE_1, | ||
708 | .attrs = mdev_types_attrs, | ||
709 | }; | ||
710 | |||
711 | static struct attribute_group mdev_type_group2 = { | ||
712 | .name = MDPY_TYPE_2, | ||
713 | .attrs = mdev_types_attrs, | ||
714 | }; | ||
715 | |||
716 | static struct attribute_group mdev_type_group3 = { | ||
717 | .name = MDPY_TYPE_3, | ||
718 | .attrs = mdev_types_attrs, | ||
719 | }; | ||
720 | |||
721 | static struct attribute_group *mdev_type_groups[] = { | ||
722 | &mdev_type_group1, | ||
723 | &mdev_type_group2, | ||
724 | &mdev_type_group3, | ||
725 | NULL, | ||
726 | }; | ||
727 | |||
728 | static const struct mdev_parent_ops mdev_fops = { | ||
729 | .owner = THIS_MODULE, | ||
730 | .mdev_attr_groups = mdev_dev_groups, | ||
731 | .supported_type_groups = mdev_type_groups, | ||
732 | .create = mdpy_create, | ||
733 | .remove = mdpy_remove, | ||
734 | .open = mdpy_open, | ||
735 | .release = mdpy_close, | ||
736 | .read = mdpy_read, | ||
737 | .write = mdpy_write, | ||
738 | .ioctl = mdpy_ioctl, | ||
739 | .mmap = mdpy_mmap, | ||
740 | }; | ||
741 | |||
742 | static const struct file_operations vd_fops = { | ||
743 | .owner = THIS_MODULE, | ||
744 | }; | ||
745 | |||
746 | static void mdpy_device_release(struct device *dev) | ||
747 | { | ||
748 | /* nothing */ | ||
749 | } | ||
750 | |||
751 | static int __init mdpy_dev_init(void) | ||
752 | { | ||
753 | int ret = 0; | ||
754 | |||
755 | ret = alloc_chrdev_region(&mdpy_devt, 0, MINORMASK + 1, MDPY_NAME); | ||
756 | if (ret < 0) { | ||
757 | pr_err("Error: failed to register mdpy_dev, err: %d\n", ret); | ||
758 | return ret; | ||
759 | } | ||
760 | cdev_init(&mdpy_cdev, &vd_fops); | ||
761 | cdev_add(&mdpy_cdev, mdpy_devt, MINORMASK + 1); | ||
762 | pr_info("%s: major %d\n", __func__, MAJOR(mdpy_devt)); | ||
763 | |||
764 | mdpy_class = class_create(THIS_MODULE, MDPY_CLASS_NAME); | ||
765 | if (IS_ERR(mdpy_class)) { | ||
766 | pr_err("Error: failed to register mdpy_dev class\n"); | ||
767 | ret = PTR_ERR(mdpy_class); | ||
768 | goto failed1; | ||
769 | } | ||
770 | mdpy_dev.class = mdpy_class; | ||
771 | mdpy_dev.release = mdpy_device_release; | ||
772 | dev_set_name(&mdpy_dev, "%s", MDPY_NAME); | ||
773 | |||
774 | ret = device_register(&mdpy_dev); | ||
775 | if (ret) | ||
776 | goto failed2; | ||
777 | |||
778 | ret = mdev_register_device(&mdpy_dev, &mdev_fops); | ||
779 | if (ret) | ||
780 | goto failed3; | ||
781 | |||
782 | return 0; | ||
783 | |||
784 | failed3: | ||
785 | device_unregister(&mdpy_dev); | ||
786 | failed2: | ||
787 | class_destroy(mdpy_class); | ||
788 | failed1: | ||
789 | cdev_del(&mdpy_cdev); | ||
790 | unregister_chrdev_region(mdpy_devt, MINORMASK + 1); | ||
791 | return ret; | ||
792 | } | ||
793 | |||
794 | static void __exit mdpy_dev_exit(void) | ||
795 | { | ||
796 | mdpy_dev.bus = NULL; | ||
797 | mdev_unregister_device(&mdpy_dev); | ||
798 | |||
799 | device_unregister(&mdpy_dev); | ||
800 | cdev_del(&mdpy_cdev); | ||
801 | unregister_chrdev_region(mdpy_devt, MINORMASK + 1); | ||
802 | class_destroy(mdpy_class); | ||
803 | mdpy_class = NULL; | ||
804 | } | ||
805 | |||
806 | module_init(mdpy_dev_init) | ||
807 | module_exit(mdpy_dev_exit) | ||
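As a rough illustration of the ioctl path implemented above, here is a minimal userspace sketch, assuming "device_fd" is a VFIO device file descriptor already obtained through the usual group/container setup; the function name query_mdpy_plane() and the fd variable are hypothetical, and the two-step probe-then-query flow simply mirrors the checks in mdpy_query_gfx_plane().

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hypothetical helper: probe and query the region-backed display plane. */
static int query_mdpy_plane(int device_fd)
{
	struct vfio_device_gfx_plane_info plane;

	memset(&plane, 0, sizeof(plane));
	plane.argsz = sizeof(plane);

	/* probe step: ask whether a region-type plane is supported */
	plane.flags = VFIO_GFX_PLANE_TYPE_PROBE | VFIO_GFX_PLANE_TYPE_REGION;
	if (ioctl(device_fd, VFIO_DEVICE_QUERY_GFX_PLANE, &plane))
		return -1;

	/* real query: the driver fills in format, geometry and region index */
	plane.flags = VFIO_GFX_PLANE_TYPE_REGION;
	if (ioctl(device_fd, VFIO_DEVICE_QUERY_GFX_PLANE, &plane))
		return -1;

	printf("plane: %ux%u, stride %u, region %u\n",
	       plane.width, plane.height, plane.stride, plane.region_index);
	return 0;
}

The returned region_index is MDPY_DISPLAY_REGION, so a client would follow up with VFIO_DEVICE_GET_REGION_INFO on that index and mmap() the reported offset to reach the framebuffer exposed by mdpy_mmap().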
diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c new file mode 100644 index 000000000..ce84a300a --- /dev/null +++ b/samples/vfio-mdev/mtty.c | |||
@@ -0,0 +1,1491 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Mediated virtual PCI serial host device driver | ||
4 | * | ||
5 | * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. | ||
6 | * Author: Neo Jia <cjia@nvidia.com> | ||
7 | * Kirti Wankhede <kwankhede@nvidia.com> | ||
8 | * | ||
9 | * Sample driver that creates an mdev device that simulates a serial port | ||
10 | * over a PCI card. | ||
11 | */ | ||
12 | |||
13 | #include <linux/init.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/device.h> | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/fs.h> | ||
18 | #include <linux/poll.h> | ||
19 | #include <linux/slab.h> | ||
20 | #include <linux/cdev.h> | ||
21 | #include <linux/sched.h> | ||
22 | #include <linux/wait.h> | ||
23 | #include <linux/uuid.h> | ||
24 | #include <linux/vfio.h> | ||
25 | #include <linux/iommu.h> | ||
26 | #include <linux/sysfs.h> | ||
27 | #include <linux/ctype.h> | ||
28 | #include <linux/file.h> | ||
29 | #include <linux/mdev.h> | ||
30 | #include <linux/pci.h> | ||
31 | #include <linux/serial.h> | ||
32 | #include <uapi/linux/serial_reg.h> | ||
33 | #include <linux/eventfd.h> | ||
34 | /* | ||
35 | * #defines | ||
36 | */ | ||
37 | |||
38 | #define VERSION_STRING "0.1" | ||
39 | #define DRIVER_AUTHOR "NVIDIA Corporation" | ||
40 | |||
41 | #define MTTY_CLASS_NAME "mtty" | ||
42 | |||
43 | #define MTTY_NAME "mtty" | ||
44 | |||
45 | #define MTTY_STRING_LEN 16 | ||
46 | |||
47 | #define MTTY_CONFIG_SPACE_SIZE 0xff | ||
48 | #define MTTY_IO_BAR_SIZE 0x8 | ||
49 | #define MTTY_MMIO_BAR_SIZE 0x100000 | ||
50 | |||
51 | #define STORE_LE16(addr, val) (*(u16 *)addr = val) | ||
52 | #define STORE_LE32(addr, val) (*(u32 *)addr = val) | ||
53 | |||
54 | #define MAX_FIFO_SIZE 16 | ||
55 | |||
56 | #define CIRCULAR_BUF_INC_IDX(idx) (idx = (idx + 1) & (MAX_FIFO_SIZE - 1)) | ||
57 | |||
58 | #define MTTY_VFIO_PCI_OFFSET_SHIFT 40 | ||
59 | |||
60 | #define MTTY_VFIO_PCI_OFFSET_TO_INDEX(off) (off >> MTTY_VFIO_PCI_OFFSET_SHIFT) | ||
61 | #define MTTY_VFIO_PCI_INDEX_TO_OFFSET(index) \ | ||
62 | ((u64)(index) << MTTY_VFIO_PCI_OFFSET_SHIFT) | ||
63 | #define MTTY_VFIO_PCI_OFFSET_MASK \ | ||
64 | (((u64)(1) << MTTY_VFIO_PCI_OFFSET_SHIFT) - 1) | ||
65 | #define MAX_MTTYS 24 | ||
66 | |||
67 | /* | ||
68 | * Global Structures | ||
69 | */ | ||
70 | |||
71 | static struct mtty_dev { | ||
72 | dev_t vd_devt; | ||
73 | struct class *vd_class; | ||
74 | struct cdev vd_cdev; | ||
75 | struct idr vd_idr; | ||
76 | struct device dev; | ||
77 | } mtty_dev; | ||
78 | |||
79 | struct mdev_region_info { | ||
80 | u64 start; | ||
81 | u64 phys_start; | ||
82 | u32 size; | ||
83 | u64 vfio_offset; | ||
84 | }; | ||
85 | |||
86 | #if defined(DEBUG_REGS) | ||
87 | static const char *wr_reg[] = { | ||
88 | "TX", | ||
89 | "IER", | ||
90 | "FCR", | ||
91 | "LCR", | ||
92 | "MCR", | ||
93 | "LSR", | ||
94 | "MSR", | ||
95 | "SCR" | ||
96 | }; | ||
97 | |||
98 | static const char *rd_reg[] = { | ||
99 | "RX", | ||
100 | "IER", | ||
101 | "IIR", | ||
102 | "LCR", | ||
103 | "MCR", | ||
104 | "LSR", | ||
105 | "MSR", | ||
106 | "SCR" | ||
107 | }; | ||
108 | #endif | ||
109 | |||
110 | /* loop back buffer */ | ||
111 | struct rxtx { | ||
112 | u8 fifo[MAX_FIFO_SIZE]; | ||
113 | u8 head, tail; | ||
114 | u8 count; | ||
115 | }; | ||
116 | |||
117 | struct serial_port { | ||
118 | u8 uart_reg[8]; /* 8 registers */ | ||
119 | struct rxtx rxtx; /* loop back buffer */ | ||
120 | bool dlab; | ||
121 | bool overrun; | ||
122 | u16 divisor; | ||
123 | u8 fcr; /* FIFO control register */ | ||
124 | u8 max_fifo_size; | ||
125 | u8 intr_trigger_level; /* interrupt trigger level */ | ||
126 | }; | ||
127 | |||
128 | /* State of each mdev device */ | ||
129 | struct mdev_state { | ||
130 | int irq_fd; | ||
131 | struct eventfd_ctx *intx_evtfd; | ||
132 | struct eventfd_ctx *msi_evtfd; | ||
133 | int irq_index; | ||
134 | u8 *vconfig; | ||
135 | struct mutex ops_lock; | ||
136 | struct mdev_device *mdev; | ||
137 | struct mdev_region_info region_info[VFIO_PCI_NUM_REGIONS]; | ||
138 | u32 bar_mask[VFIO_PCI_NUM_REGIONS]; | ||
139 | struct list_head next; | ||
140 | struct serial_port s[2]; | ||
141 | struct mutex rxtx_lock; | ||
142 | struct vfio_device_info dev_info; | ||
143 | int nr_ports; | ||
144 | }; | ||
145 | |||
146 | static struct mutex mdev_list_lock; | ||
147 | static struct list_head mdev_devices_list; | ||
148 | |||
149 | static const struct file_operations vd_fops = { | ||
150 | .owner = THIS_MODULE, | ||
151 | }; | ||
152 | |||
153 | /* function prototypes */ | ||
154 | |||
155 | static int mtty_trigger_interrupt(struct mdev_state *mdev_state); | ||
156 | |||
157 | /* Helper functions */ | ||
158 | |||
159 | static void dump_buffer(u8 *buf, uint32_t count) | ||
160 | { | ||
161 | #if defined(DEBUG) | ||
162 | int i; | ||
163 | |||
164 | pr_info("Buffer:\n"); | ||
165 | for (i = 0; i < count; i++) { | ||
166 | pr_info("%2x ", *(buf + i)); | ||
167 | if ((i + 1) % 16 == 0) | ||
168 | pr_info("\n"); | ||
169 | } | ||
170 | #endif | ||
171 | } | ||
172 | |||
173 | static void mtty_create_config_space(struct mdev_state *mdev_state) | ||
174 | { | ||
175 | /* PCI dev ID */ | ||
176 | STORE_LE32((u32 *) &mdev_state->vconfig[0x0], 0x32534348); | ||
177 | |||
178 | /* Control: I/O+, Mem-, BusMaster- */ | ||
179 | STORE_LE16((u16 *) &mdev_state->vconfig[0x4], 0x0001); | ||
180 | |||
181 | /* Status: capabilities list absent */ | ||
182 | STORE_LE16((u16 *) &mdev_state->vconfig[0x6], 0x0200); | ||
183 | |||
184 | /* Rev ID */ | ||
185 | mdev_state->vconfig[0x8] = 0x10; | ||
186 | |||
187 | /* programming interface class : 16550-compatible serial controller */ | ||
188 | mdev_state->vconfig[0x9] = 0x02; | ||
189 | |||
190 | /* Sub class : 00 */ | ||
191 | mdev_state->vconfig[0xa] = 0x00; | ||
192 | |||
193 | /* Base class : Simple Communication controllers */ | ||
194 | mdev_state->vconfig[0xb] = 0x07; | ||
195 | |||
196 | /* base address registers */ | ||
197 | /* BAR0: IO space */ | ||
198 | STORE_LE32((u32 *) &mdev_state->vconfig[0x10], 0x000001); | ||
199 | mdev_state->bar_mask[0] = ~(MTTY_IO_BAR_SIZE) + 1; | ||
200 | |||
201 | if (mdev_state->nr_ports == 2) { | ||
202 | /* BAR1: IO space */ | ||
203 | STORE_LE32((u32 *) &mdev_state->vconfig[0x14], 0x000001); | ||
204 | mdev_state->bar_mask[1] = ~(MTTY_IO_BAR_SIZE) + 1; | ||
205 | } | ||
206 | |||
207 | /* Subsystem ID */ | ||
208 | STORE_LE32((u32 *) &mdev_state->vconfig[0x2c], 0x32534348); | ||
209 | |||
210 | mdev_state->vconfig[0x34] = 0x00; /* Cap Ptr */ | ||
211 | mdev_state->vconfig[0x3d] = 0x01; /* interrupt pin (INTA#) */ | ||
212 | |||
213 | /* Vendor specific data */ | ||
214 | mdev_state->vconfig[0x40] = 0x23; | ||
215 | mdev_state->vconfig[0x43] = 0x80; | ||
216 | mdev_state->vconfig[0x44] = 0x23; | ||
217 | mdev_state->vconfig[0x48] = 0x23; | ||
218 | mdev_state->vconfig[0x4c] = 0x23; | ||
219 | |||
220 | mdev_state->vconfig[0x60] = 0x50; | ||
221 | mdev_state->vconfig[0x61] = 0x43; | ||
222 | mdev_state->vconfig[0x62] = 0x49; | ||
223 | mdev_state->vconfig[0x63] = 0x20; | ||
224 | mdev_state->vconfig[0x64] = 0x53; | ||
225 | mdev_state->vconfig[0x65] = 0x65; | ||
226 | mdev_state->vconfig[0x66] = 0x72; | ||
227 | mdev_state->vconfig[0x67] = 0x69; | ||
228 | mdev_state->vconfig[0x68] = 0x61; | ||
229 | mdev_state->vconfig[0x69] = 0x6c; | ||
230 | mdev_state->vconfig[0x6a] = 0x2f; | ||
231 | mdev_state->vconfig[0x6b] = 0x55; | ||
232 | mdev_state->vconfig[0x6c] = 0x41; | ||
233 | mdev_state->vconfig[0x6d] = 0x52; | ||
234 | mdev_state->vconfig[0x6e] = 0x54; | ||
235 | } | ||
236 | |||
237 | static void handle_pci_cfg_write(struct mdev_state *mdev_state, u16 offset, | ||
238 | u8 *buf, u32 count) | ||
239 | { | ||
240 | u32 cfg_addr, bar_mask, bar_index = 0; | ||
241 | |||
242 | switch (offset) { | ||
243 | case 0x04: /* device control */ | ||
244 | case 0x06: /* device status */ | ||
245 | /* do nothing */ | ||
246 | break; | ||
247 | case 0x3c: /* interrupt line */ | ||
248 | mdev_state->vconfig[0x3c] = buf[0]; | ||
249 | break; | ||
250 | case 0x3d: | ||
251 | /* | ||
252 | * Interrupt Pin is hardwired to INTA. | ||
253 | * This field is write protected by hardware | ||
254 | */ | ||
255 | break; | ||
256 | case 0x10: /* BAR0 */ | ||
257 | case 0x14: /* BAR1 */ | ||
258 | if (offset == 0x10) | ||
259 | bar_index = 0; | ||
260 | else if (offset == 0x14) | ||
261 | bar_index = 1; | ||
262 | |||
263 | if ((mdev_state->nr_ports == 1) && (bar_index == 1)) { | ||
264 | STORE_LE32(&mdev_state->vconfig[offset], 0); | ||
265 | break; | ||
266 | } | ||
267 | |||
268 | cfg_addr = *(u32 *)buf; | ||
269 | pr_info("BAR%d addr 0x%x\n", bar_index, cfg_addr); | ||
270 | |||
271 | if (cfg_addr == 0xffffffff) { | ||
272 | bar_mask = mdev_state->bar_mask[bar_index]; | ||
273 | cfg_addr = (cfg_addr & bar_mask); | ||
274 | } | ||
275 | |||
276 | cfg_addr |= (mdev_state->vconfig[offset] & 0x3ul); | ||
277 | STORE_LE32(&mdev_state->vconfig[offset], cfg_addr); | ||
278 | break; | ||
279 | case 0x18: /* BAR2 */ | ||
280 | case 0x1c: /* BAR3 */ | ||
281 | case 0x20: /* BAR4 */ | ||
282 | STORE_LE32(&mdev_state->vconfig[offset], 0); | ||
283 | break; | ||
284 | default: | ||
285 | pr_info("PCI config write @0x%x of %d bytes not handled\n", | ||
286 | offset, count); | ||
287 | break; | ||
288 | } | ||
289 | } | ||
290 | |||
291 | static void handle_bar_write(unsigned int index, struct mdev_state *mdev_state, | ||
292 | u16 offset, u8 *buf, u32 count) | ||
293 | { | ||
294 | u8 data = *buf; | ||
295 | |||
296 | /* Handle data written by guest */ | ||
297 | switch (offset) { | ||
298 | case UART_TX: | ||
299 | /* if DLAB set, data is LSB of divisor */ | ||
300 | if (mdev_state->s[index].dlab) { | ||
301 | mdev_state->s[index].divisor |= data; | ||
302 | break; | ||
303 | } | ||
304 | |||
305 | mutex_lock(&mdev_state->rxtx_lock); | ||
306 | |||
307 | /* save in TX buffer */ | ||
308 | if (mdev_state->s[index].rxtx.count < | ||
309 | mdev_state->s[index].max_fifo_size) { | ||
310 | mdev_state->s[index].rxtx.fifo[ | ||
311 | mdev_state->s[index].rxtx.head] = data; | ||
312 | mdev_state->s[index].rxtx.count++; | ||
313 | CIRCULAR_BUF_INC_IDX(mdev_state->s[index].rxtx.head); | ||
314 | mdev_state->s[index].overrun = false; | ||
315 | |||
316 | /* | ||
317 | * Trigger interrupt if receive data interrupt is | ||
318 | * enabled and fifo reached trigger level | ||
319 | */ | ||
320 | if ((mdev_state->s[index].uart_reg[UART_IER] & | ||
321 | UART_IER_RDI) && | ||
322 | (mdev_state->s[index].rxtx.count == | ||
323 | mdev_state->s[index].intr_trigger_level)) { | ||
324 | /* trigger interrupt */ | ||
325 | #if defined(DEBUG_INTR) | ||
326 | pr_err("Serial port %d: Fifo level trigger\n", | ||
327 | index); | ||
328 | #endif | ||
329 | mtty_trigger_interrupt(mdev_state); | ||
330 | } | ||
331 | } else { | ||
332 | #if defined(DEBUG_INTR) | ||
333 | pr_err("Serial port %d: Buffer Overflow\n", index); | ||
334 | #endif | ||
335 | mdev_state->s[index].overrun = true; | ||
336 | |||
337 | /* | ||
338 | * Trigger interrupt if receiver line status interrupt | ||
339 | * is enabled | ||
340 | */ | ||
341 | if (mdev_state->s[index].uart_reg[UART_IER] & | ||
342 | UART_IER_RLSI) | ||
343 | mtty_trigger_interrupt(mdev_state); | ||
344 | } | ||
345 | mutex_unlock(&mdev_state->rxtx_lock); | ||
346 | break; | ||
347 | |||
348 | case UART_IER: | ||
349 | /* if DLAB set, data is MSB of divisor */ | ||
350 | if (mdev_state->s[index].dlab) | ||
351 | mdev_state->s[index].divisor |= (u16)data << 8; | ||
352 | else { | ||
353 | mdev_state->s[index].uart_reg[offset] = data; | ||
354 | mutex_lock(&mdev_state->rxtx_lock); | ||
355 | if ((data & UART_IER_THRI) && | ||
356 | (mdev_state->s[index].rxtx.head == | ||
357 | mdev_state->s[index].rxtx.tail)) { | ||
358 | #if defined(DEBUG_INTR) | ||
359 | pr_err("Serial port %d: IER_THRI write\n", | ||
360 | index); | ||
361 | #endif | ||
362 | mtty_trigger_interrupt(mdev_state); | ||
363 | } | ||
364 | |||
365 | mutex_unlock(&mdev_state->rxtx_lock); | ||
366 | } | ||
367 | |||
368 | break; | ||
369 | |||
370 | case UART_FCR: | ||
371 | mdev_state->s[index].fcr = data; | ||
372 | |||
373 | mutex_lock(&mdev_state->rxtx_lock); | ||
374 | if (data & (UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT)) { | ||
375 | /* clear loop back FIFO */ | ||
376 | mdev_state->s[index].rxtx.count = 0; | ||
377 | mdev_state->s[index].rxtx.head = 0; | ||
378 | mdev_state->s[index].rxtx.tail = 0; | ||
379 | } | ||
380 | mutex_unlock(&mdev_state->rxtx_lock); | ||
381 | |||
382 | switch (data & UART_FCR_TRIGGER_MASK) { | ||
383 | case UART_FCR_TRIGGER_1: | ||
384 | mdev_state->s[index].intr_trigger_level = 1; | ||
385 | break; | ||
386 | |||
387 | case UART_FCR_TRIGGER_4: | ||
388 | mdev_state->s[index].intr_trigger_level = 4; | ||
389 | break; | ||
390 | |||
391 | case UART_FCR_TRIGGER_8: | ||
392 | mdev_state->s[index].intr_trigger_level = 8; | ||
393 | break; | ||
394 | |||
395 | case UART_FCR_TRIGGER_14: | ||
396 | mdev_state->s[index].intr_trigger_level = 14; | ||
397 | break; | ||
398 | } | ||
399 | |||
400 | /* | ||
401 | * Otherwise set the trigger level to 1, or implement a timer with a | ||
402 | * timeout of 4 characters and, when that timer expires, set the | ||
403 | * Receive data timeout bit in the IIR register | ||
404 | */ | ||
405 | mdev_state->s[index].intr_trigger_level = 1; | ||
406 | if (data & UART_FCR_ENABLE_FIFO) | ||
407 | mdev_state->s[index].max_fifo_size = MAX_FIFO_SIZE; | ||
408 | else { | ||
409 | mdev_state->s[index].max_fifo_size = 1; | ||
410 | mdev_state->s[index].intr_trigger_level = 1; | ||
411 | } | ||
412 | |||
413 | break; | ||
414 | |||
415 | case UART_LCR: | ||
416 | if (data & UART_LCR_DLAB) { | ||
417 | mdev_state->s[index].dlab = true; | ||
418 | mdev_state->s[index].divisor = 0; | ||
419 | } else | ||
420 | mdev_state->s[index].dlab = false; | ||
421 | |||
422 | mdev_state->s[index].uart_reg[offset] = data; | ||
423 | break; | ||
424 | |||
425 | case UART_MCR: | ||
426 | mdev_state->s[index].uart_reg[offset] = data; | ||
427 | |||
428 | if ((mdev_state->s[index].uart_reg[UART_IER] & UART_IER_MSI) && | ||
429 | (data & UART_MCR_OUT2)) { | ||
430 | #if defined(DEBUG_INTR) | ||
431 | pr_err("Serial port %d: MCR_OUT2 write\n", index); | ||
432 | #endif | ||
433 | mtty_trigger_interrupt(mdev_state); | ||
434 | } | ||
435 | |||
436 | if ((mdev_state->s[index].uart_reg[UART_IER] & UART_IER_MSI) && | ||
437 | (data & (UART_MCR_RTS | UART_MCR_DTR))) { | ||
438 | #if defined(DEBUG_INTR) | ||
439 | pr_err("Serial port %d: MCR RTS/DTR write\n", index); | ||
440 | #endif | ||
441 | mtty_trigger_interrupt(mdev_state); | ||
442 | } | ||
443 | break; | ||
444 | |||
445 | case UART_LSR: | ||
446 | case UART_MSR: | ||
447 | /* do nothing */ | ||
448 | break; | ||
449 | |||
450 | case UART_SCR: | ||
451 | mdev_state->s[index].uart_reg[offset] = data; | ||
452 | break; | ||
453 | |||
454 | default: | ||
455 | break; | ||
456 | } | ||
457 | } | ||
458 | |||
459 | static void handle_bar_read(unsigned int index, struct mdev_state *mdev_state, | ||
460 | u16 offset, u8 *buf, u32 count) | ||
461 | { | ||
462 | /* Handle read requests by guest */ | ||
463 | switch (offset) { | ||
464 | case UART_RX: | ||
465 | /* if DLAB set, data is LSB of divisor */ | ||
466 | if (mdev_state->s[index].dlab) { | ||
467 | *buf = (u8)mdev_state->s[index].divisor; | ||
468 | break; | ||
469 | } | ||
470 | |||
471 | mutex_lock(&mdev_state->rxtx_lock); | ||
472 | /* return data in tx buffer */ | ||
473 | if (mdev_state->s[index].rxtx.head != | ||
474 | mdev_state->s[index].rxtx.tail) { | ||
475 | *buf = mdev_state->s[index].rxtx.fifo[ | ||
476 | mdev_state->s[index].rxtx.tail]; | ||
477 | mdev_state->s[index].rxtx.count--; | ||
478 | CIRCULAR_BUF_INC_IDX(mdev_state->s[index].rxtx.tail); | ||
479 | } | ||
480 | |||
481 | if (mdev_state->s[index].rxtx.head == | ||
482 | mdev_state->s[index].rxtx.tail) { | ||
483 | /* | ||
484 | * Trigger interrupt if tx buffer empty interrupt is | ||
485 | * enabled and fifo is empty | ||
486 | */ | ||
487 | #if defined(DEBUG_INTR) | ||
488 | pr_err("Serial port %d: Buffer Empty\n", index); | ||
489 | #endif | ||
490 | if (mdev_state->s[index].uart_reg[UART_IER] & | ||
491 | UART_IER_THRI) | ||
492 | mtty_trigger_interrupt(mdev_state); | ||
493 | } | ||
494 | mutex_unlock(&mdev_state->rxtx_lock); | ||
495 | |||
496 | break; | ||
497 | |||
498 | case UART_IER: | ||
499 | if (mdev_state->s[index].dlab) { | ||
500 | *buf = (u8)(mdev_state->s[index].divisor >> 8); | ||
501 | break; | ||
502 | } | ||
503 | *buf = mdev_state->s[index].uart_reg[offset] & 0x0f; | ||
504 | break; | ||
505 | |||
506 | case UART_IIR: | ||
507 | { | ||
508 | u8 ier = mdev_state->s[index].uart_reg[UART_IER]; | ||
509 | *buf = 0; | ||
510 | |||
511 | mutex_lock(&mdev_state->rxtx_lock); | ||
512 | /* Interrupt priority 1: Parity, overrun, framing or break */ | ||
513 | if ((ier & UART_IER_RLSI) && mdev_state->s[index].overrun) | ||
514 | *buf |= UART_IIR_RLSI; | ||
515 | |||
516 | /* Interrupt priority 2: Fifo trigger level reached */ | ||
517 | if ((ier & UART_IER_RDI) && | ||
518 | (mdev_state->s[index].rxtx.count >= | ||
519 | mdev_state->s[index].intr_trigger_level)) | ||
520 | *buf |= UART_IIR_RDI; | ||
521 | |||
522 | /* Interrupt priority 3: transmitter holding register empty */ | ||
523 | if ((ier & UART_IER_THRI) && | ||
524 | (mdev_state->s[index].rxtx.head == | ||
525 | mdev_state->s[index].rxtx.tail)) | ||
526 | *buf |= UART_IIR_THRI; | ||
527 | |||
528 | /* Interrupt priority 4: Modem status: CTS, DSR, RI or DCD */ | ||
529 | if ((ier & UART_IER_MSI) && | ||
530 | (mdev_state->s[index].uart_reg[UART_MCR] & | ||
531 | (UART_MCR_RTS | UART_MCR_DTR))) | ||
532 | *buf |= UART_IIR_MSI; | ||
533 | |||
534 | /* bit0: 0=> interrupt pending, 1=> no interrupt is pending */ | ||
535 | if (*buf == 0) | ||
536 | *buf = UART_IIR_NO_INT; | ||
537 | |||
538 | /* set bits 6 & 7 to be 16550 compatible */ | ||
539 | *buf |= 0xC0; | ||
540 | mutex_unlock(&mdev_state->rxtx_lock); | ||
541 | } | ||
542 | break; | ||
543 | |||
544 | case UART_LCR: | ||
545 | case UART_MCR: | ||
546 | *buf = mdev_state->s[index].uart_reg[offset]; | ||
547 | break; | ||
548 | |||
549 | case UART_LSR: | ||
550 | { | ||
551 | u8 lsr = 0; | ||
552 | |||
553 | mutex_lock(&mdev_state->rxtx_lock); | ||
554 | /* at least one char in FIFO */ | ||
555 | if (mdev_state->s[index].rxtx.head != | ||
556 | mdev_state->s[index].rxtx.tail) | ||
557 | lsr |= UART_LSR_DR; | ||
558 | |||
559 | /* if FIFO overrun */ | ||
560 | if (mdev_state->s[index].overrun) | ||
561 | lsr |= UART_LSR_OE; | ||
562 | |||
563 | /* transmit FIFO empty and transmitter empty */ | ||
564 | if (mdev_state->s[index].rxtx.head == | ||
565 | mdev_state->s[index].rxtx.tail) | ||
566 | lsr |= UART_LSR_TEMT | UART_LSR_THRE; | ||
567 | |||
568 | mutex_unlock(&mdev_state->rxtx_lock); | ||
569 | *buf = lsr; | ||
570 | break; | ||
571 | } | ||
572 | case UART_MSR: | ||
573 | *buf = UART_MSR_DSR | UART_MSR_DDSR | UART_MSR_DCD; | ||
574 | |||
575 | mutex_lock(&mdev_state->rxtx_lock); | ||
576 | /* if AFE is 1 and the FIFO has space, set CTS bit */ | ||
577 | if (mdev_state->s[index].uart_reg[UART_MCR] & | ||
578 | UART_MCR_AFE) { | ||
579 | if (mdev_state->s[index].rxtx.count < | ||
580 | mdev_state->s[index].max_fifo_size) | ||
581 | *buf |= UART_MSR_CTS | UART_MSR_DCTS; | ||
582 | } else | ||
583 | *buf |= UART_MSR_CTS | UART_MSR_DCTS; | ||
584 | mutex_unlock(&mdev_state->rxtx_lock); | ||
585 | |||
586 | break; | ||
587 | |||
588 | case UART_SCR: | ||
589 | *buf = mdev_state->s[index].uart_reg[offset]; | ||
590 | break; | ||
591 | |||
592 | default: | ||
593 | break; | ||
594 | } | ||
595 | } | ||
596 | |||
597 | static void mdev_read_base(struct mdev_state *mdev_state) | ||
598 | { | ||
599 | int index, pos; | ||
600 | u32 start_lo, start_hi; | ||
601 | u32 mem_type; | ||
602 | |||
603 | pos = PCI_BASE_ADDRESS_0; | ||
604 | |||
605 | for (index = 0; index <= VFIO_PCI_BAR5_REGION_INDEX; index++) { | ||
606 | |||
607 | if (!mdev_state->region_info[index].size) | ||
608 | continue; | ||
609 | |||
610 | start_lo = (*(u32 *)(mdev_state->vconfig + pos)) & | ||
611 | PCI_BASE_ADDRESS_MEM_MASK; | ||
612 | mem_type = (*(u32 *)(mdev_state->vconfig + pos)) & | ||
613 | PCI_BASE_ADDRESS_MEM_TYPE_MASK; | ||
614 | |||
615 | switch (mem_type) { | ||
616 | case PCI_BASE_ADDRESS_MEM_TYPE_64: | ||
617 | start_hi = (*(u32 *)(mdev_state->vconfig + pos + 4)); | ||
618 | pos += 4; | ||
619 | break; | ||
620 | case PCI_BASE_ADDRESS_MEM_TYPE_32: | ||
621 | case PCI_BASE_ADDRESS_MEM_TYPE_1M: | ||
622 | /* 1M mem BAR treated as 32-bit BAR */ | ||
623 | default: | ||
624 | /* unknown mem type treated as 32-bit BAR */ | ||
625 | start_hi = 0; | ||
626 | break; | ||
627 | } | ||
628 | pos += 4; | ||
629 | mdev_state->region_info[index].start = ((u64)start_hi << 32) | | ||
630 | start_lo; | ||
631 | } | ||
632 | } | ||
633 | |||
634 | static ssize_t mdev_access(struct mdev_device *mdev, u8 *buf, size_t count, | ||
635 | loff_t pos, bool is_write) | ||
636 | { | ||
637 | struct mdev_state *mdev_state; | ||
638 | unsigned int index; | ||
639 | loff_t offset; | ||
640 | int ret = 0; | ||
641 | |||
642 | if (!mdev || !buf) | ||
643 | return -EINVAL; | ||
644 | |||
645 | mdev_state = mdev_get_drvdata(mdev); | ||
646 | if (!mdev_state) { | ||
647 | pr_err("%s mdev_state not found\n", __func__); | ||
648 | return -EINVAL; | ||
649 | } | ||
650 | |||
651 | mutex_lock(&mdev_state->ops_lock); | ||
652 | |||
653 | index = MTTY_VFIO_PCI_OFFSET_TO_INDEX(pos); | ||
654 | offset = pos & MTTY_VFIO_PCI_OFFSET_MASK; | ||
655 | switch (index) { | ||
656 | case VFIO_PCI_CONFIG_REGION_INDEX: | ||
657 | |||
658 | #if defined(DEBUG) | ||
659 | pr_info("%s: PCI config space %s at offset 0x%llx\n", | ||
660 | __func__, is_write ? "write" : "read", offset); | ||
661 | #endif | ||
662 | if (is_write) { | ||
663 | dump_buffer(buf, count); | ||
664 | handle_pci_cfg_write(mdev_state, offset, buf, count); | ||
665 | } else { | ||
666 | memcpy(buf, (mdev_state->vconfig + offset), count); | ||
667 | dump_buffer(buf, count); | ||
668 | } | ||
669 | |||
670 | break; | ||
671 | |||
672 | case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: | ||
673 | if (!mdev_state->region_info[index].start) | ||
674 | mdev_read_base(mdev_state); | ||
675 | |||
676 | if (is_write) { | ||
677 | dump_buffer(buf, count); | ||
678 | |||
679 | #if defined(DEBUG_REGS) | ||
680 | pr_info("%s: BAR%d WR @0x%llx %s val:0x%02x dlab:%d\n", | ||
681 | __func__, index, offset, wr_reg[offset], | ||
682 | *buf, mdev_state->s[index].dlab); | ||
683 | #endif | ||
684 | handle_bar_write(index, mdev_state, offset, buf, count); | ||
685 | } else { | ||
686 | handle_bar_read(index, mdev_state, offset, buf, count); | ||
687 | dump_buffer(buf, count); | ||
688 | |||
689 | #if defined(DEBUG_REGS) | ||
690 | pr_info("%s: BAR%d RD @0x%llx %s val:0x%02x dlab:%d\n", | ||
691 | __func__, index, offset, rd_reg[offset], | ||
692 | *buf, mdev_state->s[index].dlab); | ||
693 | #endif | ||
694 | } | ||
695 | break; | ||
696 | |||
697 | default: | ||
698 | ret = -1; | ||
699 | goto accessfailed; | ||
700 | } | ||
701 | |||
702 | ret = count; | ||
703 | |||
704 | |||
705 | accessfailed: | ||
706 | mutex_unlock(&mdev_state->ops_lock); | ||
707 | |||
708 | return ret; | ||
709 | } | ||
710 | |||
711 | static int mtty_create(struct kobject *kobj, struct mdev_device *mdev) | ||
712 | { | ||
713 | struct mdev_state *mdev_state; | ||
714 | char name[MTTY_STRING_LEN]; | ||
715 | int nr_ports = 0, i; | ||
716 | |||
717 | if (!mdev) | ||
718 | return -EINVAL; | ||
719 | |||
720 | for (i = 0; i < 2; i++) { | ||
721 | snprintf(name, MTTY_STRING_LEN, "%s-%d", | ||
722 | dev_driver_string(mdev_parent_dev(mdev)), i + 1); | ||
723 | if (!strcmp(kobj->name, name)) { | ||
724 | nr_ports = i + 1; | ||
725 | break; | ||
726 | } | ||
727 | } | ||
728 | |||
729 | if (!nr_ports) | ||
730 | return -EINVAL; | ||
731 | |||
732 | mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); | ||
733 | if (mdev_state == NULL) | ||
734 | return -ENOMEM; | ||
735 | |||
736 | mdev_state->nr_ports = nr_ports; | ||
737 | mdev_state->irq_index = -1; | ||
738 | mdev_state->s[0].max_fifo_size = MAX_FIFO_SIZE; | ||
739 | mdev_state->s[1].max_fifo_size = MAX_FIFO_SIZE; | ||
740 | mutex_init(&mdev_state->rxtx_lock); | ||
741 | mdev_state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL); | ||
742 | |||
743 | if (mdev_state->vconfig == NULL) { | ||
744 | kfree(mdev_state); | ||
745 | return -ENOMEM; | ||
746 | } | ||
747 | |||
748 | mutex_init(&mdev_state->ops_lock); | ||
749 | mdev_state->mdev = mdev; | ||
750 | mdev_set_drvdata(mdev, mdev_state); | ||
751 | |||
752 | mtty_create_config_space(mdev_state); | ||
753 | |||
754 | mutex_lock(&mdev_list_lock); | ||
755 | list_add(&mdev_state->next, &mdev_devices_list); | ||
756 | mutex_unlock(&mdev_list_lock); | ||
757 | |||
758 | return 0; | ||
759 | } | ||
760 | |||
761 | static int mtty_remove(struct mdev_device *mdev) | ||
762 | { | ||
763 | struct mdev_state *mds, *tmp_mds; | ||
764 | struct mdev_state *mdev_state = mdev_get_drvdata(mdev); | ||
765 | int ret = -EINVAL; | ||
766 | |||
767 | mutex_lock(&mdev_list_lock); | ||
768 | list_for_each_entry_safe(mds, tmp_mds, &mdev_devices_list, next) { | ||
769 | if (mdev_state == mds) { | ||
770 | list_del(&mdev_state->next); | ||
771 | mdev_set_drvdata(mdev, NULL); | ||
772 | kfree(mdev_state->vconfig); | ||
773 | kfree(mdev_state); | ||
774 | ret = 0; | ||
775 | break; | ||
776 | } | ||
777 | } | ||
778 | mutex_unlock(&mdev_list_lock); | ||
779 | |||
780 | return ret; | ||
781 | } | ||
782 | |||
783 | static int mtty_reset(struct mdev_device *mdev) | ||
784 | { | ||
785 | struct mdev_state *mdev_state; | ||
786 | |||
787 | if (!mdev) | ||
788 | return -EINVAL; | ||
789 | |||
790 | mdev_state = mdev_get_drvdata(mdev); | ||
791 | if (!mdev_state) | ||
792 | return -EINVAL; | ||
793 | |||
794 | pr_info("%s: called\n", __func__); | ||
795 | |||
796 | return 0; | ||
797 | } | ||
798 | |||
799 | static ssize_t mtty_read(struct mdev_device *mdev, char __user *buf, | ||
800 | size_t count, loff_t *ppos) | ||
801 | { | ||
802 | unsigned int done = 0; | ||
803 | int ret; | ||
804 | |||
805 | while (count) { | ||
806 | size_t filled; | ||
807 | |||
808 | if (count >= 4 && !(*ppos % 4)) { | ||
809 | u32 val; | ||
810 | |||
811 | ret = mdev_access(mdev, (u8 *)&val, sizeof(val), | ||
812 | *ppos, false); | ||
813 | if (ret <= 0) | ||
814 | goto read_err; | ||
815 | |||
816 | if (copy_to_user(buf, &val, sizeof(val))) | ||
817 | goto read_err; | ||
818 | |||
819 | filled = 4; | ||
820 | } else if (count >= 2 && !(*ppos % 2)) { | ||
821 | u16 val; | ||
822 | |||
823 | ret = mdev_access(mdev, (u8 *)&val, sizeof(val), | ||
824 | *ppos, false); | ||
825 | if (ret <= 0) | ||
826 | goto read_err; | ||
827 | |||
828 | if (copy_to_user(buf, &val, sizeof(val))) | ||
829 | goto read_err; | ||
830 | |||
831 | filled = 2; | ||
832 | } else { | ||
833 | u8 val; | ||
834 | |||
835 | ret = mdev_access(mdev, (u8 *)&val, sizeof(val), | ||
836 | *ppos, false); | ||
837 | if (ret <= 0) | ||
838 | goto read_err; | ||
839 | |||
840 | if (copy_to_user(buf, &val, sizeof(val))) | ||
841 | goto read_err; | ||
842 | |||
843 | filled = 1; | ||
844 | } | ||
845 | |||
846 | count -= filled; | ||
847 | done += filled; | ||
848 | *ppos += filled; | ||
849 | buf += filled; | ||
850 | } | ||
851 | |||
852 | return done; | ||
853 | |||
854 | read_err: | ||
855 | return -EFAULT; | ||
856 | } | ||
857 | |||
858 | static ssize_t mtty_write(struct mdev_device *mdev, const char __user *buf, | ||
859 | size_t count, loff_t *ppos) | ||
860 | { | ||
861 | unsigned int done = 0; | ||
862 | int ret; | ||
863 | |||
864 | while (count) { | ||
865 | size_t filled; | ||
866 | |||
867 | if (count >= 4 && !(*ppos % 4)) { | ||
868 | u32 val; | ||
869 | |||
870 | if (copy_from_user(&val, buf, sizeof(val))) | ||
871 | goto write_err; | ||
872 | |||
873 | ret = mdev_access(mdev, (u8 *)&val, sizeof(val), | ||
874 | *ppos, true); | ||
875 | if (ret <= 0) | ||
876 | goto write_err; | ||
877 | |||
878 | filled = 4; | ||
879 | } else if (count >= 2 && !(*ppos % 2)) { | ||
880 | u16 val; | ||
881 | |||
882 | if (copy_from_user(&val, buf, sizeof(val))) | ||
883 | goto write_err; | ||
884 | |||
885 | ret = mdev_access(mdev, (u8 *)&val, sizeof(val), | ||
886 | *ppos, true); | ||
887 | if (ret <= 0) | ||
888 | goto write_err; | ||
889 | |||
890 | filled = 2; | ||
891 | } else { | ||
892 | u8 val; | ||
893 | |||
894 | if (copy_from_user(&val, buf, sizeof(val))) | ||
895 | goto write_err; | ||
896 | |||
897 | ret = mdev_access(mdev, (u8 *)&val, sizeof(val), | ||
898 | *ppos, true); | ||
899 | if (ret <= 0) | ||
900 | goto write_err; | ||
901 | |||
902 | filled = 1; | ||
903 | } | ||
904 | count -= filled; | ||
905 | done += filled; | ||
906 | *ppos += filled; | ||
907 | buf += filled; | ||
908 | } | ||
909 | |||
910 | return done; | ||
911 | write_err: | ||
912 | return -EFAULT; | ||
913 | } | ||
914 | |||
915 | static int mtty_set_irqs(struct mdev_device *mdev, uint32_t flags, | ||
916 | unsigned int index, unsigned int start, | ||
917 | unsigned int count, void *data) | ||
918 | { | ||
919 | int ret = 0; | ||
920 | struct mdev_state *mdev_state; | ||
921 | |||
922 | if (!mdev) | ||
923 | return -EINVAL; | ||
924 | |||
925 | mdev_state = mdev_get_drvdata(mdev); | ||
926 | if (!mdev_state) | ||
927 | return -EINVAL; | ||
928 | |||
929 | mutex_lock(&mdev_state->ops_lock); | ||
930 | switch (index) { | ||
931 | case VFIO_PCI_INTX_IRQ_INDEX: | ||
932 | switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { | ||
933 | case VFIO_IRQ_SET_ACTION_MASK: | ||
934 | case VFIO_IRQ_SET_ACTION_UNMASK: | ||
935 | break; | ||
936 | case VFIO_IRQ_SET_ACTION_TRIGGER: | ||
937 | { | ||
938 | if (flags & VFIO_IRQ_SET_DATA_NONE) { | ||
939 | pr_info("%s: disable INTx\n", __func__); | ||
940 | if (mdev_state->intx_evtfd) | ||
941 | eventfd_ctx_put(mdev_state->intx_evtfd); | ||
942 | break; | ||
943 | } | ||
944 | |||
945 | if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { | ||
946 | int fd = *(int *)data; | ||
947 | |||
948 | if (fd > 0) { | ||
949 | struct eventfd_ctx *evt; | ||
950 | |||
951 | evt = eventfd_ctx_fdget(fd); | ||
952 | if (IS_ERR(evt)) { | ||
953 | ret = PTR_ERR(evt); | ||
954 | break; | ||
955 | } | ||
956 | mdev_state->intx_evtfd = evt; | ||
957 | mdev_state->irq_fd = fd; | ||
958 | mdev_state->irq_index = index; | ||
959 | break; | ||
960 | } | ||
961 | } | ||
962 | break; | ||
963 | } | ||
964 | } | ||
965 | break; | ||
966 | case VFIO_PCI_MSI_IRQ_INDEX: | ||
967 | switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { | ||
968 | case VFIO_IRQ_SET_ACTION_MASK: | ||
969 | case VFIO_IRQ_SET_ACTION_UNMASK: | ||
970 | break; | ||
971 | case VFIO_IRQ_SET_ACTION_TRIGGER: | ||
972 | if (flags & VFIO_IRQ_SET_DATA_NONE) { | ||
973 | if (mdev_state->msi_evtfd) | ||
974 | eventfd_ctx_put(mdev_state->msi_evtfd); | ||
975 | pr_info("%s: disable MSI\n", __func__); | ||
976 | mdev_state->irq_index = VFIO_PCI_INTX_IRQ_INDEX; | ||
977 | break; | ||
978 | } | ||
979 | if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { | ||
980 | int fd = *(int *)data; | ||
981 | struct eventfd_ctx *evt; | ||
982 | |||
983 | if (fd <= 0) | ||
984 | break; | ||
985 | |||
986 | if (mdev_state->msi_evtfd) | ||
987 | break; | ||
988 | |||
989 | evt = eventfd_ctx_fdget(fd); | ||
990 | if (IS_ERR(evt)) { | ||
991 | ret = PTR_ERR(evt); | ||
992 | break; | ||
993 | } | ||
994 | mdev_state->msi_evtfd = evt; | ||
995 | mdev_state->irq_fd = fd; | ||
996 | mdev_state->irq_index = index; | ||
997 | } | ||
998 | break; | ||
999 | } | ||
1000 | break; | ||
1001 | case VFIO_PCI_MSIX_IRQ_INDEX: | ||
1002 | pr_info("%s: MSIX_IRQ\n", __func__); | ||
1003 | break; | ||
1004 | case VFIO_PCI_ERR_IRQ_INDEX: | ||
1005 | pr_info("%s: ERR_IRQ\n", __func__); | ||
1006 | break; | ||
1007 | case VFIO_PCI_REQ_IRQ_INDEX: | ||
1008 | pr_info("%s: REQ_IRQ\n", __func__); | ||
1009 | break; | ||
1010 | } | ||
1011 | |||
1012 | mutex_unlock(&mdev_state->ops_lock); | ||
1013 | return ret; | ||
1014 | } | ||
1015 | |||
1016 | static int mtty_trigger_interrupt(struct mdev_state *mdev_state) | ||
1017 | { | ||
1018 | int ret = -1; | ||
1019 | |||
1020 | if ((mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX) && | ||
1021 | (!mdev_state->msi_evtfd)) | ||
1022 | return -EINVAL; | ||
1023 | else if ((mdev_state->irq_index == VFIO_PCI_INTX_IRQ_INDEX) && | ||
1024 | (!mdev_state->intx_evtfd)) { | ||
1025 | pr_info("%s: Intr eventfd not found\n", __func__); | ||
1026 | return -EINVAL; | ||
1027 | } | ||
1028 | |||
1029 | if (mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX) | ||
1030 | ret = eventfd_signal(mdev_state->msi_evtfd, 1); | ||
1031 | else | ||
1032 | ret = eventfd_signal(mdev_state->intx_evtfd, 1); | ||
1033 | |||
1034 | #if defined(DEBUG_INTR) | ||
1035 | pr_info("Intx triggered\n"); | ||
1036 | #endif | ||
1037 | if (ret != 1) | ||
1038 | pr_err("%s: eventfd signal failed (%d)\n", __func__, ret); | ||
1039 | |||
1040 | return ret; | ||
1041 | } | ||
1042 | |||
1043 | static int mtty_get_region_info(struct mdev_device *mdev, | ||
1044 | struct vfio_region_info *region_info, | ||
1045 | u16 *cap_type_id, void **cap_type) | ||
1046 | { | ||
1047 | unsigned int size = 0; | ||
1048 | struct mdev_state *mdev_state; | ||
1049 | u32 bar_index; | ||
1050 | |||
1051 | if (!mdev) | ||
1052 | return -EINVAL; | ||
1053 | |||
1054 | mdev_state = mdev_get_drvdata(mdev); | ||
1055 | if (!mdev_state) | ||
1056 | return -EINVAL; | ||
1057 | |||
1058 | bar_index = region_info->index; | ||
1059 | if (bar_index >= VFIO_PCI_NUM_REGIONS) | ||
1060 | return -EINVAL; | ||
1061 | |||
1062 | mutex_lock(&mdev_state->ops_lock); | ||
1063 | |||
1064 | switch (bar_index) { | ||
1065 | case VFIO_PCI_CONFIG_REGION_INDEX: | ||
1066 | size = MTTY_CONFIG_SPACE_SIZE; | ||
1067 | break; | ||
1068 | case VFIO_PCI_BAR0_REGION_INDEX: | ||
1069 | size = MTTY_IO_BAR_SIZE; | ||
1070 | break; | ||
1071 | case VFIO_PCI_BAR1_REGION_INDEX: | ||
1072 | if (mdev_state->nr_ports == 2) | ||
1073 | size = MTTY_IO_BAR_SIZE; | ||
1074 | break; | ||
1075 | default: | ||
1076 | size = 0; | ||
1077 | break; | ||
1078 | } | ||
1079 | |||
1080 | mdev_state->region_info[bar_index].size = size; | ||
1081 | mdev_state->region_info[bar_index].vfio_offset = | ||
1082 | MTTY_VFIO_PCI_INDEX_TO_OFFSET(bar_index); | ||
1083 | |||
1084 | region_info->size = size; | ||
1085 | region_info->offset = MTTY_VFIO_PCI_INDEX_TO_OFFSET(bar_index); | ||
1086 | region_info->flags = VFIO_REGION_INFO_FLAG_READ | | ||
1087 | VFIO_REGION_INFO_FLAG_WRITE; | ||
1088 | mutex_unlock(&mdev_state->ops_lock); | ||
1089 | return 0; | ||
1090 | } | ||
1091 | |||
1092 | static int mtty_get_irq_info(struct mdev_device *mdev, | ||
1093 | struct vfio_irq_info *irq_info) | ||
1094 | { | ||
1095 | switch (irq_info->index) { | ||
1096 | case VFIO_PCI_INTX_IRQ_INDEX: | ||
1097 | case VFIO_PCI_MSI_IRQ_INDEX: | ||
1098 | case VFIO_PCI_REQ_IRQ_INDEX: | ||
1099 | break; | ||
1100 | |||
1101 | default: | ||
1102 | return -EINVAL; | ||
1103 | } | ||
1104 | |||
1105 | irq_info->flags = VFIO_IRQ_INFO_EVENTFD; | ||
1106 | irq_info->count = 1; | ||
1107 | |||
1108 | if (irq_info->index == VFIO_PCI_INTX_IRQ_INDEX) | ||
1109 | irq_info->flags |= (VFIO_IRQ_INFO_MASKABLE | | ||
1110 | VFIO_IRQ_INFO_AUTOMASKED); | ||
1111 | else | ||
1112 | irq_info->flags |= VFIO_IRQ_INFO_NORESIZE; | ||
1113 | |||
1114 | return 0; | ||
1115 | } | ||
1116 | |||
1117 | static int mtty_get_device_info(struct mdev_device *mdev, | ||
1118 | struct vfio_device_info *dev_info) | ||
1119 | { | ||
1120 | dev_info->flags = VFIO_DEVICE_FLAGS_PCI; | ||
1121 | dev_info->num_regions = VFIO_PCI_NUM_REGIONS; | ||
1122 | dev_info->num_irqs = VFIO_PCI_NUM_IRQS; | ||
1123 | |||
1124 | return 0; | ||
1125 | } | ||
1126 | |||
1127 | static long mtty_ioctl(struct mdev_device *mdev, unsigned int cmd, | ||
1128 | unsigned long arg) | ||
1129 | { | ||
1130 | int ret = 0; | ||
1131 | unsigned long minsz; | ||
1132 | struct mdev_state *mdev_state; | ||
1133 | |||
1134 | if (!mdev) | ||
1135 | return -EINVAL; | ||
1136 | |||
1137 | mdev_state = mdev_get_drvdata(mdev); | ||
1138 | if (!mdev_state) | ||
1139 | return -ENODEV; | ||
1140 | |||
1141 | switch (cmd) { | ||
1142 | case VFIO_DEVICE_GET_INFO: | ||
1143 | { | ||
1144 | struct vfio_device_info info; | ||
1145 | |||
1146 | minsz = offsetofend(struct vfio_device_info, num_irqs); | ||
1147 | |||
1148 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
1149 | return -EFAULT; | ||
1150 | |||
1151 | if (info.argsz < minsz) | ||
1152 | return -EINVAL; | ||
1153 | |||
1154 | ret = mtty_get_device_info(mdev, &info); | ||
1155 | if (ret) | ||
1156 | return ret; | ||
1157 | |||
1158 | memcpy(&mdev_state->dev_info, &info, sizeof(info)); | ||
1159 | |||
1160 | if (copy_to_user((void __user *)arg, &info, minsz)) | ||
1161 | return -EFAULT; | ||
1162 | |||
1163 | return 0; | ||
1164 | } | ||
1165 | case VFIO_DEVICE_GET_REGION_INFO: | ||
1166 | { | ||
1167 | struct vfio_region_info info; | ||
1168 | u16 cap_type_id = 0; | ||
1169 | void *cap_type = NULL; | ||
1170 | |||
1171 | minsz = offsetofend(struct vfio_region_info, offset); | ||
1172 | |||
1173 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
1174 | return -EFAULT; | ||
1175 | |||
1176 | if (info.argsz < minsz) | ||
1177 | return -EINVAL; | ||
1178 | |||
1179 | ret = mtty_get_region_info(mdev, &info, &cap_type_id, | ||
1180 | &cap_type); | ||
1181 | if (ret) | ||
1182 | return ret; | ||
1183 | |||
1184 | if (copy_to_user((void __user *)arg, &info, minsz)) | ||
1185 | return -EFAULT; | ||
1186 | |||
1187 | return 0; | ||
1188 | } | ||
1189 | |||
1190 | case VFIO_DEVICE_GET_IRQ_INFO: | ||
1191 | { | ||
1192 | struct vfio_irq_info info; | ||
1193 | |||
1194 | minsz = offsetofend(struct vfio_irq_info, count); | ||
1195 | |||
1196 | if (copy_from_user(&info, (void __user *)arg, minsz)) | ||
1197 | return -EFAULT; | ||
1198 | |||
1199 | if ((info.argsz < minsz) || | ||
1200 | (info.index >= mdev_state->dev_info.num_irqs)) | ||
1201 | return -EINVAL; | ||
1202 | |||
1203 | ret = mtty_get_irq_info(mdev, &info); | ||
1204 | if (ret) | ||
1205 | return ret; | ||
1206 | |||
1207 | if (copy_to_user((void __user *)arg, &info, minsz)) | ||
1208 | return -EFAULT; | ||
1209 | |||
1210 | return 0; | ||
1211 | } | ||
1212 | case VFIO_DEVICE_SET_IRQS: | ||
1213 | { | ||
1214 | struct vfio_irq_set hdr; | ||
1215 | u8 *data = NULL, *ptr = NULL; | ||
1216 | size_t data_size = 0; | ||
1217 | |||
1218 | minsz = offsetofend(struct vfio_irq_set, count); | ||
1219 | |||
1220 | if (copy_from_user(&hdr, (void __user *)arg, minsz)) | ||
1221 | return -EFAULT; | ||
1222 | |||
1223 | ret = vfio_set_irqs_validate_and_prepare(&hdr, | ||
1224 | mdev_state->dev_info.num_irqs, | ||
1225 | VFIO_PCI_NUM_IRQS, | ||
1226 | &data_size); | ||
1227 | if (ret) | ||
1228 | return ret; | ||
1229 | |||
1230 | if (data_size) { | ||
1231 | ptr = data = memdup_user((void __user *)(arg + minsz), | ||
1232 | data_size); | ||
1233 | if (IS_ERR(data)) | ||
1234 | return PTR_ERR(data); | ||
1235 | } | ||
1236 | |||
1237 | ret = mtty_set_irqs(mdev, hdr.flags, hdr.index, hdr.start, | ||
1238 | hdr.count, data); | ||
1239 | |||
1240 | kfree(ptr); | ||
1241 | return ret; | ||
1242 | } | ||
1243 | case VFIO_DEVICE_RESET: | ||
1244 | return mtty_reset(mdev); | ||
1245 | } | ||
1246 | return -ENOTTY; | ||
1247 | } | ||
1248 | |||
1249 | static int mtty_open(struct mdev_device *mdev) | ||
1250 | { | ||
1251 | pr_info("%s\n", __func__); | ||
1252 | return 0; | ||
1253 | } | ||
1254 | |||
1255 | static void mtty_close(struct mdev_device *mdev) | ||
1256 | { | ||
1257 | pr_info("%s\n", __func__); | ||
1258 | } | ||
1259 | |||
1260 | static ssize_t | ||
1261 | sample_mtty_dev_show(struct device *dev, struct device_attribute *attr, | ||
1262 | char *buf) | ||
1263 | { | ||
1264 | return sprintf(buf, "This is a phy device\n"); | ||
1265 | } | ||
1266 | |||
1267 | static DEVICE_ATTR_RO(sample_mtty_dev); | ||
1268 | |||
1269 | static struct attribute *mtty_dev_attrs[] = { | ||
1270 | &dev_attr_sample_mtty_dev.attr, | ||
1271 | NULL, | ||
1272 | }; | ||
1273 | |||
1274 | static const struct attribute_group mtty_dev_group = { | ||
1275 | .name = "mtty_dev", | ||
1276 | .attrs = mtty_dev_attrs, | ||
1277 | }; | ||
1278 | |||
1279 | static const struct attribute_group *mtty_dev_groups[] = { | ||
1280 | &mtty_dev_group, | ||
1281 | NULL, | ||
1282 | }; | ||
1283 | |||
1284 | static ssize_t | ||
1285 | sample_mdev_dev_show(struct device *dev, struct device_attribute *attr, | ||
1286 | char *buf) | ||
1287 | { | ||
1288 | if (mdev_from_dev(dev)) | ||
1289 | return sprintf(buf, "This is MDEV %s\n", dev_name(dev)); | ||
1290 | |||
1291 | return sprintf(buf, "\n"); | ||
1292 | } | ||
1293 | |||
1294 | static DEVICE_ATTR_RO(sample_mdev_dev); | ||
1295 | |||
1296 | static struct attribute *mdev_dev_attrs[] = { | ||
1297 | &dev_attr_sample_mdev_dev.attr, | ||
1298 | NULL, | ||
1299 | }; | ||
1300 | |||
1301 | static const struct attribute_group mdev_dev_group = { | ||
1302 | .name = "vendor", | ||
1303 | .attrs = mdev_dev_attrs, | ||
1304 | }; | ||
1305 | |||
1306 | static const struct attribute_group *mdev_dev_groups[] = { | ||
1307 | &mdev_dev_group, | ||
1308 | NULL, | ||
1309 | }; | ||
1310 | |||
1311 | static ssize_t | ||
1312 | name_show(struct kobject *kobj, struct device *dev, char *buf) | ||
1313 | { | ||
1314 | char name[MTTY_STRING_LEN]; | ||
1315 | int i; | ||
1316 | const char *name_str[2] = {"Single port serial", "Dual port serial"}; | ||
1317 | |||
1318 | for (i = 0; i < 2; i++) { | ||
1319 | snprintf(name, MTTY_STRING_LEN, "%s-%d", | ||
1320 | dev_driver_string(dev), i + 1); | ||
1321 | if (!strcmp(kobj->name, name)) | ||
1322 | return sprintf(buf, "%s\n", name_str[i]); | ||
1323 | } | ||
1324 | |||
1325 | return -EINVAL; | ||
1326 | } | ||
1327 | |||
1328 | static MDEV_TYPE_ATTR_RO(name); | ||
1329 | |||
1330 | static ssize_t | ||
1331 | available_instances_show(struct kobject *kobj, struct device *dev, char *buf) | ||
1332 | { | ||
1333 | char name[MTTY_STRING_LEN]; | ||
1334 | int i; | ||
1335 | struct mdev_state *mds; | ||
1336 | int ports = 0, used = 0; | ||
1337 | |||
1338 | for (i = 0; i < 2; i++) { | ||
1339 | snprintf(name, MTTY_STRING_LEN, "%s-%d", | ||
1340 | dev_driver_string(dev), i + 1); | ||
1341 | if (!strcmp(kobj->name, name)) { | ||
1342 | ports = i + 1; | ||
1343 | break; | ||
1344 | } | ||
1345 | } | ||
1346 | |||
1347 | if (!ports) | ||
1348 | return -EINVAL; | ||
1349 | |||
1350 | list_for_each_entry(mds, &mdev_devices_list, next) | ||
1351 | used += mds->nr_ports; | ||
1352 | |||
1353 | return sprintf(buf, "%d\n", (MAX_MTTYS - used)/ports); | ||
1354 | } | ||
1355 | |||
1356 | static MDEV_TYPE_ATTR_RO(available_instances); | ||
1357 | |||
1358 | |||
1359 | static ssize_t device_api_show(struct kobject *kobj, struct device *dev, | ||
1360 | char *buf) | ||
1361 | { | ||
1362 | return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); | ||
1363 | } | ||
1364 | |||
1365 | static MDEV_TYPE_ATTR_RO(device_api); | ||
1366 | |||
1367 | static struct attribute *mdev_types_attrs[] = { | ||
1368 | &mdev_type_attr_name.attr, | ||
1369 | &mdev_type_attr_device_api.attr, | ||
1370 | &mdev_type_attr_available_instances.attr, | ||
1371 | NULL, | ||
1372 | }; | ||
1373 | |||
1374 | static struct attribute_group mdev_type_group1 = { | ||
1375 | .name = "1", | ||
1376 | .attrs = mdev_types_attrs, | ||
1377 | }; | ||
1378 | |||
1379 | static struct attribute_group mdev_type_group2 = { | ||
1380 | .name = "2", | ||
1381 | .attrs = mdev_types_attrs, | ||
1382 | }; | ||
1383 | |||
1384 | static struct attribute_group *mdev_type_groups[] = { | ||
1385 | &mdev_type_group1, | ||
1386 | &mdev_type_group2, | ||
1387 | NULL, | ||
1388 | }; | ||
1389 | |||
1390 | static const struct mdev_parent_ops mdev_fops = { | ||
1391 | .owner = THIS_MODULE, | ||
1392 | .dev_attr_groups = mtty_dev_groups, | ||
1393 | .mdev_attr_groups = mdev_dev_groups, | ||
1394 | .supported_type_groups = mdev_type_groups, | ||
1395 | .create = mtty_create, | ||
1396 | .remove = mtty_remove, | ||
1397 | .open = mtty_open, | ||
1398 | .release = mtty_close, | ||
1399 | .read = mtty_read, | ||
1400 | .write = mtty_write, | ||
1401 | .ioctl = mtty_ioctl, | ||
1402 | }; | ||
1403 | |||
1404 | static void mtty_device_release(struct device *dev) | ||
1405 | { | ||
1406 | dev_dbg(dev, "mtty: released\n"); | ||
1407 | } | ||
1408 | |||
1409 | static int __init mtty_dev_init(void) | ||
1410 | { | ||
1411 | int ret = 0; | ||
1412 | |||
1413 | pr_info("mtty_dev: %s\n", __func__); | ||
1414 | |||
1415 | memset(&mtty_dev, 0, sizeof(mtty_dev)); | ||
1416 | |||
1417 | idr_init(&mtty_dev.vd_idr); | ||
1418 | |||
1419 | ret = alloc_chrdev_region(&mtty_dev.vd_devt, 0, MINORMASK + 1, | ||
1420 | MTTY_NAME); | ||
1421 | |||
1422 | if (ret < 0) { | ||
1423 | pr_err("Error: failed to register mtty_dev, err:%d\n", ret); | ||
1424 | return ret; | ||
1425 | } | ||
1426 | |||
1427 | cdev_init(&mtty_dev.vd_cdev, &vd_fops); | ||
1428 | cdev_add(&mtty_dev.vd_cdev, mtty_dev.vd_devt, MINORMASK + 1); | ||
1429 | |||
1430 | pr_info("major_number:%d\n", MAJOR(mtty_dev.vd_devt)); | ||
1431 | |||
1432 | mtty_dev.vd_class = class_create(THIS_MODULE, MTTY_CLASS_NAME); | ||
1433 | |||
1434 | if (IS_ERR(mtty_dev.vd_class)) { | ||
1435 | pr_err("Error: failed to register mtty_dev class\n"); | ||
1436 | ret = PTR_ERR(mtty_dev.vd_class); | ||
1437 | goto failed1; | ||
1438 | } | ||
1439 | |||
1440 | mtty_dev.dev.class = mtty_dev.vd_class; | ||
1441 | mtty_dev.dev.release = mtty_device_release; | ||
1442 | dev_set_name(&mtty_dev.dev, "%s", MTTY_NAME); | ||
1443 | |||
1444 | ret = device_register(&mtty_dev.dev); | ||
1445 | if (ret) | ||
1446 | goto failed2; | ||
1447 | |||
1448 | ret = mdev_register_device(&mtty_dev.dev, &mdev_fops); | ||
1449 | if (ret) | ||
1450 | goto failed3; | ||
1451 | |||
1452 | mutex_init(&mdev_list_lock); | ||
1453 | INIT_LIST_HEAD(&mdev_devices_list); | ||
1454 | |||
1455 | goto all_done; | ||
1456 | |||
1457 | failed3: | ||
1458 | |||
1459 | device_unregister(&mtty_dev.dev); | ||
1460 | failed2: | ||
1461 | class_destroy(mtty_dev.vd_class); | ||
1462 | |||
1463 | failed1: | ||
1464 | cdev_del(&mtty_dev.vd_cdev); | ||
1465 | unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1); | ||
1466 | |||
1467 | all_done: | ||
1468 | return ret; | ||
1469 | } | ||
1470 | |||
1471 | static void __exit mtty_dev_exit(void) | ||
1472 | { | ||
1473 | mtty_dev.dev.bus = NULL; | ||
1474 | mdev_unregister_device(&mtty_dev.dev); | ||
1475 | |||
1476 | device_unregister(&mtty_dev.dev); | ||
1477 | idr_destroy(&mtty_dev.vd_idr); | ||
1478 | cdev_del(&mtty_dev.vd_cdev); | ||
1479 | unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1); | ||
1480 | class_destroy(mtty_dev.vd_class); | ||
1481 | mtty_dev.vd_class = NULL; | ||
1482 | pr_info("mtty_dev: Unloaded!\n"); | ||
1483 | } | ||
1484 | |||
1485 | module_init(mtty_dev_init) | ||
1486 | module_exit(mtty_dev_exit) | ||
1487 | |||
1488 | MODULE_LICENSE("GPL v2"); | ||
1489 | MODULE_INFO(supported, "Test driver that simulates a serial port over PCI"); | ||
1490 | MODULE_VERSION(VERSION_STRING); | ||
1491 | MODULE_AUTHOR(DRIVER_AUTHOR); | ||
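With the parent device registered, the mtty sample is driven entirely from userspace through sysfs: writing a UUID to the create node of one of the two advertised types ("mtty-1" for a single port, "mtty-2" for two) instantiates a mediated device that mtty_create() then backs. A minimal sketch of that step follows; the sysfs path assumes the parent shows up under /sys/devices/virtual/mtty/mtty, which should be verified on the target system, and the UUID is an arbitrary example.

	/*
	 * Sketch only (not part of the sample): create a single-port mtty
	 * instance by writing a UUID into the mdev type's "create" node.
	 * The path assumes the parent device is /sys/devices/virtual/mtty/mtty.
	 */
	#include <stdio.h>

	int main(void)
	{
		const char *create =
			"/sys/devices/virtual/mtty/mtty/mdev_supported_types/mtty-1/create";
		const char *uuid = "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001"; /* any UUID */
		FILE *f = fopen(create, "w");

		if (!f) {
			perror(create);
			return 1;
		}
		if (fprintf(f, "%s\n", uuid) < 0 || fclose(f) != 0) {
			perror("write uuid");
			return 1;
		}
		printf("created mdev %s\n", uuid);
		return 0;
	}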
diff --git a/samples/vfs/.gitignore b/samples/vfs/.gitignore new file mode 100644 index 000000000..8fdabf7e5 --- /dev/null +++ b/samples/vfs/.gitignore | |||
@@ -0,0 +1,3 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | test-fsmount | ||
3 | test-statx | ||
diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile new file mode 100644 index 000000000..6377a6781 --- /dev/null +++ b/samples/vfs/Makefile | |||
@@ -0,0 +1,4 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | userprogs-always-y += test-fsmount test-statx | ||
3 | |||
4 | userccflags += -I usr/include | ||
diff --git a/samples/vfs/test-fsmount.c b/samples/vfs/test-fsmount.c new file mode 100644 index 000000000..50f47b72e --- /dev/null +++ b/samples/vfs/test-fsmount.c | |||
@@ -0,0 +1,129 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
2 | /* fd-based mount test. | ||
3 | * | ||
4 | * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. | ||
5 | * Written by David Howells (dhowells@redhat.com) | ||
6 | */ | ||
7 | |||
8 | #include <stdio.h> | ||
9 | #include <stdlib.h> | ||
10 | #include <unistd.h> | ||
11 | #include <errno.h> | ||
12 | #include <fcntl.h> | ||
13 | #include <sys/prctl.h> | ||
14 | #include <sys/wait.h> | ||
15 | #include <linux/mount.h> | ||
16 | #include <linux/unistd.h> | ||
17 | |||
18 | #define E(x) do { if ((x) == -1) { perror(#x); exit(1); } } while(0) | ||
19 | |||
20 | static void check_messages(int fd) | ||
21 | { | ||
22 | char buf[4096]; | ||
23 | int err, n; | ||
24 | |||
25 | err = errno; | ||
26 | |||
27 | for (;;) { | ||
28 | n = read(fd, buf, sizeof(buf)); | ||
29 | if (n < 0) | ||
30 | break; | ||
31 | n -= 2; | ||
32 | |||
33 | switch (buf[0]) { | ||
34 | case 'e': | ||
35 | fprintf(stderr, "Error: %*.*s\n", n, n, buf + 2); | ||
36 | break; | ||
37 | case 'w': | ||
38 | fprintf(stderr, "Warning: %*.*s\n", n, n, buf + 2); | ||
39 | break; | ||
40 | case 'i': | ||
41 | fprintf(stderr, "Info: %*.*s\n", n, n, buf + 2); | ||
42 | break; | ||
43 | } | ||
44 | } | ||
45 | |||
46 | errno = err; | ||
47 | } | ||
48 | |||
49 | static __attribute__((noreturn)) | ||
50 | void mount_error(int fd, const char *s) | ||
51 | { | ||
52 | check_messages(fd); | ||
53 | fprintf(stderr, "%s: %m\n", s); | ||
54 | exit(1); | ||
55 | } | ||
56 | |||
57 | /* Hope -1 isn't a syscall */ | ||
58 | #ifndef __NR_fsopen | ||
59 | #define __NR_fsopen -1 | ||
60 | #endif | ||
61 | #ifndef __NR_fsmount | ||
62 | #define __NR_fsmount -1 | ||
63 | #endif | ||
64 | #ifndef __NR_fsconfig | ||
65 | #define __NR_fsconfig -1 | ||
66 | #endif | ||
67 | #ifndef __NR_move_mount | ||
68 | #define __NR_move_mount -1 | ||
69 | #endif | ||
70 | |||
71 | |||
72 | static inline int fsopen(const char *fs_name, unsigned int flags) | ||
73 | { | ||
74 | return syscall(__NR_fsopen, fs_name, flags); | ||
75 | } | ||
76 | |||
77 | static inline int fsmount(int fsfd, unsigned int flags, unsigned int ms_flags) | ||
78 | { | ||
79 | return syscall(__NR_fsmount, fsfd, flags, ms_flags); | ||
80 | } | ||
81 | |||
82 | static inline int fsconfig(int fsfd, unsigned int cmd, | ||
83 | const char *key, const void *val, int aux) | ||
84 | { | ||
85 | return syscall(__NR_fsconfig, fsfd, cmd, key, val, aux); | ||
86 | } | ||
87 | |||
88 | static inline int move_mount(int from_dfd, const char *from_pathname, | ||
89 | int to_dfd, const char *to_pathname, | ||
90 | unsigned int flags) | ||
91 | { | ||
92 | return syscall(__NR_move_mount, | ||
93 | from_dfd, from_pathname, | ||
94 | to_dfd, to_pathname, flags); | ||
95 | } | ||
96 | |||
97 | #define E_fsconfig(fd, cmd, key, val, aux) \ | ||
98 | do { \ | ||
99 | if (fsconfig(fd, cmd, key, val, aux) == -1) \ | ||
100 | mount_error(fd, key ?: "create"); \ | ||
101 | } while (0) | ||
102 | |||
103 | int main(int argc, char *argv[]) | ||
104 | { | ||
105 | int fsfd, mfd; | ||
106 | |||
107 | /* Mount a publicly available AFS filesystem */ | ||
108 | fsfd = fsopen("afs", 0); | ||
109 | if (fsfd == -1) { | ||
110 | perror("fsopen"); | ||
111 | exit(1); | ||
112 | } | ||
113 | |||
114 | E_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "#grand.central.org:root.cell.", 0); | ||
115 | E_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); | ||
116 | |||
117 | mfd = fsmount(fsfd, 0, MOUNT_ATTR_RDONLY); | ||
118 | if (mfd < 0) | ||
119 | mount_error(fsfd, "fsmount"); | ||
120 | E(close(fsfd)); | ||
121 | |||
122 | if (move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH) < 0) { | ||
123 | perror("move_mount"); | ||
124 | exit(1); | ||
125 | } | ||
126 | |||
127 | E(close(mfd)); | ||
128 | exit(0); | ||
129 | } | ||
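The sequence above is not AFS-specific: any filesystem converted to the new mount API can be driven with the same fsopen()/fsconfig()/fsmount()/move_mount() chain. The sketch below applies it to tmpfs as a self-contained illustration; the "size=16m" option, the /tmp/newmount target and the reliance on the __NR_* numbers coming from the installed kernel headers are assumptions of this example, not part of the sample.

	/* Sketch: mount a fresh tmpfs at /tmp/newmount with the new mount API. */
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <sys/syscall.h>
	#include <linux/mount.h>

	static int xfsopen(const char *fs, unsigned int flags)
	{ return syscall(__NR_fsopen, fs, flags); }
	static int xfsconfig(int fd, unsigned int cmd, const char *key,
			     const void *val, int aux)
	{ return syscall(__NR_fsconfig, fd, cmd, key, val, aux); }
	static int xfsmount(int fd, unsigned int flags, unsigned int attrs)
	{ return syscall(__NR_fsmount, fd, flags, attrs); }
	static int xmove_mount(int ffd, const char *fpath, int tfd,
			       const char *tpath, unsigned int flags)
	{ return syscall(__NR_move_mount, ffd, fpath, tfd, tpath, flags); }

	int main(void)
	{
		int fsfd, mfd;

		fsfd = xfsopen("tmpfs", 0);
		if (fsfd < 0) { perror("fsopen"); exit(1); }

		/* Configure the context, then ask the kernel to create it. */
		if (xfsconfig(fsfd, FSCONFIG_SET_STRING, "size", "16m", 0) < 0 ||
		    xfsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
			perror("fsconfig");
			exit(1);
		}

		mfd = xfsmount(fsfd, 0, 0);	/* detached mount object */
		if (mfd < 0) { perror("fsmount"); exit(1); }
		close(fsfd);

		/* Attach the detached mount at an existing directory. */
		if (xmove_mount(mfd, "", AT_FDCWD, "/tmp/newmount",
				MOVE_MOUNT_F_EMPTY_PATH) < 0) {
			perror("move_mount");
			exit(1);
		}
		close(mfd);
		return 0;
	}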
diff --git a/samples/vfs/test-statx.c b/samples/vfs/test-statx.c new file mode 100644 index 000000000..49c7a46ce --- /dev/null +++ b/samples/vfs/test-statx.c | |||
@@ -0,0 +1,265 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
2 | /* Test the statx() system call. | ||
3 | * | ||
4 | * Note that the output of this program is intended to look like the output of | ||
5 | * /bin/stat where possible. | ||
6 | * | ||
7 | * Copyright (C) 2015 Red Hat, Inc. All Rights Reserved. | ||
8 | * Written by David Howells (dhowells@redhat.com) | ||
9 | */ | ||
10 | |||
11 | #define _GNU_SOURCE | ||
12 | #define _ATFILE_SOURCE | ||
13 | #include <stdio.h> | ||
14 | #include <stdlib.h> | ||
15 | #include <string.h> | ||
16 | #include <unistd.h> | ||
17 | #include <ctype.h> | ||
18 | #include <errno.h> | ||
19 | #include <time.h> | ||
20 | #include <sys/syscall.h> | ||
21 | #include <sys/types.h> | ||
22 | #include <linux/stat.h> | ||
23 | #include <linux/fcntl.h> | ||
24 | #define statx foo | ||
25 | #define statx_timestamp foo_timestamp | ||
26 | struct statx; | ||
27 | struct statx_timestamp; | ||
28 | #include <sys/stat.h> | ||
29 | #undef statx | ||
30 | #undef statx_timestamp | ||
31 | |||
32 | #define AT_STATX_SYNC_TYPE 0x6000 | ||
33 | #define AT_STATX_SYNC_AS_STAT 0x0000 | ||
34 | #define AT_STATX_FORCE_SYNC 0x2000 | ||
35 | #define AT_STATX_DONT_SYNC 0x4000 | ||
36 | |||
37 | #ifndef __NR_statx | ||
38 | #define __NR_statx -1 | ||
39 | #endif | ||
40 | |||
41 | static __attribute__((unused)) | ||
42 | ssize_t statx(int dfd, const char *filename, unsigned flags, | ||
43 | unsigned int mask, struct statx *buffer) | ||
44 | { | ||
45 | return syscall(__NR_statx, dfd, filename, flags, mask, buffer); | ||
46 | } | ||
47 | |||
48 | static void print_time(const char *field, struct statx_timestamp *ts) | ||
49 | { | ||
50 | struct tm tm; | ||
51 | time_t tim; | ||
52 | char buffer[100]; | ||
53 | int len; | ||
54 | |||
55 | tim = ts->tv_sec; | ||
56 | if (!localtime_r(&tim, &tm)) { | ||
57 | perror("localtime_r"); | ||
58 | exit(1); | ||
59 | } | ||
60 | len = strftime(buffer, 100, "%F %T", &tm); | ||
61 | if (len == 0) { | ||
62 | perror("strftime"); | ||
63 | exit(1); | ||
64 | } | ||
65 | printf("%s", field); | ||
66 | fwrite(buffer, 1, len, stdout); | ||
67 | printf(".%09u", ts->tv_nsec); | ||
68 | len = strftime(buffer, 100, "%z", &tm); | ||
69 | if (len == 0) { | ||
70 | perror("strftime2"); | ||
71 | exit(1); | ||
72 | } | ||
73 | fwrite(buffer, 1, len, stdout); | ||
74 | printf("\n"); | ||
75 | } | ||
76 | |||
77 | static void dump_statx(struct statx *stx) | ||
78 | { | ||
79 | char buffer[256], ft = '?'; | ||
80 | |||
81 | printf("results=%x\n", stx->stx_mask); | ||
82 | |||
83 | printf(" "); | ||
84 | if (stx->stx_mask & STATX_SIZE) | ||
85 | printf(" Size: %-15llu", (unsigned long long)stx->stx_size); | ||
86 | if (stx->stx_mask & STATX_BLOCKS) | ||
87 | printf(" Blocks: %-10llu", (unsigned long long)stx->stx_blocks); | ||
88 | printf(" IO Block: %-6llu", (unsigned long long)stx->stx_blksize); | ||
89 | if (stx->stx_mask & STATX_TYPE) { | ||
90 | switch (stx->stx_mode & S_IFMT) { | ||
91 | case S_IFIFO: printf(" FIFO\n"); ft = 'p'; break; | ||
92 | case S_IFCHR: printf(" character special file\n"); ft = 'c'; break; | ||
93 | case S_IFDIR: printf(" directory\n"); ft = 'd'; break; | ||
94 | case S_IFBLK: printf(" block special file\n"); ft = 'b'; break; | ||
95 | case S_IFREG: printf(" regular file\n"); ft = '-'; break; | ||
96 | case S_IFLNK: printf(" symbolic link\n"); ft = 'l'; break; | ||
97 | case S_IFSOCK: printf(" socket\n"); ft = 's'; break; | ||
98 | default: | ||
99 | printf(" unknown type (%o)\n", stx->stx_mode & S_IFMT); | ||
100 | break; | ||
101 | } | ||
102 | } else { | ||
103 | printf(" no type\n"); | ||
104 | } | ||
105 | |||
106 | sprintf(buffer, "%02x:%02x", stx->stx_dev_major, stx->stx_dev_minor); | ||
107 | printf("Device: %-15s", buffer); | ||
108 | if (stx->stx_mask & STATX_INO) | ||
109 | printf(" Inode: %-11llu", (unsigned long long) stx->stx_ino); | ||
110 | if (stx->stx_mask & STATX_NLINK) | ||
111 | printf(" Links: %-5u", stx->stx_nlink); | ||
112 | if (stx->stx_mask & STATX_TYPE) { | ||
113 | switch (stx->stx_mode & S_IFMT) { | ||
114 | case S_IFBLK: | ||
115 | case S_IFCHR: | ||
116 | printf(" Device type: %u,%u", | ||
117 | stx->stx_rdev_major, stx->stx_rdev_minor); | ||
118 | break; | ||
119 | } | ||
120 | } | ||
121 | printf("\n"); | ||
122 | |||
123 | if (stx->stx_mask & STATX_MODE) | ||
124 | printf("Access: (%04o/%c%c%c%c%c%c%c%c%c%c) ", | ||
125 | stx->stx_mode & 07777, | ||
126 | ft, | ||
127 | stx->stx_mode & S_IRUSR ? 'r' : '-', | ||
128 | stx->stx_mode & S_IWUSR ? 'w' : '-', | ||
129 | stx->stx_mode & S_IXUSR ? 'x' : '-', | ||
130 | stx->stx_mode & S_IRGRP ? 'r' : '-', | ||
131 | stx->stx_mode & S_IWGRP ? 'w' : '-', | ||
132 | stx->stx_mode & S_IXGRP ? 'x' : '-', | ||
133 | stx->stx_mode & S_IROTH ? 'r' : '-', | ||
134 | stx->stx_mode & S_IWOTH ? 'w' : '-', | ||
135 | stx->stx_mode & S_IXOTH ? 'x' : '-'); | ||
136 | if (stx->stx_mask & STATX_UID) | ||
137 | printf("Uid: %5d ", stx->stx_uid); | ||
138 | if (stx->stx_mask & STATX_GID) | ||
139 | printf("Gid: %5d\n", stx->stx_gid); | ||
140 | |||
141 | if (stx->stx_mask & STATX_ATIME) | ||
142 | print_time("Access: ", &stx->stx_atime); | ||
143 | if (stx->stx_mask & STATX_MTIME) | ||
144 | print_time("Modify: ", &stx->stx_mtime); | ||
145 | if (stx->stx_mask & STATX_CTIME) | ||
146 | print_time("Change: ", &stx->stx_ctime); | ||
147 | if (stx->stx_mask & STATX_BTIME) | ||
148 | print_time(" Birth: ", &stx->stx_btime); | ||
149 | |||
150 | if (stx->stx_attributes_mask) { | ||
151 | unsigned char bits, mbits; | ||
152 | int loop, byte; | ||
153 | |||
154 | static char attr_representation[64 + 1] = | ||
155 | /* STATX_ATTR_ flags: */ | ||
156 | "????????" /* 63-56 */ | ||
157 | "????????" /* 55-48 */ | ||
158 | "????????" /* 47-40 */ | ||
159 | "????????" /* 39-32 */ | ||
160 | "????????" /* 31-24 0x00000000-ff000000 */ | ||
161 | "????????" /* 23-16 0x00000000-00ff0000 */ | ||
162 | "???me???" /* 15- 8 0x00000000-0000ff00 */ | ||
163 | "?dai?c??" /* 7- 0 0x00000000-000000ff */ | ||
164 | ; | ||
165 | |||
166 | printf("Attributes: %016llx (", | ||
167 | (unsigned long long)stx->stx_attributes); | ||
168 | for (byte = 64 - 8; byte >= 0; byte -= 8) { | ||
169 | bits = stx->stx_attributes >> byte; | ||
170 | mbits = stx->stx_attributes_mask >> byte; | ||
171 | for (loop = 7; loop >= 0; loop--) { | ||
172 | int bit = byte + loop; | ||
173 | |||
174 | if (!(mbits & 0x80)) | ||
175 | putchar('.'); /* Not supported */ | ||
176 | else if (bits & 0x80) | ||
177 | putchar(attr_representation[63 - bit]); | ||
178 | else | ||
179 | putchar('-'); /* Not set */ | ||
180 | bits <<= 1; | ||
181 | mbits <<= 1; | ||
182 | } | ||
183 | if (byte) | ||
184 | putchar(' '); | ||
185 | } | ||
186 | printf(")\n"); | ||
187 | } | ||
188 | } | ||
189 | |||
190 | static void dump_hex(unsigned long long *data, int from, int to) | ||
191 | { | ||
192 | unsigned offset, print_offset = 1, col = 0; | ||
193 | |||
194 | from /= 8; | ||
195 | to = (to + 7) / 8; | ||
196 | |||
197 | for (offset = from; offset < to; offset++) { | ||
198 | if (print_offset) { | ||
199 | printf("%04x: ", offset * 8); | ||
200 | print_offset = 0; | ||
201 | } | ||
202 | printf("%016llx", data[offset]); | ||
203 | col++; | ||
204 | if ((col & 3) == 0) { | ||
205 | printf("\n"); | ||
206 | print_offset = 1; | ||
207 | } else { | ||
208 | printf(" "); | ||
209 | } | ||
210 | } | ||
211 | |||
212 | if (!print_offset) | ||
213 | printf("\n"); | ||
214 | } | ||
215 | |||
216 | int main(int argc, char **argv) | ||
217 | { | ||
218 | struct statx stx; | ||
219 | int ret, raw = 0, atflag = AT_SYMLINK_NOFOLLOW; | ||
220 | |||
221 | unsigned int mask = STATX_BASIC_STATS | STATX_BTIME; | ||
222 | |||
223 | for (argv++; *argv; argv++) { | ||
224 | if (strcmp(*argv, "-F") == 0) { | ||
225 | atflag &= ~AT_STATX_SYNC_TYPE; | ||
226 | atflag |= AT_STATX_FORCE_SYNC; | ||
227 | continue; | ||
228 | } | ||
229 | if (strcmp(*argv, "-D") == 0) { | ||
230 | atflag &= ~AT_STATX_SYNC_TYPE; | ||
231 | atflag |= AT_STATX_DONT_SYNC; | ||
232 | continue; | ||
233 | } | ||
234 | if (strcmp(*argv, "-L") == 0) { | ||
235 | atflag &= ~AT_SYMLINK_NOFOLLOW; | ||
236 | continue; | ||
237 | } | ||
238 | if (strcmp(*argv, "-O") == 0) { | ||
239 | mask &= ~STATX_BASIC_STATS; | ||
240 | continue; | ||
241 | } | ||
242 | if (strcmp(*argv, "-A") == 0) { | ||
243 | atflag |= AT_NO_AUTOMOUNT; | ||
244 | continue; | ||
245 | } | ||
246 | if (strcmp(*argv, "-R") == 0) { | ||
247 | raw = 1; | ||
248 | continue; | ||
249 | } | ||
250 | |||
251 | memset(&stx, 0xbf, sizeof(stx)); | ||
252 | ret = statx(AT_FDCWD, *argv, atflag, mask, &stx); | ||
253 | printf("statx(%s) = %d\n", *argv, ret); | ||
254 | if (ret < 0) { | ||
255 | perror(*argv); | ||
256 | exit(1); | ||
257 | } | ||
258 | |||
259 | if (raw) | ||
260 | dump_hex((unsigned long long *)&stx, 0, sizeof(stx)); | ||
261 | |||
262 | dump_statx(&stx); | ||
263 | } | ||
264 | return 0; | ||
265 | } | ||
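Stripped of the pretty-printing, the interface reduces to one call: request a mask of fields and then trust only the bits that come back set in stx_mask. A minimal sketch that asks for just the birth time is shown below; it issues the syscall directly (assuming __NR_statx is provided by the installed headers) rather than reusing the wrapper defined in the sample.

	/* Sketch: request only the birth time and report whether it was supplied. */
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/stat.h>
	#include <linux/fcntl.h>

	int main(int argc, char **argv)
	{
		struct statx stx;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <path>\n", argv[0]);
			exit(2);
		}
		if (syscall(__NR_statx, AT_FDCWD, argv[1], AT_SYMLINK_NOFOLLOW,
			    STATX_BTIME, &stx) == -1) {
			perror("statx");
			exit(1);
		}
		/* Only fields confirmed in stx_mask are valid. */
		if (stx.stx_mask & STATX_BTIME)
			printf("btime: %lld.%09u\n",
			       (long long)stx.stx_btime.tv_sec,
			       stx.stx_btime.tv_nsec);
		else
			printf("birth time not provided by this filesystem\n");
		return 0;
	}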
diff --git a/samples/watch_queue/.gitignore b/samples/watch_queue/.gitignore new file mode 100644 index 000000000..2aa3c7e56 --- /dev/null +++ b/samples/watch_queue/.gitignore | |||
@@ -0,0 +1 @@ | |||
1 | watch_test | ||
diff --git a/samples/watch_queue/Makefile b/samples/watch_queue/Makefile new file mode 100644 index 000000000..c0db3a6bc --- /dev/null +++ b/samples/watch_queue/Makefile | |||
@@ -0,0 +1,4 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | userprogs-always-y += watch_test | ||
3 | |||
4 | userccflags += -I usr/include | ||
diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c new file mode 100644 index 000000000..8c6cb57d5 --- /dev/null +++ b/samples/watch_queue/watch_test.c | |||
@@ -0,0 +1,186 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* Use watch_queue API to watch for notifications. | ||
3 | * | ||
4 | * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. | ||
5 | * Written by David Howells (dhowells@redhat.com) | ||
6 | */ | ||
7 | |||
8 | #define _GNU_SOURCE | ||
9 | #include <stdbool.h> | ||
10 | #include <stdarg.h> | ||
11 | #include <stdio.h> | ||
12 | #include <stdlib.h> | ||
13 | #include <string.h> | ||
14 | #include <signal.h> | ||
15 | #include <unistd.h> | ||
16 | #include <errno.h> | ||
17 | #include <sys/ioctl.h> | ||
18 | #include <limits.h> | ||
19 | #include <linux/watch_queue.h> | ||
20 | #include <linux/unistd.h> | ||
21 | #include <linux/keyctl.h> | ||
22 | |||
23 | #ifndef KEYCTL_WATCH_KEY | ||
24 | #define KEYCTL_WATCH_KEY -1 | ||
25 | #endif | ||
26 | #ifndef __NR_keyctl | ||
27 | #define __NR_keyctl -1 | ||
28 | #endif | ||
29 | |||
30 | #define BUF_SIZE 256 | ||
31 | |||
32 | static long keyctl_watch_key(int key, int watch_fd, int watch_id) | ||
33 | { | ||
34 | return syscall(__NR_keyctl, KEYCTL_WATCH_KEY, key, watch_fd, watch_id); | ||
35 | } | ||
36 | |||
37 | static const char *key_subtypes[256] = { | ||
38 | [NOTIFY_KEY_INSTANTIATED] = "instantiated", | ||
39 | [NOTIFY_KEY_UPDATED] = "updated", | ||
40 | [NOTIFY_KEY_LINKED] = "linked", | ||
41 | [NOTIFY_KEY_UNLINKED] = "unlinked", | ||
42 | [NOTIFY_KEY_CLEARED] = "cleared", | ||
43 | [NOTIFY_KEY_REVOKED] = "revoked", | ||
44 | [NOTIFY_KEY_INVALIDATED] = "invalidated", | ||
45 | [NOTIFY_KEY_SETATTR] = "setattr", | ||
46 | }; | ||
47 | |||
48 | static void saw_key_change(struct watch_notification *n, size_t len) | ||
49 | { | ||
50 | struct key_notification *k = (struct key_notification *)n; | ||
51 | |||
52 | if (len != sizeof(struct key_notification)) { | ||
53 | fprintf(stderr, "Incorrect key message length\n"); | ||
54 | return; | ||
55 | } | ||
56 | |||
57 | printf("KEY %08x change=%u[%s] aux=%u\n", | ||
58 | k->key_id, n->subtype, key_subtypes[n->subtype], k->aux); | ||
59 | } | ||
60 | |||
61 | /* | ||
62 | * Consume and display events. | ||
63 | */ | ||
64 | static void consumer(int fd) | ||
65 | { | ||
66 | unsigned char buffer[433], *p, *end; | ||
67 | union { | ||
68 | struct watch_notification n; | ||
69 | unsigned char buf1[128]; | ||
70 | } n; | ||
71 | ssize_t buf_len; | ||
72 | |||
73 | for (;;) { | ||
74 | buf_len = read(fd, buffer, sizeof(buffer)); | ||
75 | if (buf_len == -1) { | ||
76 | perror("read"); | ||
77 | exit(1); | ||
78 | } | ||
79 | |||
80 | if (buf_len == 0) { | ||
81 | printf("-- END --\n"); | ||
82 | return; | ||
83 | } | ||
84 | |||
85 | if (buf_len > sizeof(buffer)) { | ||
86 | fprintf(stderr, "Read buffer overrun: %zd\n", buf_len); | ||
87 | return; | ||
88 | } | ||
89 | |||
90 | printf("read() = %zd\n", buf_len); | ||
91 | |||
92 | p = buffer; | ||
93 | end = buffer + buf_len; | ||
94 | while (p < end) { | ||
95 | size_t largest, len; | ||
96 | |||
97 | largest = end - p; | ||
98 | if (largest > 128) | ||
99 | largest = 128; | ||
100 | if (largest < sizeof(struct watch_notification)) { | ||
101 | fprintf(stderr, "Short message header: %zu\n", largest); | ||
102 | return; | ||
103 | } | ||
104 | memcpy(&n, p, largest); | ||
105 | |||
106 | printf("NOTIFY[%03zx]: ty=%06x sy=%02x i=%08x\n", | ||
107 | p - buffer, n.n.type, n.n.subtype, n.n.info); | ||
108 | |||
109 | len = n.n.info & WATCH_INFO_LENGTH; | ||
110 | if (len < sizeof(n.n) || len > largest) { | ||
111 | fprintf(stderr, "Bad message length: %zu/%zu\n", len, largest); | ||
112 | exit(1); | ||
113 | } | ||
114 | |||
115 | switch (n.n.type) { | ||
116 | case WATCH_TYPE_META: | ||
117 | switch (n.n.subtype) { | ||
118 | case WATCH_META_REMOVAL_NOTIFICATION: | ||
119 | printf("REMOVAL of watchpoint %08x\n", | ||
120 | (n.n.info & WATCH_INFO_ID) >> | ||
121 | WATCH_INFO_ID__SHIFT); | ||
122 | break; | ||
123 | case WATCH_META_LOSS_NOTIFICATION: | ||
124 | printf("-- LOSS --\n"); | ||
125 | break; | ||
126 | default: | ||
127 | printf("other meta record\n"); | ||
128 | break; | ||
129 | } | ||
130 | break; | ||
131 | case WATCH_TYPE_KEY_NOTIFY: | ||
132 | saw_key_change(&n.n, len); | ||
133 | break; | ||
134 | default: | ||
135 | printf("other type\n"); | ||
136 | break; | ||
137 | } | ||
138 | |||
139 | p += len; | ||
140 | } | ||
141 | } | ||
142 | } | ||
143 | |||
144 | static struct watch_notification_filter filter = { | ||
145 | .nr_filters = 1, | ||
146 | .filters = { | ||
147 | [0] = { | ||
148 | .type = WATCH_TYPE_KEY_NOTIFY, | ||
149 | .subtype_filter[0] = UINT_MAX, | ||
150 | }, | ||
151 | }, | ||
152 | }; | ||
153 | |||
154 | int main(int argc, char **argv) | ||
155 | { | ||
156 | int pipefd[2], fd; | ||
157 | |||
158 | if (pipe2(pipefd, O_NOTIFICATION_PIPE) == -1) { | ||
159 | perror("pipe2"); | ||
160 | exit(1); | ||
161 | } | ||
162 | fd = pipefd[0]; | ||
163 | |||
164 | if (ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, BUF_SIZE) == -1) { | ||
165 | perror("watch_queue(size)"); | ||
166 | exit(1); | ||
167 | } | ||
168 | |||
169 | if (ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter) == -1) { | ||
170 | perror("watch_queue(filter)"); | ||
171 | exit(1); | ||
172 | } | ||
173 | |||
174 | if (keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fd, 0x01) == -1) { | ||
175 | perror("keyctl"); | ||
176 | exit(1); | ||
177 | } | ||
178 | |||
179 | if (keyctl_watch_key(KEY_SPEC_USER_KEYRING, fd, 0x02) == -1) { | ||
180 | perror("keyctl"); | ||
181 | exit(1); | ||
182 | } | ||
183 | |||
184 | consumer(fd); | ||
185 | exit(0); | ||
186 | } | ||
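The key-watching part of the sample is separable from the general notification-pipe setup: the pipe is created with pipe2(O_NOTIFICATION_PIPE), sized with IOC_WATCH_QUEUE_SET_SIZE and filtered with IOC_WATCH_QUEUE_SET_FILTER before any source is attached. The sketch below shows only that setup, stopping before keyctl_watch_key(); the slot count of 256 matches BUF_SIZE above but is otherwise an arbitrary choice.

	/* Sketch: create and configure a notification pipe, attach nothing yet. */
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <limits.h>
	#include <sys/ioctl.h>
	#include <linux/watch_queue.h>

	/* Statically allocated, as in the sample, so the flexible filters[]
	 * array can be initialized. */
	static struct watch_notification_filter filter = {
		.nr_filters = 1,
		.filters = {
			[0] = {
				.type = WATCH_TYPE_KEY_NOTIFY,
				.subtype_filter[0] = UINT_MAX,
			},
		},
	};

	int main(void)
	{
		int pipefd[2];

		if (pipe2(pipefd, O_NOTIFICATION_PIPE) == -1) {
			perror("pipe2");
			exit(1);
		}
		/* Reserve room for 256 notification slots in the queue. */
		if (ioctl(pipefd[0], IOC_WATCH_QUEUE_SET_SIZE, 256) == -1) {
			perror("IOC_WATCH_QUEUE_SET_SIZE");
			exit(1);
		}
		/* Pass only key notifications through; everything else is dropped. */
		if (ioctl(pipefd[0], IOC_WATCH_QUEUE_SET_FILTER, &filter) == -1) {
			perror("IOC_WATCH_QUEUE_SET_FILTER");
			exit(1);
		}
		printf("notification pipe ready on fd %d\n", pipefd[0]);
		return 0;
	}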
diff --git a/samples/watchdog/.gitignore b/samples/watchdog/.gitignore new file mode 100644 index 000000000..74153b831 --- /dev/null +++ b/samples/watchdog/.gitignore | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | watchdog-simple | ||
diff --git a/samples/watchdog/Makefile b/samples/watchdog/Makefile new file mode 100644 index 000000000..ab39d23dc --- /dev/null +++ b/samples/watchdog/Makefile | |||
@@ -0,0 +1,2 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | ||
2 | userprogs-always-y += watchdog-simple | ||
diff --git a/samples/watchdog/watchdog-simple.c b/samples/watchdog/watchdog-simple.c new file mode 100644 index 000000000..9ce66d2ca --- /dev/null +++ b/samples/watchdog/watchdog-simple.c | |||
@@ -0,0 +1,25 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include <stdio.h> | ||
3 | #include <stdlib.h> | ||
4 | #include <unistd.h> | ||
5 | #include <fcntl.h> | ||
6 | |||
7 | int main(void) | ||
8 | { | ||
9 | int fd = open("/dev/watchdog", O_WRONLY); | ||
10 | int ret = 0; | ||
11 | if (fd == -1) { | ||
12 | perror("watchdog"); | ||
13 | exit(EXIT_FAILURE); | ||
14 | } | ||
15 | while (1) { | ||
16 | ret = write(fd, "\0", 1); | ||
17 | if (ret != 1) { | ||
18 | ret = -1; | ||
19 | break; | ||
20 | } | ||
21 | sleep(10); | ||
22 | } | ||
23 | close(fd); | ||
24 | return ret; | ||
25 | } | ||
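watchdog-simple.c only ever pings the device, so once started the timer can no longer be stopped cleanly. On drivers that honour the magic-close convention, a slightly fuller pattern sets a timeout with WDIOC_SETTIMEOUT, pings with WDIOC_KEEPALIVE and writes a 'V' before close() so the watchdog is disarmed on an orderly exit. The sketch below illustrates that pattern; the 30-second timeout and six-iteration loop are arbitrary, and whether the magic close actually disarms the timer depends on the driver and on CONFIG_WATCHDOG_NOWAYOUT.

	/*
	 * Sketch: watchdog loop with an explicit timeout and a "magic close"
	 * so the timer is disarmed on clean exit (driver permitting).
	 */
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/watchdog.h>

	int main(void)
	{
		int fd = open("/dev/watchdog", O_WRONLY);
		int timeout = 30;	/* seconds; arbitrary for this sketch */
		int i;

		if (fd == -1) {
			perror("watchdog");
			exit(EXIT_FAILURE);
		}
		if (ioctl(fd, WDIOC_SETTIMEOUT, &timeout) == 0)
			printf("timeout set to %d seconds\n", timeout);

		for (i = 0; i < 6; i++) {	/* ping for about a minute, then stop */
			ioctl(fd, WDIOC_KEEPALIVE, 0);
			sleep(10);
		}

		/* Magic close: tell the driver this is an orderly shutdown. */
		if (write(fd, "V", 1) != 1)
			perror("magic close");
		close(fd);
		return 0;
	}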