From fc61a4a525846fa31ee2288df4e82f745bb39c95 Mon Sep 17 00:00:00 2001 From: We-unite <3205135446@qq.com> Date: Tue, 23 Jul 2024 19:32:09 +0800 Subject: Try to fix the out-of-order bug, add EXECVE to it The most important work during this time was finding a solution to the out-of-order bug. To describe it in detail: messages from audit may arrive out of order, which means a fork may come after the execve, or even after the exit. What an absurd phenomenon, to see a process that has not yet been created do work or exit! To deal with this problem, I've tried several ways: - In the 2nd coroutine, when an EOE msg comes, if it's a fork/clone event, send it immediately; otherwise wait for some time (such as 100 ms). But this adds delay overall, and has other problems. - The 2nd coroutine doesn't send directly, but records all the finished event ids in a slice, and another thread checks once every second; if there is something in the slice, it sends the corresponding events in order of event id. But: the event that happens first doesn't always have the lower id or time. For example, 1 forks 2, then 2 calls execve; the audit in the kernel itself may get the execve before the fork (maybe fork makes other settings), which means the execve has an earlier timestamp and a lower event id. The out-of-order problem is not completely resolved. If we then add delays to non-clone events, a more serious problem happens: we must use a mutex to lock the slice recording finished event ids to prevent clashes between the send thread and the wait thread, but the wait thread can't get the mutex again, because there are too many clone events and sends are too frequent! - So I use no delay but mongodb: when an execve comes, if the pid is not recorded, just insert it and wait for the fork. It does work, but some other work is still left to do: - What should I do if "2 forks 3" comes before "1 forks 2"? For now I assume it doesn't happen, but what if? - When an execve comes before the fork, I record it; but if this process turns out to have a parent I don't care about, should the record be deleted, or stay there? 
Also, as mentioned above, I've added an EXECVE field to the process document in the db, which records all the execve calls (time and args) from the same process. Besides, exit_timestamp and exit_code can be caught now, but too many processes have no exit info. This is also to be fixed. Now, let's listen for the files changed by each process. Don't forget the to-do items listed above! --- src/basefunc.go | 40 ++++++++----- src/deal.go | 173 +++++++++++++++++++++++++++++++++----------------------- src/global.go | 2 +- src/godo.go | 18 ------ src/mongo.go | 79 ++++++++++++++++++++++++++ src/organize.go | 79 +++++++++++++++++++------- 6 files changed, 266 insertions(+), 125 deletions(-) create mode 100644 src/mongo.go diff --git a/src/basefunc.go b/src/basefunc.go index 5fff3e8..2f39507 100644 --- a/src/basefunc.go +++ b/src/basefunc.go @@ -4,32 +4,46 @@ import ( "bufio" "fmt" "os" + "os/exec" "path/filepath" - "regexp" "strconv" "strings" "time" ) func figureOutSyscalls() error { - NRRegex := regexp.MustCompile(`#define __NR_(.*?) (\d+)$`) - file, err := os.Open("/usr/include/asm/unistd_64.h") + cmd := exec.Command("ausyscall", "--dump") + stdout, err := cmd.StdoutPipe() if err != nil { return err } - defer file.Close() - scanner := bufio.NewScanner(file) - for scanner.Scan() { + if err := cmd.Start(); err != nil { + return err + } + + scanner := bufio.NewScanner(stdout) + for i := 0; scanner.Scan(); i++ { + if i == 0 { + continue + } line := scanner.Text() - if NRRegex.MatchString(line) { - match := NRRegex.FindStringSubmatch(line) - num, err := strconv.Atoi(match[2]) - if err != nil { - return err - } - syscallTable[num] = match[1] + parts := strings.Split(line, "\t") + if len(parts) != 2 { + return fmt.Errorf("invalid ausyscall format") } + num, err := strconv.Atoi(parts[0]) + if err != nil { + return err + } + syscallTable[num] = parts[1] + } + + if err := scanner.Err(); err != nil { + return err + } + if err := cmd.Wait(); err != nil { + return err } return nil } diff --git a/src/deal.go b/src/deal.go 
index 118d914..783dab8 100644 --- a/src/deal.go +++ b/src/deal.go @@ -1,51 +1,48 @@ package main import ( - "context" "fmt" + "sync" "time" "go.mongodb.org/mongo-driver/bson" - "go.mongodb.org/mongo-driver/mongo" - "go.mongodb.org/mongo-driver/mongo/options" ) const ( - dbName string = "test" - colName string = "pids" + dbName string = "test" + pidColName string = "pids" ) +var mongoMutex sync.Mutex +var pidCol mongoClient + func deal() { defer wg.Done() var cooked Event var ok bool var err error - var mongo *mongo.Client var res []bson.M - mongo, err = connect() - if err != nil { - fmt.Printf("Err connecting the mongodb: %v\n", err) + if err = pidCol.Connect(dbName, pidColName); err != nil { + fmt.Printf("Error connecting the mongodb: %v\n", err) } - pidCol := mongo.Database(dbName).Collection(colName) - - err = pidCol.Drop(context.Background()) - if err != nil { - fmt.Printf("Err drop: %v\n", err) + if err = pidCol.Drop(); err != nil { + fmt.Printf("Error drop the mongodb: %v\n", err) } - _, err = pidCol.InsertOne(context.Background(), bson.M{ - "ppid": 1, - "pid": containerdPid, - "cwd": "/", + err = pidCol.InsertOne(bson.M{ + "ppid": 1, + "pid": containerdPid, + "cwd": "/", + "children": bson.M{}, }) if err != nil { fmt.Printf("Err containerd: %v", err) return } - fmt.Printf("Containerd: %d\n", containerdPid) + defer pidCol.Disconnect() for { cooked, ok = <-cookedChan @@ -54,81 +51,115 @@ func deal() { } switch syscallTable[cooked.syscall] { - case "fork", "vfork", "clone": + case "clone": // 有无父进程在观察中 - res, err = findDocuments(mongo, "test", "pids", bson.M{"pid": cooked.ppid}) + res, err = pidCol.Finddoc(bson.M{"pid": cooked.ppid}) if err != nil || len(res) != 1 { break } // 自身是否已经记录 - res, err = findDocuments(mongo, "test", "pids", bson.M{"pid": cooked.pid}) + res, err = pidCol.Finddoc(bson.M{"pid": cooked.pid}) if err != nil { fmt.Printf("Err finding: %v\n", err) break - } else if len(res) != 0 { - fmt.Printf("Err inserting pid %v: already in db: %v\n", 
cooked.pid, res) - break - } - - doc := []bson.A{} - for _, str := range cooked.argv { - doc = append(doc, bson.A{str}) } - _, err := pidCol.InsertOne(context.Background(), bson.M{ - "timestamp": cooked.timestamp, - "ppid": cooked.ppid, - "pid": cooked.pid, - "cwd": cooked.cwd, - "args": doc, - "children": []bson.M{}, - }) - if err != nil { - fmt.Printf("Err insert: %v\n", err) + mongoMutex.Lock() + if len(res) != 0 { + // 进程原本就存在,换言之别的消息先到了 + // 所有先行抵达的消息必须保留execve/children字段 + // 此处不再更新 + // 以防把原有信息更没了 + pidCol.UpdateOne(bson.M{"pid": cooked.pid}, bson.M{ + "start_timestamp": cooked.timestamp, + "ppid": cooked.ppid, + "pid": cooked.pid, + "cwd": cooked.cwd, + // "execve": []bson.M{}, + "args": cooked.argv, + // "children": []bson.M{}, + }) + } else { + // 这进程本是新修的 + pidCol.InsertOne(bson.M{ + "start_timestamp": cooked.timestamp, + "ppid": cooked.ppid, + "pid": cooked.pid, + "cwd": cooked.cwd, + "execve": []bson.M{}, + "args": cooked.argv, + "children": []bson.M{}, + }) } - _, err = pidCol.UpdateOne(context.Background(), bson.M{"pid": cooked.pid}, bson.M{ + pidCol.UpdateOne(bson.M{"pid": cooked.ppid}, bson.M{ "$push": bson.M{ "children": cooked.pid, }, }) + mongoMutex.Unlock() + case "execve": + // 父进程在不在?不在扔 + res, err = pidCol.Finddoc(bson.M{"pid": cooked.ppid}) + if err != nil || len(res) != 1 { + break + } + + // 首先检查进程是否存在,如不存在则为之创建 + res, err = pidCol.Finddoc(bson.M{"pid": cooked.pid}) if err != nil { - fmt.Printf("Err insert: %v\n", err) + break + } + mongoMutex.Lock() + if len(res) == 1 { + // 自身已在,直接记录 + pidCol.UpdateOne(bson.M{"pid": cooked.pid}, bson.M{ + "$push": bson.M{ + "execve": bson.M{ + "timestamp": cooked.timestamp, + "args": cooked.argv, + }, + }, + }) + } else { + // 先fork抵达,插入 + pidCol.InsertOne(bson.M{ + "children": []bson.M{}, + "execve": []bson.M{ + { + "timestamp": cooked.timestamp, + "execve": cooked.argv, + }, + }, + }) } + mongoMutex.Unlock() case "exit", "exit_group": - // TODO: 记得补全退出逻辑 - // 上哪找exit code呢? 
+ go deletePid(cooked) } } } -func connect() (*mongo.Client, error) { - client, err := mongo.NewClient(options.Client().ApplyURI("mongodb://localhost:27017")) - - if err != nil { - return nil, err - } - - ctx, _ := context.WithTimeout(context.Background(), 10*time.Second) - err = client.Connect(ctx) - - if err != nil { - return nil, err - } - - return client, nil -} - -func findDocuments(client *mongo.Client, dbName, colName string, filter bson.M) ([]bson.M, error) { - collection := client.Database(dbName).Collection(colName) - - cur, err := collection.Find(context.Background(), filter) - if err != nil { - return nil, err - } +func deletePid(cooked Event) { + time.Sleep(1 * time.Second) + mongoMutex.Lock() + // 先从老爹那里销户 + pidCol.UpdateOne(bson.M{"pid": cooked.ppid}, bson.M{ + "$pull": bson.M{ + "children": cooked.pid, + }, + }) - var results []bson.M - err = cur.All(context.Background(), &results) + // 孩子们需要收容 + // 不必到children里一个个找,直接看ppid即可 + pidCol.UpdateMany(bson.M{"ppid": cooked.pid}, bson.M{"ppid": 1}) - return results, err + // 可以去死了 + pidCol.UpdateOne(bson.M{"pid": cooked.pid}, bson.M{ + "$set": bson.M{ + "exit_timestamp": cooked.timestamp, + "exit_code": cooked.exit_code, + }, + }) + mongoMutex.Unlock() } diff --git a/src/global.go b/src/global.go index 0439df6..c3001ab 100644 --- a/src/global.go +++ b/src/global.go @@ -9,6 +9,7 @@ type Event struct { timestamp time.Time pid, ppid int syscall int + exit_code uint64 argc int argv []string cwd string @@ -23,7 +24,6 @@ type process struct { children []int } -var pids sync.Map // 古希腊掌管进程的神,int->*process var wg sync.WaitGroup // 掌管协程 var rawChan chan interface{} // 从接收到整理的管道 var cookedChan chan Event // 整理好的信息的管道 diff --git a/src/godo.go b/src/godo.go index 72f68c0..cc29a01 100644 --- a/src/godo.go +++ b/src/godo.go @@ -51,26 +51,8 @@ func main() { } // 创世之神,1号进程 - // pids[1] = &process{rootfs: "/", children: make([]int, 0)} - // pids[1].children = append(pids[1].children, containerdPid) // 
1号进程还是不要在进程树上直接出现了,不然它的小儿子们都会出现 - // /usr/bin/containerd,也就是我们最关注的进程 - // pids[containerdPid] = &process{rootfs: "/", children: make([]int, 0)} - pids.Store(containerdPid, &process{ - ppid: 1, - pid: containerdPid, - argv: make([]string, 0), - cwd: "/", - rootfs: "/", - children: make([]int, 0), - }) - p, ok := pids.Load(containerdPid) - if !ok { - fmt.Printf("???\n") - return - } - p.(*process).argv = append(p.(*process).argv, "/usr/bin/containerd") // 开始运行,解析命令行参数后监听 if err := fs.Parse(os.Args[1:]); err != nil { diff --git a/src/mongo.go b/src/mongo.go new file mode 100644 index 0000000..d00abd2 --- /dev/null +++ b/src/mongo.go @@ -0,0 +1,79 @@ +package main + +import ( + "context" + "time" + + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/mongo" + "go.mongodb.org/mongo-driver/mongo/options" +) + +type mongoClient struct { + dbName, colName string + client *mongo.Client + col *mongo.Collection +} + +func (mc *mongoClient) Connect(dbName, colName string) error { + var err error + mc.client, err = mongo.NewClient(options.Client().ApplyURI("mongodb://localhost:27017")) + + if err != nil { + return err + } + + ctx, _ := context.WithTimeout(context.Background(), 10*time.Second) + err = mc.client.Connect(ctx) + if err != nil { + return err + } + + mc.col = mc.client.Database(dbName).Collection(colName) + mc.dbName = dbName + mc.colName = colName + return nil +} + +func (mc *mongoClient) InsertOne(document interface{}) error { + _, err := mc.col.InsertOne(context.Background(), document) + return err +} + +func (mc *mongoClient) UpdateOne(filter, update interface{}) error { + _, err := mc.col.UpdateOne(context.Background(), filter, update) + return err +} + +func (mc *mongoClient) UpdateMany(filter, update interface{}) error { + _, err := mc.col.UpdateMany(context.Background(), filter, update) + return err +} + +func (mc *mongoClient) Finddoc(filter bson.M) ([]bson.M, error) { + cur, err := mc.col.Find(context.Background(), filter) + if err != nil { + 
return nil, err + } + + var results []bson.M + err = cur.All(context.Background(), &results) + + return results, err +} + +func (mc *mongoClient) Drop() error { + return mc.col.Drop(context.Background()) +} + +func (mc *mongoClient) Disconnect() error { + err := mc.client.Disconnect(context.Background()) + if err != nil { + return err + } + mc.col = nil + mc.client = nil + mc.dbName = "" + mc.colName = "" + return nil +} diff --git a/src/organize.go b/src/organize.go index bb6736a..d963288 100644 --- a/src/organize.go +++ b/src/organize.go @@ -1,9 +1,11 @@ package main import ( + "fmt" "regexp" "strconv" "strings" + "sync" "github.com/elastic/go-libaudit/v2" "github.com/elastic/go-libaudit/v2/auparse" @@ -19,16 +21,20 @@ func orgnaze() { // 事件信息 var eventId, argc int var err [6]error - var event, cooked Event + var event Event + var pEvent *Event + var tmp any // 为每个事务id存储其信息,事务id在操作系统运行期间是唯一的 - eventTable := make(map[int]*Event) + var eventTable sync.Map + // 要用的正则匹配列表 - syscallRegex := regexp.MustCompile(`audit\((\d+\.\d+):(\d+)\).*?syscall=(\d+).*?(exit=([-+]?\d+).*?)?ppid=(\d+) pid=(\d+).*?$`) + syscallRegex := regexp.MustCompile(`audit\((\d+\.\d+):(\d+)\).*?syscall=(\d+).*?(exit=([-+]?\d+))? 
a0=([0-9a-fA-F]+).*?ppid=(\d+) pid=(\d+).*?$`) execveRegex := regexp.MustCompile(`audit\(\d+\.\d+:(\d+)\): argc=(\d+)`) argsRegex := regexp.MustCompile(`a\d+=("(.*?)"|([0-9a-fA-F]+))`) cwdRegex := regexp.MustCompile(`audit\(\d+\.\d+:(\d+)\): cwd="(.*?)"`) proctitleRegex := regexp.MustCompile(`audit\(\d+\.\d+:(\d+)\): proctitle=("(.*?)"|([0-9a-fA-F]+))$`) eoeRegex := regexp.MustCompile(`audit\(\d+\.\d+:(\d+)\)`) + for { raw, ok = <-rawChan if !ok { @@ -44,39 +50,53 @@ func orgnaze() { eventId, err[1] = strconv.Atoi(string(match[2])) event.syscall, err[2] = strconv.Atoi(string(match[3])) var exit int - // exit, err[3] = strconv.Atoi(string(match[4])) + var a0 uint64 if string(match[5]) == "" { // exit没捕获到 exit = 0 } else { exit, err[3] = strconv.Atoi(string(match[5])) } - event.ppid, err[4] = strconv.Atoi(string(match[5])) - event.pid, err[5] = strconv.Atoi(string(match[6])) + if string(match[6]) == "" { + a0 = 0 + } else { + // 系统调用的第一个参数 + // exit和exit_group都是syscall_define1,只有一个参数 + // fork没参数,clone几个参数不重要 + // execve三个参数咱也不关心 + // 所以看第一个就够了 + a0, err[4] = strconv.ParseUint(string(match[6]), 16, 64) + } + event.ppid, err[4] = strconv.Atoi(string(match[7])) + event.pid, err[5] = strconv.Atoi(string(match[8])) if syscallTable[event.syscall] == "clone" { - if exit == 0 { + if exit == 0 || event.pid > exit { + // exit=0是给新进程的返回,没用 + // pid>exit,证明有问题,抛弃 break } else { - eventTable[eventId] = &Event{ + eventTable.Store(eventId, &Event{ timestamp: event.timestamp, syscall: event.syscall, + exit_code: 0, ppid: event.pid, pid: exit, argc: 0, argv: make([]string, 0), cwd: "", - } + }) } } else { - eventTable[eventId] = &Event{ + eventTable.Store(eventId, &Event{ timestamp: event.timestamp, syscall: event.syscall, + exit_code: a0, ppid: event.ppid, pid: event.pid, argc: 0, argv: make([]string, 0), cwd: "", - } + }) } } case auparse.AUDIT_EXECVE: @@ -84,34 +104,45 @@ func orgnaze() { match := execveRegex.FindSubmatch(rawEvent.Data) eventId, err[0] = 
strconv.Atoi(string(match[1])) argc, err[1] = strconv.Atoi(string(match[2])) + tmp, ok = eventTable.Load(eventId) + if !ok { + break + } + pEvent = tmp.(*Event) if err[0] == nil && err[1] == nil && argsRegex.Match(rawEvent.Data) { match := argsRegex.FindAllSubmatch(rawEvent.Data, -1) for i := 0; i < argc; i++ { if len(match[i][2]) == 0 { // 代表着匹配到的是十六进制数 str := hexToAscii(string(match[i][3])) - eventTable[eventId].argv = append(eventTable[eventId].argv, str) + pEvent.argv = append(pEvent.argv, str) } else { - eventTable[eventId].argv = append(eventTable[eventId].argv, string(match[i][2])) + pEvent.argv = append(pEvent.argv, string(match[i][2])) } } - eventTable[eventId].argc = argc + pEvent.argc = argc } } - // case auparse.AUDIT_PATH: case auparse.AUDIT_CWD: if cwdRegex.Match(rawEvent.Data) { match := cwdRegex.FindSubmatch(rawEvent.Data) eventId, err[0] = strconv.Atoi(string(match[1])) - eventTable[eventId].cwd = string(match[2]) + tmp, ok = eventTable.Load(eventId) + if !ok { + break + } + tmp.(*Event).cwd = string(match[2]) } case auparse.AUDIT_PROCTITLE: if proctitleRegex.Match(rawEvent.Data) { var cmdline string - var pEvent *Event match := proctitleRegex.FindSubmatch(rawEvent.Data) eventId, err[0] = strconv.Atoi(string(match[1])) - pEvent = eventTable[eventId] + tmp, ok = eventTable.Load(eventId) + if !ok { + break + } + pEvent = tmp.(*Event) if pEvent.argc == 0 { // 只有等于0,才证明没经过EXECVE提取参数,才允许使用PROCTITLE提取参数 if match[3] == nil { @@ -121,17 +152,21 @@ func orgnaze() { cmdline = string(match[3]) } pEvent.argv = strings.Split(cmdline, " ") - pEvent.argc = len(eventTable[eventId].argv) + pEvent.argc = len(pEvent.argv) } } case auparse.AUDIT_EOE: if eoeRegex.Match(rawEvent.Data) { match := eoeRegex.FindSubmatch(rawEvent.Data) eventId, err[0] = strconv.Atoi(string(match[1])) - // ATTENTION: 事件整理完毕,即刻发出,是否合理呢? 
- cooked = *eventTable[eventId] // 应当采用深拷贝吗?有待实验 + tmp, ok = eventTable.Load(eventId) + if !ok { + break + } + cooked := *(tmp.(*Event)) cookedChan <- cooked - delete(eventTable, eventId) //发出之后就从信息表扔掉,死人别占地 + eventTable.Delete(eventId) // 死人别占地 + fmt.Printf("%d: %3d %6d %6d\n", eventId, cooked.syscall, cooked.ppid, cooked.pid) } default: // ATTENTION: 这里也需要做防护 -- cgit v1.2.3-70-g09d2