Android lmkd机制详解

一、lmkd介绍

二、lmkd实现原理

2.1 工作原理图

2.2 初始化

2.3 oom_adj获取

2.4 监听psi事件及处理

2.5 进程选取与查杀

2.5.1 进程选取

2.5.2 进程查杀

三、关键系统属性

四、核心数据结构

五、代码时序

一、lmkd介绍

Android lmkd采用epoll方式监听linux内核psi内存压力等级的触发事件，并根据psi内存压力等级及进程优先级（oom_adj）来选择目标进程并查杀，缓解系统内存的压力。

二、lmkd实现原理

2.1 工作原理图

工作原理主要分为3部分：

1）app启动时，AMS将oom_adj并通过socket发送给lmkd进程，并由lmkd存起来

2）lmkd进程进入循环监听kernel psi事件

3）当发生psi事件时，根据lmkd策略进行进程查杀

2.2 初始化

在lmkd启动时，会进行初始化。主要做了两个事：

1）启动 lmkd socket，进入监听，等待client端连接

2）初始化psi monitor

static int init(void) {

    ...
    update_psi_window_size();
    ...
    // 1.设置并监听 lmkd 控制套接字，以便处理来自客户端的连接请求
    ctrl_sock.sock = android_get_control_socket("lmkd");
    listen(ctrl_sock.sock, MAX_DATA_CONN);
    ctrl_sock.handler_info.handler = ctrl_connect_handler;
    epev.data.ptr = (void *)&(ctrl_sock.handler_info);
    epoll_ctl(epollfd, EPOLL_CTL_ADD, ctrl_sock.sock, &epev);
    ...
    // 2.初始化psi monitor
    init_monitors();
    // 3. 初始化reaper，创建杀进程的异步线程
    init_reaper();
    ...
}

具体看如何进行psi poll监听及注册回调函数：

static bool init_monitors() {
    /* Try to use psi monitor first if kernel has it */
    // 采用psi内存压力监听---新策略
    use_psi_monitors = GET_LMK_PROPERTY(bool, "use_psi", true) &&
        init_psi_monitors();
     // vmpressure老策略，不会走
    /* Fall back to vmpressure */
    if (!use_psi_monitors &&
        (!init_mp_common(VMPRESS_LEVEL_LOW) ||
        !init_mp_common(VMPRESS_LEVEL_MEDIUM) ||
        !init_mp_common(VMPRESS_LEVEL_CRITICAL))) {
        ALOGE("Kernel does not support memory pressure events or in-kernel low memory killer");
        return false;
    }
    ...
}

static bool init_psi_monitors() {
    /*
     * When PSI is used on low-ram devices or on high-end devices without memfree levels
     * use new kill strategy based on zone watermarks, free swap and thrashing stats.
     * Also use the new strategy if memcg has not been mounted in the v1 cgroups hiearchy since
     * the old strategy relies on memcg attributes that are available only in the v1 cgroups
     * hiearchy.
     */
    bool use_new_strategy =
        GET_LMK_PROPERTY(bool, "use_new_strategy", low_ram_device || !use_minfree_levels);
    if (force_use_old_strategy) {
        use_new_strategy = false;
    }
    
    /* In default PSI mode override stall amounts using system properties */
    if (use_new_strategy) {
        /* Do not use low pressure level */
        psi_thresholds[VMPRESS_LEVEL_LOW].threshold_ms = 0;
        psi_thresholds[VMPRESS_LEVEL_MEDIUM].threshold_ms = psi_partial_stall_ms;
        psi_thresholds[VMPRESS_LEVEL_CRITICAL].threshold_ms = psi_complete_stall_ms;

    } else {
        psi_thresholds[VMPRESS_LEVEL_LOW].threshold_ms = PSI_OLD_LOW_THRESH_MS;
        psi_thresholds[VMPRESS_LEVEL_MEDIUM].threshold_ms = PSI_OLD_MED_THRESH_MS;
        psi_thresholds[VMPRESS_LEVEL_CRITICAL].threshold_ms = PSI_OLD_CRIT_THRESH_MS;
    }
    // 分别对VMPRESS_LEVEL_LOW、VMPRESS_LEVEL_MEDIUM、VMPRESS_LEVEL_CRITICAL、VMPRESS_LEVEL_SUPER_CRITICAL
    // 压力等级的节点
    init_mp_psi(VMPRESS_LEVEL_LOW, use_new_strategy);
    init_mp_psi(VMPRESS_LEVEL_MEDIUM, use_new_strategy);
    init_mp_psi(VMPRESS_LEVEL_CRITICAL, use_new_strategy));
    init_mp_psi(VMPRESS_LEVEL_SUPER_CRITICAL, use_new_strategy);
    ...
}

static bool init_mp_psi(enum vmpressure_level level, bool use_new_strategy) {
    int fd;

    /* Do not register a handler if threshold_ms is not set */
    if (!psi_thresholds[level].threshold_ms) {
        return true;
    }
    // 1.初始化psi，往"/proc/pressure/memory"节点中写入初始值
    fd = init_psi_monitor(psi_thresholds[level].stall_type,
        psi_thresholds[level].threshold_ms * US_PER_MS,
        psi_window_size_ms * US_PER_MS);
    // 监听函数---mp_event_psi
    vmpressure_hinfo[level].handler = use_new_strategy ? mp_event_psi : mp_event_common;
    vmpressure_hinfo[level].data = level;
    // 2.注册poll事件的监听回调函数
    register_psi_monitor(epollfd, fd, &vmpressure_hinfo[level]);
    ...
}

int init_psi_monitor(enum psi_stall_type stall_type,
             int threshold_us, int window_us) {
    ...
    // 打开"/proc/pressure/memory"节点
    fd = TEMP_FAILURE_RETRY(open(PSI_PATH_MEMORY, O_WRONLY | O_CLOEXEC));
   ...

    res = TEMP_FAILURE_RETRY(write(fd, buf, strlen(buf) + 1));
    ...
}

// 注册事件监听回调函数
int register_psi_monitor(int epollfd, int fd, void* data) {
    int res;
    struct epoll_event epev;

    epev.events = EPOLLPRI;
    epev.data.ptr = data;
    res = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &epev);
    if (res < 0) {
        ALOGE("epoll_ctl for psi monitor failed; errno=%d", errno);
    }
    return res;
}

2.3 oom_adj获取

进程启动后会通过AMS服务将oom_adj信息通过lmkd socket传给lmkd进程，存入到adjslot_list结构体数组，用于根据oom_adj来选取进程查杀。

基本流程：

1）init初始化阶段，设置lmkd socket的epoll监听

2）AMS客户端写入数据到lmkd socket

3）触发lmkd socket的epoll事件，服务端接收数据，并执行处理函数，将进程及oom_adj信息存入adjslot_list结构体数组

核心代码，如下：

1）lmkd代码

static int init(void) {
    ...
    //设置并监听 lmkd 控制套接字，以便处理来自客户端的连接请求
    ctrl_sock.sock = android_get_control_socket("lmkd");
    listen(ctrl_sock.sock, MAX_DATA_CONN);
    ctrl_sock.handler_info.handler = ctrl_connect_handler;    // epoll事件处理函数
    epev.data.ptr = (void *)&(ctrl_sock.handler_info);
    epoll_ctl(epollfd, EPOLL_CTL_ADD, ctrl_sock.sock, &epev);
    ...
}


static void ctrl_data_handler(int data, uint32_t events,
                              struct polling_params *poll_params __unused) {
    if (events & EPOLLIN) {
        ctrl_command_handler(data);
    }
}

static void ctrl_command_handler(int dsock_idx) {
    ...
    // 读取lmkd socket中数据
    ctrl_data_read(dsock_idx, (char *)packet, CTRL_PACKET_MAX_SIZE, &cred);

    cmd = lmkd_pack_get_cmd(packet);

    switch(cmd) {
    // AMS客户端会往lmkd socket中写入LMK_PROCPRIO及进程信息（pid、oom_adj）
    case LMK_PROCPRIO:
        /* process type field is optional for backward compatibility */
        if (nargs < 3 || nargs > 4)
            goto wronglen;
        cmd_procprio(packet, nargs, &cred);
        break;
        ...
}

static void cmd_procprio(LMKD_CTRL_PACKET packet, int field_count, struct ucred *cred) {
    ...
        proc_slot(procp);
}

static void proc_slot(struct proc *procp) {
    int adjslot = ADJTOSLOT(procp->oomadj);
    std::scoped_lock lock(adjslot_list_lock);

    adjslot_insert(&procadjslot_list[adjslot], &procp->asl);
}

static void adjslot_insert(struct adjslot_list *head, struct adjslot_list *new_element)
{
    struct adjslot_list *next = head->next;
    new_element->prev = head;
    new_element->next = next;
    next->prev = new_element;
    head->next = new_element;
}

2）AMS代码


// frameworks/base/services/core/java/com/android/server/am/ProcessList.java 

/**
 * Set the out-of-memory badness adjustment for a process.
 * If {@code pid <= 0}, this method will be a no-op.
 *
 * @param pid The process identifier to set.
 * @param uid The uid of the app
 * @param amt Adjustment value -- lmkd allows -1000 to +1000
 *
 * {@hide}
 */
public static void setOomAdj(int pid, int uid, int amt) {
        // This indicates that the process is not started yet and so no need to proceed further.
        if (pid <= 0) {
                return;
        }
        if (amt == UNKNOWN_ADJ)
                return;

        long start = SystemClock.elapsedRealtime();
        ByteBuffer buf = ByteBuffer.allocate(4 * 4);
        buf.putInt(LMK_PROCPRIO);
        buf.putInt(pid);
        buf.putInt(uid);
        buf.putInt(amt);
        // 往lmkd socket中写数据
        writeLmkd(buf, null);
        long now = SystemClock.elapsedRealtime();
        if ((now-start) > 250) {
                Slog.w("ActivityManager", "SLOW OOM ADJ: " + (now-start) + "ms for pid " + pid
                                + " = " + amt);
        }
}

2.4 监听psi事件及处理

在初始化阶段，已经对psi事件做了监听及注册了回调函数---mp_event_psi，当监听到psi事件时，将回调mp_event_psi.

主要做了以下事情：

1）计算和更新文件页缓存（file-backed pagecache）的回写（refault）和抖动（thrashing）相关的指标

2）根据这些指标和其他系统状态决定是否需要杀死进程以及杀死进程的条件和原因

static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
        ...

    if (level == VMPRESS_LEVEL_LOW) {
        if (enable_preferred_apps &&
                (get_time_diff_ms(&last_pa_update_tm, &curr_tm) >= pa_update_timeout_ms)) {
            if (!use_perf_api_for_pref_apps) {
                if (perf_ux_engine_trigger) {
                    perf_ux_engine_trigger(PAPP_OPCODE, preferred_apps);
                }
            } else {
                if (perf_sync_request) {
                    const char * tmp = perf_sync_request(PAPP_PERF_TRIGGER);
                    if (tmp != NULL) {
                        strlcpy(preferred_apps, tmp, strlen(tmp));
                        free((void *)tmp);
                    }
                }
            }
            last_pa_update_tm = curr_tm;
        }
    }

    record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);

    bool kill_pending = is_kill_pending();
    if (kill_pending && (kill_timeout_ms == 0 ||
        get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms))) {
        /* Skip while still killing a process */
        wi.skipped_wakeups++;
        ULMK_LOG(D, "Ignoring %s pressure event; kill already in progress",
                 level_name[level]);
        goto no_kill;
    }
    /*
     * Process is dead or kill timeout is over, stop waiting. This has no effect if pidfds are
     * supported and death notification already caused waiting to stop.
     */
    stop_wait_for_proc_kill(!kill_pending);

    if (vmstat_parse(&vs) < 0) {
        ALOGE("Failed to parse vmstat!");
        return;
    }
    /* Starting 5.9 kernel workingset_refault vmstat field was renamed workingset_refault_file */
    workingset_refault_file = vs.field.workingset_refault ? : vs.field.workingset_refault_file;

    if (meminfo_parse(&mi) < 0) {
        ALOGE("Failed to parse meminfo!");
        return;
    }

    /* Reset states after process got killed */
    if (killing) {
        killing = false;
        cycle_after_kill = true;
        /* Reset file-backed pagecache size and refault amounts after a kill */
        base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
        init_ws_refault = workingset_refault_file;
        thrashing_reset_tm = curr_tm;
        prev_thrash_growth = 0;
    }

    if (vs.field.compact_stall > init_compact_stall) {
        init_compact_stall = vs.field.compact_stall;
        in_compaction = true;
    }

    /* Identify reclaim state */
    if (vs.field.pgscan_direct != init_pgscan_direct) {
        init_pgscan_direct = vs.field.pgscan_direct;
        init_pgscan_kswapd = vs.field.pgscan_kswapd;
        init_pgrefill = vs.field.pgrefill;
        for (i = VS_PGSKIP_FIRST_ZONE; i <= VS_PGSKIP_LAST_ZONE; i++) {
            init_pgskip[PGSKIP_IDX(i)] = vs.arr[i];
        }
        reclaim = DIRECT_RECLAIM;
    }  else if (vs.field.pgscan_direct_throttle > init_direct_throttle) {
        init_direct_throttle = vs.field.pgscan_direct_throttle;
        reclaim = DIRECT_RECLAIM_THROTTLE;

    } else if (vs.field.pgscan_kswapd > init_pgscan_kswapd) {

        init_pgscan_kswapd = vs.field.pgscan_kswapd;
        init_pgrefill = vs.field.pgrefill;
        for (i = VS_PGSKIP_FIRST_ZONE; i <= VS_PGSKIP_LAST_ZONE; i++) {
            init_pgskip[PGSKIP_IDX(i)] = vs.arr[i];
        }
        reclaim = KSWAPD_RECLAIM;
    } else if (vs.field.pgrefill != init_pgrefill) {
        init_pgrefill = vs.field.pgrefill;
        for (i = VS_PGSKIP_FIRST_ZONE; i <= VS_PGSKIP_LAST_ZONE; i++) {
            init_pgskip[PGSKIP_IDX(i)] = vs.arr[i];
        }
        /*
         * On a system with only 2 zones, pgrefill indicating that pages are not eligible.
         * Then there may be real refilling happens for normal zone pages too.
         *
         * This makes to consider only normal zone stats when system is under reclaim, under
         * calc_zone_watermarks.
         */
        if (MGLRU_status) {
            pgskip_deltas[PGSKIP_IDX(VS_PGSKIP_MOVABLE)] = 1;
        }
        reclaim = PGREFILL;
    } else if (workingset_refault_file == prev_workingset_refault) {
        if (enable_preferred_apps &&
                  (get_time_diff_ms(&last_pa_update_tm, &curr_tm) >= pa_update_timeout_ms)) {
              if (!use_perf_api_for_pref_apps) {
                  if (perf_ux_engine_trigger) {
                      perf_ux_engine_trigger(PAPP_OPCODE, preferred_apps);
                  }
              } else {
                  if (perf_sync_request) {
                      const char * tmp = perf_sync_request(PAPP_PERF_TRIGGER);
                      if (tmp != NULL) {
                          strlcpy(preferred_apps, tmp, strlen(tmp));
                          free((void *)tmp);
                      }
                  }
              }
              last_pa_update_tm = curr_tm;
        }
    }


    prev_workingset_refault = workingset_refault_file;

     /*
     * It's possible we fail to find an eligible process to kill (ex. no process is
     * above oom_adj_min). When this happens, we should retry to find a new process
     * for a kill whenever a new eligible process is available. This is especially
     * important for a slow growing refault case. While retrying, we should keep
     * monitoring new thrashing counter as someone could release the memory to mitigate
     * the thrashing. Thus, when thrashing reset window comes, we decay the prev thrashing
     * counter by window counts. If the counter is still greater than thrashing limit,
     * we preserve the current prev_thrash counter so we will retry kill again. Otherwise,
     * we reset the prev_thrash counter so we will stop retrying.
     */
    since_thrashing_reset_ms = get_time_diff_ms(&thrashing_reset_tm, &curr_tm);
    if (since_thrashing_reset_ms > THRASHING_RESET_INTERVAL_MS) {
        long windows_passed;
        /* Calculate prev_thrash_growth if we crossed THRASHING_RESET_INTERVAL_MS */
        prev_thrash_growth = (workingset_refault_file - init_ws_refault) * 100
                            / (base_file_lru + 1);
        windows_passed = (since_thrashing_reset_ms / THRASHING_RESET_INTERVAL_MS);
        /*
         * Decay prev_thrashing unless over-the-limit thrashing was registered in the window we
         * just crossed, which means there were no eligible processes to kill. We preserve the
         * counter in that case to ensure a kill if a new eligible process appears.
         */
        if (windows_passed > 1 || prev_thrash_growth < thrashing_limit) {
            prev_thrash_growth >>= windows_passed;
        }

        /* Record file-backed pagecache size when crossing THRASHING_RESET_INTERVAL_MS */
        base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
        init_ws_refault = workingset_refault_file;
        thrashing_reset_tm = curr_tm;
        thrashing_limit = thrashing_limit_pct;
    } else {
        /* Calculate what % of the file-backed pagecache refaulted so far */
        thrashing = (workingset_refault_file - init_ws_refault) * 100 / (base_file_lru + 1);

    }
    /* Add previous cycle's decayed thrashing amount */
    thrashing += prev_thrash_growth;
    if (max_thrashing < thrashing) {
        max_thrashing = thrashing;
    }

    if (zoneinfo_parse(&zi) < 0) {
        ALOGE("Failed to parse zoneinfo!");
        return;
    }

    calc_zone_watermarks(&zi, &zone_mem_info, pgskip_deltas);

    /* Find out which watermark is breached if any */
    wmark = get_lowest_watermark(&mi, &zone_mem_info, level, events, in_compaction);

    LmkdStub::log_meminfo(&mi, wmark);

    if (tune_qcom_lmkd && events == 0) {
        qcom_psi_event = false;
    }

    /*
     * TODO: move this logic into a separate function
     * Decide if killing a process is necessary and record the reason
     */
    if (cycle_after_kill && wmark <= WMARK_LOW) {
        /*
         * Prevent kills not freeing enough memory which might lead to OOM kill.
         * This might happen when a process is consuming memory faster than reclaim can
         * free even after a kill. Mostly happens when running memory stress tests.
         */
        kill_reason = PRESSURE_AFTER_KILL;
        strlcpy(kill_desc, "min watermark is breached even after kill", sizeof(kill_desc));
        min_score_adj = PERCEPTIBLE_RECENT_FOREGROUND_APP_ADJ;
        if (wmark > WMARK_MIN) {
            min_score_adj = VISIBLE_APP_ADJ;
        }
    } else if (reclaim == DIRECT_RECLAIM_THROTTLE) {
        kill_reason = DIRECT_RECL_AND_THROT;
        strlcpy(kill_desc, "system processes are being throttled", sizeof(kill_desc));
   
    } else if (level == VMPRESS_LEVEL_CRITICAL && wmark <= WMARK_HIGH && qcom_psi_event) {
        /*
         * Device is too busy reclaiming memory which might lead to ANR.
         * Critical level is triggered when PSI complete stall (all tasks are blocked because
         * of the memory congestion) breaches the configured threshold.
         */
        kill_reason = CRITICAL_KILL;
        strlcpy(kill_desc, "critical pressure and device is low on memory", sizeof(kill_desc));
        min_score_adj = PERCEPTIBLE_LOW_APP_ADJ + 1;
    } else if (level == VMPRESS_LEVEL_SUPER_CRITICAL && wmark <= WMARK_HIGH && qcom_psi_event) {

        kill_reason = NOT_RESPONDING;
        strlcpy(kill_desc, "device is not responding", sizeof(kill_desc));
    } else if (swap_is_low && thrashing > thrashing_limit_pct) {
        /* Page cache is thrashing while swap is low */
        kill_reason = LOW_SWAP_AND_THRASHING;
        snprintf(kill_desc, sizeof(kill_desc), "device is low on swap (%" PRId64
            "kB < %" PRId64 "kB) and thrashing (%" PRId64 "%%)",
            get_free_swap(&mi) * page_k, swap_low_threshold * page_k, thrashing);
        /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
        if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
            min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
        }
        check_filecache = true;
    } else if (swap_is_low && wmark <= WMARK_HIGH) {
        /* Both free memory and swap are low */
        kill_reason = LOW_MEM_AND_SWAP;
        snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap is low (%"
            PRId64 "kB < %" PRId64 "kB)", wmark < WMARK_LOW ? "min" : "low",
            get_free_swap(&mi) * page_k, swap_low_threshold * page_k);
        /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
        if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
            min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
        }
    } else if (wmark < WMARK_HIGH && swap_util_max < 100 &&
               (swap_util = calc_swap_utilization(&mi)) > swap_util_max) {
        /*
         * Too much anon memory is swapped out but swap is not low.
         * Non-swappable allocations created memory pressure.
         */
        kill_reason = LOW_MEM_AND_SWAP_UTIL;
        snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap utilization"
            " is high (%d%% > %d%%)", wmark < WMARK_LOW ? "min" : "low",
            swap_util, swap_util_max);
    } else if (wmark <= WMARK_HIGH && thrashing > thrashing_limit) {
        /* Page cache is thrashing while memory is low */
        kill_reason = LOW_MEM_AND_THRASHING;
        snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and thrashing (%"
            PRId64 "%%)", wmark < WMARK_LOW ? "min" : "low", thrashing);
        cut_thrashing_limit = true;
        min_score_adj = VISIBLE_APP_ADJ;

        check_filecache = true;
    } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
        /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
        kill_reason = DIRECT_RECL_AND_THRASHING;
        snprintf(kill_desc, sizeof(kill_desc), "device is in direct reclaim and thrashing (%"
            PRId64 "%%)", thrashing);
        cut_thrashing_limit = true;
        /* Do not kill perceptible apps unless thrashing at critical levels */
        min_score_adj = PERCEPTIBLE_APP_ADJ + 1;

        check_filecache = true;
    } else if (check_filecache) {
        int64_t file_lru_kb = (vs.field.nr_inactive_file + vs.field.nr_active_file) * page_k;

        if (file_lru_kb < filecache_min_kb) {
            /* File cache is too low after thrashing, keep killing background processes */
            kill_reason = LOW_FILECACHE_AFTER_THRASHING;
            snprintf(kill_desc, sizeof(kill_desc),
                "filecache is low (%" PRId64 "kB < %" PRId64 "kB) after thrashing",
                file_lru_kb, filecache_min_kb);
            min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
        } else {
            /* File cache is big enough, stop checking */
            check_filecache = false;
        }
    } else if (reclaim == DIRECT_RECLAIM && wmark <= WMARK_HIGH) {
        kill_reason = DIRECT_RECL_AND_LOW_MEM;
        strlcpy(kill_desc, "device is in direct reclaim and low on memory", sizeof(kill_desc));
        min_score_adj = PERCEPTIBLE_APP_ADJ;
    } else if (in_compaction && wmark <= WMARK_HIGH) {
        kill_reason = COMPACTION;
        strlcpy(kill_desc, "device is in compaction and low on memory", sizeof(kill_desc));
        min_score_adj = VISIBLE_APP_ADJ;
    }

    /* Kill a process if necessary */
    if (kill_reason != NONE) {
        struct kill_info ki = {
            .kill_reason = kill_reason,
            .kill_desc = kill_desc,
            .thrashing = (int)thrashing,
            .max_thrashing = max_thrashing,
        };

        /* Allow killing perceptible apps if the system is stalled */
        if (critical_stall) {
            min_score_adj = 0;
        }
        psi_parse_io(&psi_data);
        psi_parse_cpu(&psi_data);
        // 找到合适的进程杀死
        int pages_freed = find_and_kill_process(min_score_adj, &ki, &mi, &wi, &curr_tm, &psi_data);
        if (pages_freed > 0) {
            killing = true;
            max_thrashing = 0;
            /* Killed..Just reduce/increase the boost... */
            if (kill_reason == CRITICAL_KILL || kill_reason == DIRECT_RECL_AND_THROT) {
                wbf_effective =  std::min(wbf_effective + wbf_step, wmark_boost_factor);
            } else {
                wbf_effective = std::max(wbf_effective - wbf_step, 1);
            }
            if (cut_thrashing_limit) {
                /*
                 * Cut thrasing limit by thrashing_limit_decay_pct percentage of the current
                 * thrashing limit until the system stops thrashing.
                 */
                thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
            }
        } 
    } else {
        ...
    }
        ...
}

进程查杀的原因：

查杀原因	说明
PRESSURE_AFTER_KILL	执行了一次 kill 操作后&内存水位低于低水位标记
CRITICAL_KILL	内存压力达到了CRITICAL临界值
NOT_RESPONDING	内存压力达到了超级临界级别&当前内存水位低于或等于高水位标记&触发PSI事件
LOW_SWAP_AND_THRASHING	当前抖动程度超过了抖动阈值
LOW_MEM_AND_SWAP	swap空间不足&内存水位低于高水位标记
LOW_MEM_AND_SWAP_UTIL	内存水位低于高水位标记&交换空间利用率最大值小于100 &交换空间利用率超过了利用率最大值
LOW_MEM_AND_THRASHING	内存水位低于或等于高水位标记&当前抖动程度超过了抖动阈值
DIRECT_RECL_AND_THRASHING	内核直接从用户空间进程回收内存&当前抖动程度超过了抖动阈值
LOW_FILECACHE_AFTER_THRASHING	计算得到的文件缓存大小 file_lru_kb 小于预设的最小文件缓存大小 filecache_min_kb，即文件缓存过低
DIRECT_RECL_AND_LOW_MEM	内核直接从用户空间进程回收内存&内存水位低于高水位标记
COMPACTION	当前正在进行内存压缩&内存水位低于高水位标记

自定义的oom_adj：

ADJ	Value	说明
PRE_PREVIOUS_APP_ADJ	701	前一个应用程序
PERCEPTIBLE_LOW_APP_ADJ	250	可感知但优先级较低的应用程序
PERCEPTIBLE_APP_ADJ	200	可感的应用程序
VISIBLE_APP_ADJ	100	用户可见的应用程序
PERCEPTIBLE_RECENT_FOREGROUND_APP_ADJ	50	最近的前台可感知应用程序

2.5 进程选取与查杀

主要流程：

1）一些参数和状态的计算

2）根据得出的状态确定查杀原因

3）选择进程进行查杀

2.5.1 进程选取

在系统中找到并杀死一个符合给定 oom_score_adj 级别的进程，以释放内存。该函数通过循环查找并选择合适的进程，如下：

/*
 * Find one process to kill at or above the given oom_score_adj level.
 * Returns size of the killed process.
 */
static int find_and_kill_process(int min_score_adj, struct kill_info *ki, union meminfo *mi,
                                 struct wakeup_info *wi, struct timespec *tm,
                                 struct psi_data *pd) {
    int i;
    int killed_size = 0;
    bool lmk_state_change_start = false;
    bool choose_heaviest_task = kill_heaviest_task;

    for (i = OOM_SCORE_ADJ_MAX; i >= min_score_adj; i--) {
        struct proc *procp;

        if (!choose_heaviest_task && i <= PERCEPTIBLE_APP_ADJ) {    // 可感的应用程序
            /*
             * If we have to choose a perceptible process, choose the heaviest one to
             * hopefully minimize the number of victims.
             */
            choose_heaviest_task = true;
        }
        // 可感知的应用程序，走proc_get_heaviest流程。否则，走proc_adj_tail流程
        while (true) {
            procp = choose_heaviest_task ?
                proc_get_heaviest(i) : proc_adj_tail(i);

            if (!procp)
                break;
            // 在2.5.2中进行进程查杀
            killed_size = kill_one_process(procp, min_score_adj, ki, mi, wi, tm, pd);
            if (killed_size >= 0) {
                if (!lmk_state_change_start) {
                    lmk_state_change_start = true;
                    stats_write_lmk_state_changed(STATE_START);
                }
                break;
            }
        }
        if (killed_size) {
            break;
        }
    }

    if (!killed_size && !min_score_adj && is_userdebug_or_eng_build) {
        killed_size = LmkdStub::proc_get_script();
    }

    if (lmk_state_change_start) {
        stats_write_lmk_state_changed(STATE_STOP);
    }

    return killed_size;
}

可感知的应用程序（PRE_PREVIOUS_APP_ADJ、VISIBLE_APP_ADJ、PERCEPTIBLE_RECENT_FOREGROUND_APP_ADJ），走proc_get_heaviest流程来选择进程，如下：

// 从给定的 oomadj 级别中选择内存占用最多的进程
static struct proc *proc_get_heaviest(int oomadj) {
    struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)];
    struct adjslot_list *curr = head->next;
    struct proc *maxprocp = NULL;
    int maxsize = 0;

    /* Filter out PApps */
    struct proc *maxprocp_pa = NULL;
    int maxsize_pa = 0;
    char *tmp_taskname;
    char buf[LINE_MAX];
    // 循环遍历进程链表，对比各进程占用的内存大小，找到内存占用最多的那个进程
    while (curr != head) {
        int pid = ((struct proc *)curr)->pid;
        // 从"/proc/%d/statm"节点中获取rss的信息
        long tasksize = LmkdStub::proc_get_size(pid);
        if (tasksize < 0) {
            struct adjslot_list *next = curr->next;
            pid_remove(pid);
            curr = next;
        } else {
            tmp_taskname = proc_get_name(pid, buf, sizeof(buf));
            if (tmp_taskname != NULL &&
                    CamOptLmkdPolicy::getInstance()->is_protected_task(oomadj, tmp_taskname, pid)) {
                // protect last recent app for camera forground.
                if (debug_process_killing) {
                    ALOGI("%s, skip %s in oomadj %d", __func__, tmp_taskname, oomadj);
                }
            } else if (enable_preferred_apps && tmp_taskname != NULL && strstr(preferred_apps, tmp_taskname)) {
                if (tasksize > maxsize_pa) {
                    maxsize_pa = tasksize;
                    maxprocp_pa = (struct proc *)curr;
                }
            } else {
                if (tasksize > maxsize) {
                    maxsize = tasksize;
                    maxprocp = (struct proc *)curr;
                }
            }
            curr = curr->next;
        }
    }
    if (maxsize > 0) {
        return maxprocp;
    } else if (maxsize_pa > 0) {
        return maxprocp_pa;
    }
    return (struct proc *)CamOptLmkdPolicy::getInstance()->proc_get_heaviest_extend(oomadj, procadjslot_list);
}

// 从"/proc/%d/statm"节点中获取rss的信息
long proc_get_rss(int pid) {
    static char path[PATH_MAX];
    static char line[LINE_MAX];
    int fd;
    long rss = 0;
    long total;
    ssize_t ret;

    /* gid containing AID_READPROC required */
    snprintf(path, PATH_MAX, "/proc/%d/statm", pid);
    fd = open(path, O_RDONLY | O_CLOEXEC);
    if (fd == -1) {
        return -1;
    }

    ret = read_all(fd, line, sizeof(line) - 1);
    if (ret < 0) {
        close(fd);
        return -1;
    }

    sscanf(line, "%ld %ld ", &total, &rss);
    close(fd);
    return rss;
}

不可感知的或优先级低的应用程序（PRE_PREVIOUS_APP_ADJ、PERCEPTIBLE_LOW_APP_ADJ），走proc_adj_tail流程来选择进程，如下：

// 从procadjslot_list列表的队尾找到目标进程
static struct proc *proc_adj_tail(int oomadj) {
    return (struct proc *)adjslot_tail(&procadjslot_list[ADJTOSLOT(oomadj)]);
}

static struct adjslot_list *adjslot_tail(struct adjslot_list *head) {
    struct adjslot_list *asl = head->prev;

    return asl == head ? NULL : asl;
}

struct adjslot_list procadjslot_list[ADJTOSLOT_COUNT];

2.5.2 进程查杀

主线程将进程pid、uid先存入queue队列，异步线程从queue中取出pid、uid进行杀进程。

主线程将进程pid、uid先存入queue队列，如下：


static int kill_one_process(struct proc* procp, int min_oom_score, struct kill_info *ki,
                            union meminfo *mi, struct wakeup_info *wi, struct timespec *tm,
                            struct psi_data *pd) {
    ...
    start_wait_for_proc_kill(pidfd < 0 ? pid : pidfd);
    // 采用异步查杀进程
    kill_result = reaper.kill({ pidfd, pid, uid }, false);
    ...
}

int Reaper::kill(const struct target_proc& target, bool synchronous) {
    /* CAP_KILL required */
    if (target.pidfd < 0) {
        return ::kill(target.pid, SIGKILL);
    }
    // 异步方式查杀进程
    if (!synchronous && async_kill(target)) {
        // we assume the kill will be successful and if it fails we will be notified
        return 0;
    }

    int result = pidfd_send_signal(target.pidfd, SIGKILL, NULL, 0);
    if (result) {
        return result;
    }

    return 0;
}

// 目标进程的pid、uid放入queue_队列，唤醒reaper thread线程
bool Reaper::async_kill(const struct target_proc& target) {
    ...

    mutex_.lock();

    active_requests_++;

    queue_.push_back({ dup(target.pidfd), target.pid, target.uid });
    // Wake up a reaper thread
    cond_.notify_one();
    mutex_.unlock();

    return true;
}

在init初始化阶段创建子线程，监控queue队列是否为空。如果为空，进入wait，当主线程往queue中push进程数据时，唤醒该子线程。子线程从queue中取出pid、uid进行杀进程，如下：

bool Reaper::init(int comm_fd) {
    ...
        pthread_create(&thread_pool_[thread_cnt_], NULL, reaper_main, this)
        ...
    queue_.reserve(thread_cnt_);
        ...
}

static void* reaper_main(void* param) {
    ...

    for (;;) {
        // 从队列中取出pid
        target = reaper->dequeue_request();
        // 给目标进程发送SIGKILL信号
        pidfd_send_signal(target.pidfd, SIGKILL, NULL, 0);
        ...
    }
    ...
}

Reaper::target_proc Reaper::dequeue_request() {
    struct target_proc target;
    std::unique_lock<std::mutex> lock(mutex_);
    // queue队列为空时，进入休眠。否则，从队列中取出uid、pid
    while (queue_.empty()) {
        cond_.wait(lock);
    }
    target = queue_.back();
    queue_.pop_back();

    return target;
}

三、关键系统属性

以persist.device_config.lmkd_native.*和 ro.lmk.*为属性前缀。

属性	作用
low、medium、critical	设置不同内存压力级别的 `oom_score_adj` 值。 `low` 对应较低的内存压力，`critical` 对应高内存压力。
debug	用于启用或禁用 lmkd 的调试信息输出。
critical_upgrade	用于启用或禁用内存压力升级逻辑，即在特定条件下提高内存压力级别。
upgrade_pressure 、downgrade_pressure	用于设置内存压力升级和降级的阈值。
kill_heaviest_task	用于决定是否优先杀死内存占用最高的任务。
kill_timeout_ms	设置杀死任务的超时时间。
use_minfree_levels	用于决定是否使用 `minfree` 级别的配置。
swap_free_low_percentage	设置交换空间低百分比的阈值，用于判断系统是否处于低交换空间状态
psi_partial_stall_ms、psi_complete_stall_ms	设置 PSI（Pressure Stall Information）部分和完全停滞的阈值，用于监控系统内存压力。
thrashing_limit_pct、thrashing_limit_decay_pct	设置内存抖动的限制和衰减百分比，用于控制内存抖动情况下的处理策略。
swap_util_max	设置交换利用率的最大值，用于限制交换空间的使用。
filecache_min_kb	设置文件缓存的最小值，用于控制文件缓存的大小。

四、核心数据结构

// epoll event结构体
struct epoll_event {
        uint32_t events;
        epoll_data_t data;
}

// 内存回收的水位
enum zone_watermark {
    WMARK_MIN = 0,
    WMARK_LOW,
    WMARK_HIGH,
    WMARK_NONE
};

// 用于存放进程信息
struct proc {
    struct adjslot_list asl;
    int pid;
    int pidfd;
    uid_t uid;
    int oomadj;
    pid_t reg_pid; /* PID of the process that registered this record */
    bool valid;
    struct proc *pidhash_next;
};

// 存放struct proc的结构体
struct adjslot_list {
    struct adjslot_list *next;
    struct adjslot_list *prev;
};

// 用于设置lmkd socket的epoll监听的sock信息封装
struct sock_event_handler_info {
    int sock;
    pid_t pid;
    uint32_t async_event_mask;
    struct event_handler_info handler_info;
};