简介
工作一年多了,天天CRUD,终于以前学习的算法排上用场了。
背景
我们的系统在用户注册时没有校验身份id(身份证)和电话号码的唯一,可能使用相同的身份id或者电话号码创建多个账号,导致有些人开多个账号领多份钱。现在公司想降本增效,让我统计出各个市场存在相同身份id或者电话号码的账号信息。
输入
两张表已过滤本次统计不涉及的字段
ops_tab
(operator 是站点员工表)
staff_tab
(员工表,里面包含的信息更详细。staff包括operator和driver,因此ops_tab的数据是staff_tab的子集)
ops_tab没有PassportId,因此需要拿staff Id去staff_tab获取对应passport id。
另外各个市场判断账号是否来自于同一个人是有差别的,有些市场允许多个账号使用同一个电话号码。
市场 | 判断条件 |
---|---|
印度尼西亚ID | 1、PassportId 2、Phone Number |
马来西亚MY | 1、PassportId 2、Phone Number |
菲律宾PH | 1、PassportId 2、Phone Number |
新加坡SG | 1、PassportId 2、Phone Number |
泰国TH | 1、Phone Number |
台湾TW | 1、PassportId |
越南VN | 1、PassportId 2、Phone Number |
比如印度尼西亚就不允许Passport Id和电话号码重复,而泰国允许Passport Id重复而电话号码不能重复。
输出
输出excel,相同信息在同一个组,只统计存在重复信息的账号,不重复的不需要展示。
详细设计
流程图
golang实现
算法主体
func Run(opsList []*OpsTab, staffList []*StaffTab) {
cid := strings.ToUpper(os.Getenv("CID"))
fmt.Println(cid)
if len(cid) == 0 {
return
}
NewExcelListWithOpsAndStaff(opsList, staffList).
Group(cid).
Export(cid)
}
type OpsTab struct {
Id uint64 `gorm:"column:id" json:"id"`
OpsId string `gorm:"column:ops_id" json:"ops_id"`
OpsName string `gorm:"column:ops_name" json:"ops_name"`
Phone string `gorm:"column:phone" json:"phone"`
OpsStatus int `gorm:"column:ops_status" json:"ops_status"`
StaffId string `gorm:"column:staff_id" json:"staff_id"`
}
type StaffTab struct {
StaffId string `gorm:"column:staff_id" json:"staff_id"`
PassportId string `gorm:"column:passport_id" json:"passport_id"`
}
// 根据staff_id将Ops_tab和staff_tab合并成一个结构体
type ExcelResultItem struct {
OpsName string
OpsId string
StaffId string
Phone string
PassportId string
OpsStatus int
}
type ExcelList []*ExcelResultItem
func NewExcelListWithOpsAndStaff(opsList []*OpsTab, staffList []*StaffTab) *ExcelList {
return (&ExcelList{}).toExcelResultItemList(opsList, staffList)
}
func NewExcelList() *ExcelList {
return &ExcelList{}
}
func (e *ExcelList) Append(item *ExcelResultItem) {
*e = append(*e, item)
}
func (e *ExcelList) Size() int {
return len(*e)
}
func (e *ExcelList) toExcelResultItemList(opsList []*OpsTab, staffList []*StaffTab) *ExcelList {
staffId2StaffMap := make(map[string]*StaffTab)
for _, item := range staffList {
staffId2StaffMap[item.StaffId] = item
}
for _, item := range opsList {
passportId := ""
if staff, ok := staffId2StaffMap[item.StaffId]; ok {
passportId = staff.PassportId
}
*e = append(*e, &ExcelResultItem{
OpsName: item.OpsName,
OpsId: item.OpsId,
StaffId: item.StaffId,
Phone: item.Phone,
PassportId: passportId,
OpsStatus: item.OpsStatus,
})
}
return e
}
func (e *ExcelList) Group(cid string) *GroupedExcelList {
excelItemList := *e
uf := NewPathCompressionUnionFind(len(excelItemList))
for i := range excelItemList {
for j := i + 1; j < len(excelItemList); j++ {
if handler.Check(cid, excelItemList[i], excelItemList[j]) {
uf.Union(i, j)
}
}
}
ufId2ExcelListMap := make(map[int]*ExcelList)
for id, item := range excelItemList {
root := uf.Find(id)
if uf.GetSize(root) < 2 {
continue
}
if _, ok := ufId2ExcelListMap[root]; !ok {
ufId2ExcelListMap[root] = NewExcelList()
}
ufId2ExcelListMap[root].Append(item)
}
res := NewGroupedExcelList()
for _, list := range ufId2ExcelListMap {
res.Append(list)
}
return res
}
type GroupedExcelList []*ExcelList
func NewGroupedExcelList() *GroupedExcelList {
return &GroupedExcelList{}
}
func (g *GroupedExcelList) Append(list *ExcelList) {
*g = append(*g, list)
}
校验是否重复(策略模式)
类似这个,实际简单一点
var handler = NewCheckWhetherSameStrategyHandler()
type CheckWhetherSameStrategyHandler struct {
region2StrategyMap map[string]CheckWhetherSameStrategy
}
func NewCheckWhetherSameStrategyHandler() *CheckWhetherSameStrategyHandler {
return &CheckWhetherSameStrategyHandler{
region2StrategyMap: make(map[string]CheckWhetherSameStrategy),
}
}
func (c *CheckWhetherSameStrategyHandler) Register(cid string, strategy CheckWhetherSameStrategy) {
c.region2StrategyMap[cid] = strategy
}
func (c *CheckWhetherSameStrategyHandler) Check(cid string, ops1, ops2 *ExcelResultItem) bool {
if _, ok := c.region2StrategyMap[cid]; !ok {
panic(fmt.Sprintf("cid[%v] not exist", cid))
}
return c.region2StrategyMap[cid].Check(ops1, ops2)
}
type CheckWhetherSameStrategy interface {
Check(ops1, ops2 *ExcelResultItem) bool
}
type CheckWhetherSameStrategyID struct{}
func (c *CheckWhetherSameStrategyID) Check(ops1, ops2 *ExcelResultItem) bool {
return len(ops1.PassportId) != 0 && ops1.PassportId == ops2.PassportId || len(ops1.Phone) != 0 && ops1.Phone == ops2.Phone
}
type CheckWhetherSameStrategyMY struct{}
func (c *CheckWhetherSameStrategyMY) Check(ops1, ops2 *ExcelResultItem) bool {
return len(ops1.PassportId) != 0 && ops1.PassportId == ops2.PassportId || len(ops1.Phone) != 0 && ops1.Phone == ops2.Phone
}
type CheckWhetherSameStrategyPH struct{}
func (c *CheckWhetherSameStrategyPH) Check(ops1, ops2 *ExcelResultItem) bool {
return len(ops1.PassportId) != 0 && ops1.PassportId == ops2.PassportId || len(ops1.Phone) != 0 && ops1.Phone == ops2.Phone
}
type CheckWhetherSameStrategySG struct{}
func (c *CheckWhetherSameStrategySG) Check(ops1, ops2 *ExcelResultItem) bool {
return len(ops1.PassportId) != 0 && ops1.PassportId == ops2.PassportId || len(ops1.Phone) != 0 && ops1.Phone == ops2.Phone
}
type CheckWhetherSameStrategyTH struct{}
func (c *CheckWhetherSameStrategyTH) Check(ops1, ops2 *ExcelResultItem) bool {
return len(ops1.Phone) != 0 && ops1.Phone == ops2.Phone
}
type CheckWhetherSameStrategyTW struct{}
func (c *CheckWhetherSameStrategyTW) Check(ops1, ops2 *ExcelResultItem) bool {
return len(ops1.PassportId) != 0 && ops1.PassportId == ops2.PassportId
}
type CheckWhetherSameStrategyVN struct{}
func (c *CheckWhetherSameStrategyVN) Check(ops1, ops2 *ExcelResultItem) bool {
return len(ops1.PassportId) != 0 && ops1.PassportId == ops2.PassportId || len(ops1.Phone) != 0 && ops1.Phone == ops2.Phone
}
func init() {
handler.Register("ID", &CheckWhetherSameStrategyID{})
handler.Register("MY", &CheckWhetherSameStrategyMY{})
handler.Register("PH", &CheckWhetherSameStrategyPH{})
handler.Register("SG", &CheckWhetherSameStrategySG{})
handler.Register("TH", &CheckWhetherSameStrategyTH{})
handler.Register("TW", &CheckWhetherSameStrategyTW{})
handler.Register("VN", &CheckWhetherSameStrategyVN{})
}
golang Excel导出操作
func (g *GroupedExcelList) Export(cid string) {
//创建excel文件
xlsx := excelize.NewFile()
sheet := fmt.Sprintf("same_field_ops_%v_excel", cid)
//创建新表单
index := xlsx.NewSheet(sheet)
//第一行 各个字段的名称
row := 1
setFieldValue2Excel(xlsx, sheet, row, FieldNameGroup, FieldNameGroup)
setFieldValue2Excel(xlsx, sheet, row, FieldNameOpsName, FieldNameOpsName)
setFieldValue2Excel(xlsx, sheet, row, FieldNameOpsId, FieldNameOpsId)
setFieldValue2Excel(xlsx, sheet, row, FieldNamePhoneNumber, FieldNamePhoneNumber)
setFieldValue2Excel(xlsx, sheet, row, FieldNamePassportId, FieldNamePassportId)
setFieldValue2Excel(xlsx, sheet, row, FieldNameStatus, FieldNameStatus)
row++
for groupId, listPtr := range *g {
//Excel 一组
list := *listPtr
startRow := row
row++
for _, item := range list {
//Excel 一行
setFieldValue2Excel(xlsx, sheet, row, FieldNameGroup, fmt.Sprintf("%v", groupId))
setFieldValue2Excel(xlsx, sheet, row, FieldNameOpsName, item.OpsName)
setFieldValue2Excel(xlsx, sheet, row, FieldNameOpsId, item.OpsId)
setFieldValue2Excel(xlsx, sheet, row, FieldNamePhoneNumber, item.Phone)
setFieldValue2Excel(xlsx, sheet, row, FieldNamePassportId, item.PassportId)
setFieldValue2Excel(xlsx, sheet, row, FieldNameStatus, fmt.Sprintf("%v", item.OpsStatus))
row++
}
//合并A列成一组
xlsx.MergeCell(sheet, fmt.Sprintf("A%v", startRow), fmt.Sprintf("A%v", row - 1))
}
//设置默认打开的表单
xlsx.SetActiveSheet(index)
//保存文件到指定路径
err := xlsx.SaveAs(fmt.Sprintf("./%v.xlsx", sheet))
if err != nil {
log.Fatal(err)
}
}
type FieldName = string
const (
FieldNameGroup FieldName = "Group"
FieldNameOpsName FieldName = "Ops Name"
FieldNameOpsId FieldName = "Ops Id"
FieldNamePhoneNumber FieldName = "Phone Number"
FieldNamePassportId FieldName = "Passport Id"
FieldNameStatus FieldName = "Status"
)
//如图,表格上角的ABCDEF
var posMap = map[FieldName]string{
FieldNameGroup: "A",
FieldNameOpsName: "B",
FieldNameOpsId: "C",
FieldNamePhoneNumber: "D",
FieldNamePassportId: "E",
FieldNameStatus: "F",
}
func setFieldValue2Excel(xlsx *excelize.File, sheet string, row int, key FieldName, val string) {
if _, err := strconv.ParseInt(val, 10, 64); len(val) > 1 && err == nil {
val = "'" + val
}
xlsx.SetCellValue(sheet, fmt.Sprintf("%v%v", posMap[key], row), val)
}
路径压缩并查集
type PathCompressionUnionFind struct {
parents []int
size []int
}
func NewPathCompressionUnionFind(len int) *PathCompressionUnionFind {
parents := make([]int, len)
size := make([]int, len)
for i := range parents {
parents[i] = i
size[i] = 1
}
return &PathCompressionUnionFind{
parents: parents,
size: size,
}
}
func (t *PathCompressionUnionFind) Union(p, q int) {
pRoot := t.Find(p)
qRoot := t.Find(q)
if pRoot == qRoot {
return
}
t.parents[qRoot] = t.parents[pRoot]
t.size[pRoot] += t.size[qRoot]
}
func (t *PathCompressionUnionFind) Find(p int) int {
for t.parents[p] != p {
t.parents[p] = t.parents[t.parents[p]]
p = t.parents[p]
}
return p
}
func (t *PathCompressionUnionFind) GetSize(p int) int {
if p < 0 || p >= len(t.parents) {
panic("Union Find Array out of bounds")
}
return t.size[t.Find(p)]
}
复杂度
时间复杂度
O(N^2)
空间复杂度
O(N)
其他
从k8s pod上连接数据库并下载表数据
我们线上数据数据量约200万,从公司提供的平台查会被限制单词1000条数据,因此下载ops_tab和staff_tab的数据需要写个脚本在k8s上执行。
本地(MAC)
安装交叉编译环境
brew install FiloSottile/musl-cross/musl-cross
切换到main函数所在包,执行编译
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 CC=x86_64-linux-musl-gcc CGO_LDFLAGS="-static" go build
压缩并拆分为多个文件方便上传
tar cjf - $your_name$ |split -b 5m - $your_name$.tar.bz2.
在k8s pod上
安装上传插件
apt install -y lrzsz
安装解压插件
apt install -y bzip2
上传文件
rz
给文件赋执行权限
chmod +x $your_name$
运行
./$your_name$