配置环境
Gitalb 14.9
由docker搭建,使用的数据库是内置的postgresql
问题背景
近期发现自研系统在请求调用 gitlab 的 api 接口时,尤其是并发请求同一资源时,我这里是并发创建subgroup这个资源,会得到500的报错,起初以为是自己研发的系统代码在处理过程中由于数据不准确导致的500错误,没有太过在意。但是最近又重现了这个现象,因此去尝试查看下错误的原因。
模拟错误产生环境
那就来压测一把验证下是否真的是因为并发调用产生的问题。首先尝试在0.01秒内同时启动5个线程来请求创建subgroup资源
在小并发量下就直接复现了这个错误。可以看到只有第一个请求正常返回了结果,其余的4个请求都是报错 500 Internal Error
寻找错误原因
自己系统的错误
gitlab的nginx日志,/var/log/gitlab/nginx/gitlab_access.log 中找到了这个请求确实是返回的500
在gitlab中有个 gitlab-rails 模块
在该模块下的api_json.log 日志文件中找到了错误的原因。
{
"time": "2023-07-19T05:19:06.173Z",
"severity": "INFO",
"duration_s": 0.08902,
"db_duration_s": 0.0557,
"view_duration_s": 0.03332,
"status": 500,
"method": "POST",
"path": "/api/v4/groups",
"params": [{
"key": "path",
"value": "gcptest071902"
}, {
"key": "name",
"value": "gcptest071902"
}, {
"key": "parent_id",
"value": 234
}],
"host": "wx8vm00001.apac.bosch.com",
"remote_ip": "10.4.103.206, 127.0.0.1",
"ua": "python-requests/2.28.1",
"route": "/api/:version/groups",
"user_id": 1,
"username": "root",
"exception.class": "ActiveRecord::RecordNotUnique",
"exception.message": "PG::UniqueViolation: ERROR: duplicate key value violates unique constraint \"index_namespaces_name_parent_id_type\"\nDETAIL: Key (name, parent_id, type)=(gcptest071902, 234, Group) already exists.\n",
"exception.backtrace": ["lib/gitlab/database/load_balancing/connection_proxy.rb:126:in `block in write_using_load_balancer'", "lib/gitlab/database/load_balancing/load_balancer.rb:112:in `block in read_write'", "lib/gitlab/database/load_balancing/load_balancer.rb:172:in `retry_with_backoff'", "lib/gitlab/database/load_balancing/load_balancer.rb:110:in `read_write'", "lib/gitlab/database/load_balancing/connection_proxy.rb:125:in `write_using_load_balancer'", "lib/gitlab/database/load_balancing/connection_proxy.rb:67:in `block (2 levels) in <class:ConnectionProxy>'", "lib/gitlab/database/load_balancing/connection_proxy.rb:126:in `block in write_using_load_balancer'", "lib/gitlab/database/load_balancing/load_balancer.rb:112:in `block in read_write'", "lib/gitlab/database/load_balancing/load_balancer.rb:172:in `retry_with_backoff'", "lib/gitlab/database/load_balancing/load_balancer.rb:110:in `read_write'", "lib/gitlab/database/load_balancing/connection_proxy.rb:125:in `write_using_load_balancer'", "lib/gitlab/database/load_balancing/connection_proxy.rb:77:in `transaction'", "app/services/groups/create_service.rb:39:in `block in execute'", "lib/gitlab/database/load_balancing/connection_proxy.rb:126:in `block in write_using_load_balancer'", "lib/gitlab/database/load_balancing/load_balancer.rb:112:in `block in read_write'", "lib/gitlab/database/load_balancing/load_balancer.rb:172:in `retry_with_backoff'", "lib/gitlab/database/load_balancing/load_balancer.rb:110:in `read_write'", "lib/gitlab/database/load_balancing/connection_proxy.rb:125:in `write_using_load_balancer'", "lib/gitlab/database/load_balancing/connection_proxy.rb:77:in `transaction'", "lib/gitlab/database.rb:309:in `block in transaction'", "lib/gitlab/database.rb:308:in `transaction'", "app/models/concerns/cross_database_modification.rb:99:in `transaction'", "app/services/groups/create_service.rb:38:in `execute'", "lib/api/groups.rb:63:in `create_group'", "lib/api/groups.rb:207:in `block (2 levels) in <class:Groups>'", "lib/api/api_guard.rb:213:in `call'", "lib/gitlab/metrics/elasticsearch_rack_middleware.rb:16:in `call'", "lib/gitlab/middleware/rails_queue_duration.rb:33:in `call'", "lib/gitlab/middleware/memory_report.rb:13:in `call'", "lib/gitlab/middleware/speedscope.rb:13:in `call'", "lib/gitlab/request_profiler/middleware.rb:17:in `call'", "lib/gitlab/database/load_balancing/rack_middleware.rb:23:in `call'", "lib/gitlab/metrics/rack_middleware.rb:16:in `block in call'", "lib/gitlab/metrics/web_transaction.rb:46:in `run'", "lib/gitlab/metrics/rack_middleware.rb:16:in `call'", "lib/gitlab/jira/middleware.rb:19:in `call'", "lib/gitlab/middleware/go.rb:20:in `call'", "lib/gitlab/etag_caching/middleware.rb:21:in `call'", "lib/gitlab/middleware/query_analyzer.rb:11:in `block in call'", "lib/gitlab/database/query_analyzer.rb:46:in `within'", "lib/gitlab/middleware/query_analyzer.rb:11:in `call'", "lib/gitlab/middleware/multipart.rb:173:in `call'", "lib/gitlab/middleware/read_only/controller.rb:50:in `call'", "lib/gitlab/middleware/read_only.rb:18:in `call'", "lib/gitlab/middleware/same_site_cookies.rb:27:in `call'", "lib/gitlab/middleware/handle_malformed_strings.rb:21:in `call'", "lib/gitlab/middleware/basic_health_check.rb:25:in `call'", "lib/gitlab/middleware/handle_ip_spoof_attack_error.rb:25:in `call'", "lib/gitlab/middleware/request_context.rb:21:in `call'", "lib/gitlab/middleware/webhook_recursion_detection.rb:15:in `call'", "config/initializers/fix_local_cache_middleware.rb:11:in `call'", "lib/gitlab/middleware/compressed_json.rb:26:in `call'", "lib/gitlab/middleware/rack_multipart_tempfile_factory.rb:19:in `call'", "lib/gitlab/middleware/sidekiq_web_static.rb:20:in `call'", "lib/gitlab/metrics/requests_rack_middleware.rb:77:in `call'", "lib/gitlab/middleware/release_env.rb:13:in `call'"],
"exception.sql": "/*application:web,correlation_id:01H5P9CWAR3B4123CHYQSZKYH5,endpoint_id:POST /api/:version/groups,db_config_name:main*/ INSERT INTO \"namespaces\" (\"name\", \"path\", \"created_at\", \"updated_at\", \"type\", \"visibility_level\", \"description_html\", \"parent_id\", \"cached_markdown_version\") VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) RETURNING \"id\"",
"queue_duration_s": 0.009978,
"redis_calls": 5,
"redis_duration_s": 0.001121,
"redis_read_bytes": 609,
"redis_write_bytes": 304,
"redis_cache_calls": 3,
"redis_cache_duration_s": 0.000656,
"redis_cache_read_bytes": 609,
"redis_cache_write_bytes": 198,
"redis_shared_state_calls": 2,
"redis_shared_state_duration_s": 0.000465,
"redis_shared_state_write_bytes": 106,
"db_count": 15,
"db_write_count": 3,
"db_cached_count": 0,
"db_replica_count": 0,
"db_primary_count": 15,
"db_main_count": 15,
"db_main_replica_count": 0,
"db_replica_cached_count": 0,
"db_primary_cached_count": 0,
"db_main_cached_count": 0,
"db_main_replica_cached_count": 0,
"db_replica_wal_count": 0,
"db_primary_wal_count": 0,
"db_main_wal_count": 0,
"db_main_replica_wal_count": 0,
"db_replica_wal_cached_count": 0,
"db_primary_wal_cached_count": 0,
"db_main_wal_cached_count": 0,
"db_main_replica_wal_cached_count": 0,
"db_replica_duration_s": 0.0,
"db_primary_duration_s": 0.056,
"db_main_duration_s": 0.056,
"db_main_replica_duration_s": 0.0,
"cpu_s": 0.043004,
"mem_objects": 15895,
"mem_bytes": 1253816,
"mem_mallocs": 4559,
"mem_total_bytes": 1889616,
"pid": 413885,
"correlation_id": "01H5P9CWAR3B4123CHYQSZKYH5",
"meta.user": "root",
"meta.client_id": "user/1",
"meta.caller_id": "POST /api/:version/groups",
"meta.remote_ip": "10.4.103.206",
"meta.feature_category": "subgroups",
"content_length": "68",
"request_urgency": "default",
"target_duration_s": 1
}
同时在该模块下的 exceptions_json.log 日志文件中一样可以找到这段错误
{
"severity": "ERROR",
"time": "2023-07-19T05:19:06.171Z",
"correlation_id": "01H5P9CWAR3B4123CHYQSZKYH5",
"exception.class": "ActiveRecord::RecordNotUnique",
"exception.message": "PG::UniqueViolation: ERROR: duplicate key value violates unique constraint \"index_namespaces_name_parent_id_type\"\nDETAIL: Key (name, parent_id, type)=(gcptest071902, 234, Group) already exists.\n",
"exception.backtrace": ["lib/gitlab/database/load_balancing/connection_proxy.rb:126:in `block in write_using_load_balancer'", "lib/gitlab/database/load_balancing/load_balancer.rb:112:in `block in read_write'", "lib/gitlab/database/load_balancing/load_balancer.rb:172:in `retry_with_backoff'", "lib/gitlab/database/load_balancing/load_balancer.rb:110:in `read_write'", "lib/gitlab/database/load_balancing/connection_proxy.rb:125:in `write_using_load_balancer'", "lib/gitlab/database/load_balancing/connection_proxy.rb:67:in `block (2 levels) in \u003cclass:ConnectionProxy\u003e'", "lib/gitlab/database/load_balancing/connection_proxy.rb:126:in `block in write_using_load_balancer'", "lib/gitlab/database/load_balancing/load_balancer.rb:112:in `block in read_write'", "lib/gitlab/database/load_balancing/load_balancer.rb:172:in `retry_with_backoff'", "lib/gitlab/database/load_balancing/load_balancer.rb:110:in `read_write'", "lib/gitlab/database/load_balancing/connection_proxy.rb:125:in `write_using_load_balancer'", "lib/gitlab/database/load_balancing/connection_proxy.rb:77:in `transaction'", "app/services/groups/create_service.rb:39:in `block in execute'", "lib/gitlab/database/load_balancing/connection_proxy.rb:126:in `block in write_using_load_balancer'", "lib/gitlab/database/load_balancing/load_balancer.rb:112:in `block in read_write'", "lib/gitlab/database/load_balancing/load_balancer.rb:172:in `retry_with_backoff'", "lib/gitlab/database/load_balancing/load_balancer.rb:110:in `read_write'", "lib/gitlab/database/load_balancing/connection_proxy.rb:125:in `write_using_load_balancer'", "lib/gitlab/database/load_balancing/connection_proxy.rb:77:in `transaction'", "lib/gitlab/database.rb:309:in `block in transaction'", "lib/gitlab/database.rb:308:in `transaction'", "app/models/concerns/cross_database_modification.rb:99:in `transaction'", "app/services/groups/create_service.rb:38:in `execute'", "lib/api/groups.rb:63:in `create_group'", "lib/api/groups.rb:207:in `block (2 levels) in \u003cclass:Groups\u003e'", "lib/api/api_guard.rb:213:in `call'", "lib/gitlab/metrics/elasticsearch_rack_middleware.rb:16:in `call'", "lib/gitlab/middleware/rails_queue_duration.rb:33:in `call'", "lib/gitlab/middleware/memory_report.rb:13:in `call'", "lib/gitlab/middleware/speedscope.rb:13:in `call'", "lib/gitlab/request_profiler/middleware.rb:17:in `call'", "lib/gitlab/database/load_balancing/rack_middleware.rb:23:in `call'", "lib/gitlab/metrics/rack_middleware.rb:16:in `block in call'", "lib/gitlab/metrics/web_transaction.rb:46:in `run'", "lib/gitlab/metrics/rack_middleware.rb:16:in `call'", "lib/gitlab/jira/middleware.rb:19:in `call'", "lib/gitlab/middleware/go.rb:20:in `call'", "lib/gitlab/etag_caching/middleware.rb:21:in `call'", "lib/gitlab/middleware/query_analyzer.rb:11:in `block in call'", "lib/gitlab/database/query_analyzer.rb:46:in `within'", "lib/gitlab/middleware/query_analyzer.rb:11:in `call'", "lib/gitlab/middleware/multipart.rb:173:in `call'", "lib/gitlab/middleware/read_only/controller.rb:50:in `call'", "lib/gitlab/middleware/read_only.rb:18:in `call'", "lib/gitlab/middleware/same_site_cookies.rb:27:in `call'", "lib/gitlab/middleware/handle_malformed_strings.rb:21:in `call'", "lib/gitlab/middleware/basic_health_check.rb:25:in `call'", "lib/gitlab/middleware/handle_ip_spoof_attack_error.rb:25:in `call'", "lib/gitlab/middleware/request_context.rb:21:in `call'", "lib/gitlab/middleware/webhook_recursion_detection.rb:15:in `call'", "config/initializers/fix_local_cache_middleware.rb:11:in `call'", "lib/gitlab/middleware/compressed_json.rb:26:in `call'", "lib/gitlab/middleware/rack_multipart_tempfile_factory.rb:19:in `call'", "lib/gitlab/middleware/sidekiq_web_static.rb:20:in `call'", "lib/gitlab/metrics/requests_rack_middleware.rb:77:in `call'", "lib/gitlab/middleware/release_env.rb:13:in `call'"],
"exception.sql": "/*application:web,correlation_id:01H5P9CWAR3B4123CHYQSZKYH5,endpoint_id:POST /api/:version/groups,db_config_name:main*/ INSERT INTO \"namespaces\" (\"name\", \"path\", \"created_at\", \"updated_at\", \"type\", \"visibility_level\", \"description_html\", \"parent_id\", \"cached_markdown_version\") VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) RETURNING \"id\"",
"user.username": "root",
"tags.program": "web",
"tags.locale": "en",
"tags.feature_category": "subgroups",
"tags.correlation_id": "01H5P9CWAR3B4123CHYQSZKYH5"
}
初步分析
报错信息涉及到GitLab的数据库负载均衡问题。GitLab中的数据库负载均衡是为了处理数据库连接的高可用性和性能而设计的。然而,在使用负载均衡时,有时可能会遇到问题导致数据库连接失败并出现此错误
似乎在日志中也看到了GitLab的数据库事务处理问题。事务是一种用于保证数据库操作的原子性、一致性、隔离性和持久性的机制。在GitLab中,数据库事务用于确保数据的一致性和完整性。
这种错误通常可能由以下原因引起:
-
数据库连接问题:数据库连接可能出现问题,导致GitLab无法正常连接数据库。这可能是由于数据库服务器故障、网络问题或配置错误引起的。
-
数据库锁定:如果多个事务同时对相同的数据库资源进行操作,并且出现锁定竞争,可能导致其中一个事务失败。
-
数据库死锁:如果多个事务相互等待对方释放锁,并且形成了循环等待,就会发生数据库死锁,导致其中一个事务失败。
-
事务超时:如果事务执行时间过长,超过了数据库设置的事务超时时间,可能导致事务失败。
可能的解决办法:
-
使用外置的数据库,调整数据库事务的相关设置
-
检查数据库服务器的状态和性能,确保数据库服务器正常运行且性能正常。
-
检查数据库连接的配置和设置,确保GitLab服务器可以正确连接到数据库。
-
检查并优化数据库的索引和查询语句,以减少数据库锁定和死锁的可能性。
-
调整数据库事务的设置,如增加事务超时时间,以适应较长的事务处理时间。
-
在数据库服务器和GitLab服务器上查看日志,查找其他可能导致事务失败的错误信息。
以上分析更多的可能我们无法去涉及更改,同时也不确定新版gitlab中是否已经修复了这个问题。可以尝试更新一下gitlab版本是否能够解决,或是在业务层,或分布式架构中加上全局锁,禁止在毫秒级别同时并发请求操作同一资源,应该就可以解决掉这个问题。