跳到主要内容

Python 源码为了安全而编译为 C 代码

· 阅读需 7 分钟

背景

做大模型相关的需求,对于 python 代码需要混淆,不要让客户知道源码,因为 python 是解释型语言,直接 python main.py 就运行了,不像 go 一样可以打包为二进制文件,用户不知道源码是什么样子的。因此 python 代码需要做混淆。

Cython

调研了一圈后发现,大家都在使用 Cython 将 python 代码编译为 C 代码(再编译成 so/pyd 动态库),从而混淆源码,达到不让对方直接看到源码的目的。找到了一个工具 https://github.com/Boris-code/jmpy ,但是这个仓库依赖的 Cython 版本实在太旧了,因此直接把源码拿出来,稍微修改了一下。在这里感谢 jmpy 这个项目的作者。

步骤

将下面的代码保存为 encrypt_py.py 文件。

# -*- coding: utf-8 -*-
"""
Created on 2018-07-18 18:24
---------
@summary: 加密python代码为pyd/so
---------
@author: Boris
"""
import os
import re
import getopt
import sys
import shutil
import tempfile
from asyncio.log import logger
from distutils.command.build_py import build_py
from distutils.core import setup
from typing import Union, List

from Cython.Build import cythonize


def get_package_dir(*args, **kwargs):
    """Return an empty package directory, whatever the arguments are.

    All positional and keyword arguments are accepted and ignored, so the
    function can replace a method without caring how it is called.
    """
    return ""


# Override build_py.get_package_dir; otherwise the generated .so files end up at the wrong path.
build_py.get_package_dir = get_package_dir


class TemporaryDirectory(object):
    """Context manager that creates a temporary directory and deletes it on exit.

    ``with TemporaryDirectory() as path`` binds ``path`` to the directory
    path (a ``str``), not to the manager object.
    """

    def __enter__(self):
        self.name = tempfile.mkdtemp()
        return self.name

    def __exit__(self, exc_type, exc_value, traceback):
        # Returns None, so an exception raised inside the ``with`` body
        # still propagates after the directory has been removed.
        shutil.rmtree(self.name)


def search(content, regexs):
    """Search ``content`` for one pattern or for any pattern of a collection.

    Args:
        content: Text to search.
        regexs: A single regular expression (``str``) or an iterable of
            regular expressions. ``None`` or an empty iterable matches nothing.

    Returns:
        For a single pattern, the result of ``re.search`` (a match object or
        ``None``). For an iterable, ``True`` as soon as one pattern matches,
        otherwise ``None``.
    """
    if isinstance(regexs, str):
        return re.search(regexs, content)

    # ``or ()`` lets a missing pattern list mean "no match" instead of
    # raising TypeError when iterating None.
    if any(re.search(regex, content) for regex in regexs or ()):
        return True
    return None


def walk_file(file_path):
    """Yield the path of every file under ``file_path``.

    If ``file_path`` is a directory it is walked recursively with ``os.walk``
    and each file path is yielded. Anything else (a file, or a path that does
    not exist) is yielded unchanged as the only item.
    """
    if not os.path.isdir(file_path):
        yield file_path
        return

    for current_path, _sub_folders, files_name in os.walk(file_path):
        for file in files_name:
            yield os.path.join(current_path, file)


def copy_files(src_path, dst_path):
    """Copy a project directory, or a single file, into ``dst_path``.

    Directory source: an existing ``dst_path`` is deleted first, then the tree
    is copied while skipping entries named ``dist``, ``.git``, ``venv``,
    ``.idea`` and ``__pycache__``.

    File source: ``dst_path`` is created if missing and the file is copied
    into it under its original name.

    Args:
        src_path: Directory or file to copy.
        dst_path: Destination directory.
    """
    if os.path.isdir(src_path):
        if os.path.exists(dst_path):
            shutil.rmtree(dst_path)

        def ignore_names(src, names):
            # When the destination lies inside the source tree, copytree
            # eventually visits the destination itself; skip everything there
            # so the copy does not recurse into its own output.
            # Plain substring test: dst_path is a path, not a regular
            # expression, and may contain characters such as "(" or "\".
            if dst_path in src:
                return names
            return ["dist", ".git", "venv", ".idea", "__pycache__"]

        shutil.copytree(src_path, dst_path, ignore=ignore_names)
    else:
        if not os.path.exists(dst_path):
            os.makedirs(dst_path)
        shutil.copyfile(src_path, os.path.join(dst_path, os.path.basename(src_path)))


def get_py_files(files, ignore_files: Union[List, str, None] = None):
    """Yield the ``.py`` paths from ``files`` that are not ignored.

    Args:
        files: Iterable of file paths.
        ignore_files: A regular expression or a list of regular expressions;
            a path matching any of them is skipped. Falsy means nothing is
            ignored.

    Yields:
        Each path that ends with ``.py`` and is not ignored, in input order.
    """
    for file in files:
        if not file.endswith(".py"):
            continue
        if ignore_files and search(file, regexs=ignore_files):
            # This file is on the ignore list.
            continue
        yield file


def filter_cannot_encrypted_py(files, except_main_file):
    """Drop the files that must not be compiled.

    Skipped are:
      * files whose own name starts with a double underscore, such as
        ``__init__.py`` or ``__main__.py``;
      * when ``except_main_file`` is truthy, files whose text contains
        ``__main__`` (entry scripts).

    Args:
        files: Iterable of ``.py`` file paths.
        except_main_file: Truthy to skip files that contain ``__main__``.

    Returns:
        list: The remaining paths, in input order.
    """
    _files = []
    for file in files:
        # Match the file name only. Matching the whole path would skip every
        # file of a project that lives under a directory containing "__".
        if re.match(r"__.*\.py$", os.path.basename(file)):
            continue

        if except_main_file:
            with open(file, "r", encoding="utf-8") as f:
                content = f.read()
            if "__main__" in content:
                continue

        _files.append(file)

    return _files


def encrypt_py(py_files: list):
    """Compile each Python file in place with Cython.

    For every file the working directory is switched to the file's folder and
    ``setup(..., script_args=["build_ext", "-t", <temp dir>, "--inplace"])``
    is run on it. A file that fails is logged and skipped, and its leftover
    ``.c`` file is removed if present.

    Args:
        py_files: Paths of the ``.py`` files to compile.

    Returns:
        list: The entries of ``py_files`` that compiled successfully.
    """
    encrypted_py = []
    # Remembered so the caller's working directory is put back after each file.
    original_cwd = os.getcwd()

    # Uses the standard-library context manager, which yields the directory
    # path just like the local TemporaryDirectory class does.
    with tempfile.TemporaryDirectory() as td:
        total_count = len(py_files)
        for i, py_file in enumerate(py_files):
            # Resolved while the original cwd is active, so relative inputs
            # keep working after earlier iterations changed directory.
            abs_path = os.path.abspath(py_file)
            dir_name, file_name = os.path.split(abs_path)
            try:
                os.chdir(dir_name)

                logger.debug("正在加密 {}/{}, {}".format(i + 1, total_count, file_name))

                setup(
                    ext_modules=cythonize([file_name], quiet=True, language_level=3),
                    script_args=["build_ext", "-t", td, "--inplace"],
                )

                encrypted_py.append(py_file)
                logger.debug("加密成功 {}".format(file_name))

            except Exception as e:
                logger.exception("加密失败 {} , error {}".format(py_file, e))
                # Swap only the extension; str.replace would also rewrite a
                # ".py" that occurs earlier in the path.
                temp_c = os.path.splitext(abs_path)[0] + ".c"
                if os.path.exists(temp_c):
                    os.remove(temp_c)
            finally:
                os.chdir(original_cwd)

    return encrypted_py


def delete_files(files_path):
    """Delete each given ``.py`` file and the ``.c`` file generated next to it.

    Deletion is best effort per file: a missing file is ignored and any other
    OS error is logged, so one failure never prevents the remaining source
    files from being removed.

    Args:
        files_path: Iterable of ``.py`` file paths.
    """
    for file in files_path:
        targets = [file]
        if file.endswith(".py"):
            # Swap only the extension; str.replace would also rewrite a
            # ".py" that occurs earlier in the path.
            targets.append(os.path.splitext(file)[0] + ".c")

        for path in targets:
            try:
                os.remove(path)
            except FileNotFoundError:
                # Nothing to delete, e.g. the .c file was never generated.
                pass
            except OSError as e:
                logger.warning("删除失败 {} , error {}".format(path, e))


def rename_excrypted_file(output_file_path):
    """Strip the middle dotted component from compiled extension file names.

    Every ``.pyd`` / ``.so`` file under ``output_file_path`` whose file name
    has at least two dots is renamed so that the component between the last
    two dots is dropped, e.g. ``app.cpython-310-darwin.so`` -> ``app.so``.

    Args:
        output_file_path: Directory (or single file) to process.
    """
    for file in walk_file(output_file_path):
        if not file.endswith((".pyd", ".so")):
            continue

        # Rewrite the file name only. Applying the pattern to the whole path
        # lets a dot in a directory name (e.g. "proj.v1/app.so") turn into a
        # move to a different location.
        dir_name, base_name = os.path.split(file)
        new_base_name = re.sub(r"(.*)\..*\.(.*)", r"\1.\2", base_name)
        if new_base_name != base_name:
            os.rename(file, os.path.join(dir_name, new_base_name))


def _is_same_or_parent(parent, child):
    """Return True if ``parent`` is ``child`` itself or one of its ancestors."""
    parent = os.path.normcase(os.path.realpath(parent))
    child = os.path.normcase(os.path.realpath(child))
    try:
        return os.path.commonpath([parent, child]) == parent
    except ValueError:
        # Raised e.g. for paths on different drives: they cannot be nested.
        return False


def start_encrypt(
    input_file_path,
    output_file_path: str = None,
    ignore_files: Union[List, str, None] = None,
    except_main_file: int = 1,
):
    """Copy a project to an output directory and compile its Python files there.

    Steps: copy the input to the output directory, collect the ``.py`` files,
    drop the ones that must stay as source, compile the rest, delete the
    compiled sources and finally rename the generated ``.so`` / ``.pyd`` files.

    Args:
        input_file_path: File or directory to process.
        output_file_path: Output directory. Defaults to
            ``<input>/dist/<input name>`` for a directory input and to
            ``<input's folder>/dist`` for a file input.
        ignore_files: Regular expression(s) of paths to leave untouched.
        except_main_file: Truthy to leave files that contain ``__main__``
            uncompiled.

    Raises:
        AssertionError: ``input_file_path`` is empty, or equals
            ``output_file_path`` as given.
        ValueError: ``output_file_path`` is an existing file, or it resolves
            to the input directory itself or to one of its parents.
    """
    assert input_file_path, "input_file_path cannot be null"

    assert (
        input_file_path != output_file_path
    ), "output_file_path must be diffent with input_file_path"

    if output_file_path and os.path.isfile(output_file_path):
        raise ValueError("output_file_path need a dir path")

    input_file_path = os.path.abspath(input_file_path)
    if not output_file_path:  # no output path given
        if os.path.isdir(input_file_path):
            # Directory input: output goes to input_file_path/dist/project_name.
            output_file_path = os.path.join(
                input_file_path, "dist", os.path.basename(input_file_path)
            )
        else:
            output_file_path = os.path.join(os.path.dirname(input_file_path), "dist")
    else:
        output_file_path = os.path.abspath(output_file_path)
        # copy_files deletes an existing output directory before copying a
        # directory input. Spellings such as "." vs "./" pass the string
        # comparison above, so compare the resolved paths as well; otherwise
        # the source tree itself would be removed.
        if os.path.isdir(input_file_path) and _is_same_or_parent(
            output_file_path, input_file_path
        ):
            raise ValueError(
                "output_file_path must not be input_file_path or one of its parent dirs"
            )

    # Copy the original files to the target location.
    copy_files(input_file_path, output_file_path)

    files = walk_file(output_file_path)
    py_files = get_py_files(files, ignore_files)

    # Drop the files that must not be compiled.
    need_encrypted_py = filter_cannot_encrypted_py(py_files, except_main_file)

    encrypted_py = encrypt_py(need_encrypted_py)

    delete_files(encrypted_py)
    rename_excrypted_file(output_file_path)

    logger.debug(
        "加密完成 total_count={}, success_count={}, 生成到 {}".format(
            len(need_encrypted_py), len(encrypted_py), output_file_path
        )
    )


# The docstring below is the command-line help text: execute() prints
# usage.__doc__, so it is user-facing output and is kept verbatim.
def usage():
    """
    python代码 加密|加固
    参数说明:
    -i | --input_file_path 待加密文件或文件夹路径,可是相对路径或绝对路径
    -o | --output_file_path 加密后的文件输出路径,默认在input_file_path下创建dist文件夹,存放加密后的文件
    -I | --ignore_files 不需要加密的文件或文件夹,逗号分隔
    -m | --except_main_file 不加密包含__main__的文件(主文件加密后无法启动), 值为0、1。 默认为1
    """


def execute():
    """Command-line entry point: parse ``sys.argv`` and run ``start_encrypt``.

    Options: ``-i/--input_file_path`` (required), ``-o/--output_file_path``,
    ``-I/--ignore_files`` (comma separated), ``-m/--except_main_file``
    (integer) and ``-h/--help``.

    ``--help`` prints the usage text and exits with status 0. An unknown
    option, a missing ``-i`` or a non-integer ``-m`` prints the usage text and
    exits with status 2, so calling scripts can detect the failure.
    """
    # Only option parsing can raise GetoptError, so only that is guarded.
    try:
        options, _args = getopt.getopt(
            sys.argv[1:],
            "hi:o:I:m:",
            [
                "help",
                "input_file_path=",
                "output_file_path=",
                "ignore_files=",
                "except_main_file=",
            ],
        )
    except getopt.GetoptError:
        print(usage.__doc__)
        sys.exit(2)

    input_file_path = output_file_path = ignore_files = ""
    except_main_file = 1

    for name, value in options:
        if name in ("-h", "--help"):
            print(usage.__doc__)
            sys.exit()

        elif name in ("-i", "--input_file_path"):
            input_file_path = value

        elif name in ("-o", "--output_file_path"):
            output_file_path = value

        elif name in ("-I", "--ignore_files"):
            ignore_files = value.split(",")

        elif name in ("-m", "--except_main_file"):
            try:
                except_main_file = int(value)
            except ValueError:
                print("-m 或 except_main_file 的值需为 0 或 1")
                print(usage.__doc__)
                sys.exit(2)

    if not input_file_path:
        print("需指定-i 或 input_file_path")
        print(usage.__doc__)
        sys.exit(2)

    start_encrypt(input_file_path, output_file_path, ignore_files, except_main_file)


# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    execute()

编译

将上面的源代码保存为 encrypt_py.py 文件,执行。

-I: 忽略 .venv 目录、encrypt_py.py 文件和 tests 目录(多个规则用逗号分隔,支持正则)

python encrypt_py.py -i . -I ".venv/*,encrypt_py.py,tests/*"
(.venv) (base) ➜  inference_server git:(main) ✗ python encrypt_py.py -i . -I ".venv/*,encrypt_py.py,tests/*"
delete.c:3235:21: warning: fallthrough annotation in unreachable code [-Wunreachable-code-fallthrough]
CYTHON_FALLTHROUGH;
^
delete.c:556:34: note: expanded from macro 'CYTHON_FALLTHROUGH'
#define CYTHON_FALLTHROUGH __attribute__((fallthrough))
^
delete.c:3246:21: warning: fallthrough annotation in unreachable code [-Wunreachable-code-fallthrough]
CYTHON_FALLTHROUGH;
^
delete.c:556:34: note: expanded from macro 'CYTHON_FALLTHROUGH'
#define CYTHON_FALLTHROUGH __attribute__((fallthrough))

中间的警告可以忽略:它们是编译器针对 Cython 生成的 C 代码中不可达代码(-Wunreachable-code)给出的提示,不影响编译结果。

成果

会在源码目录的 dist 下生成对应的编译好的 so 动态库(中间产物 C 代码和原始 py 文件会被删除)。试着运行,发现一点儿问题没有。

(.venv) (base) ➜  inference_server git:(main) ✗ python encrypt_py.py -i . -I ".venv/*,encrypt_py.py,tests/*"
delete.c:3235:21: warning: fallthrough annotation in unreachable code [-Wunreachable-code-fallthrough]
CYTHON_FALLTHROUGH;
^
delete.c:556:34: note: expanded from macro 'CYTHON_FALLTHROUGH'
#define CYTHON_FALLTHROUGH __attribute__((fallthrough))
^
1 warning generated.
inference.c:8168:26: warning: code will never be executed [-Wunreachable-code]
module = PyImport_ImportModuleLevelObject(
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1 warning generated.
inference.c:8168:26: warning: code will never be executed [-Wunreachable-code]
module = PyImport_ImportModuleLevelObject(
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1 warning generated.
(.venv) (base) ➜ inference_server git:(main) ✗ pwd
/Users/bytedance/GolandProjects/inference_server
(.venv) (base) ➜ inference_server git:(main) ✗ cd dist/inference_server
(.venv) (base) ➜ inference_server git:(main) ✗ python main.py
INFO: Will watch for changes in these directories: ['/Users/bytedance/GolandProjects/inference_server/dist/inference_server']
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
INFO: Started reloader process [50384] using StatReload
INFO: Started server process [50394]
INFO: Waiting for application startup.
2024-09-06 16:48:48,095 DEBUG Using AsyncIOEngine.POLLER as I/O engine
INFO: Application startup complete.


检查是否还包含源码

(.venv) (base) ➜  inference_server git:(main) ✗ tree src/chain/http
src/chain/http
├── __init__.py
├── __pycache__
│   └── __init__.cpython-310.pyc
├── app.so
├── build
│   └── lib.macosx-10.9-universal2-cpython-310
│   └── inference_server
│   └── src
│   └── chain
│   └── http
│   ├── app.so
│   ├── middleware.so
│   └── streaming_response.so
├── middleware.so
└── streaming_response.so

7 directories, 8 files

可以看到,除了不会被编译的 __init__.py 之外,已经没有 python 源码啦,都是 so 库文件了。完美~

大模型 LLM function call

· 阅读需 11 分钟
wencaiwulue
Senior DevOps / System Engineer @ bytedance

背景

在做大模型私有化相关的内容,遇到需要支持 function call 的功能

主要流程

流程

由于大模型不支持流式输入,因此实际上对于 function call,都是通过单次的 http 请求来实现的。

举例说明

使用大模型 function call 查询 南山图书馆位置的需求。

按照 OpenAI 的 function 格式发送请求

假设我们有个 function LocationTool, 作用是根据指定名称,查询具体位置。详细参数描述如下:

LocationTool:根据用户给出的问题查询具体位置或推荐具体地点。当问题中给出指定地点询问位置时可以返回位置信息,包含酒店、餐厅、博物馆、政府/医疗/教学机构等,但不包含市辖区、城市名称、乡镇名称。问题中可对餐厅、博物馆、政府单位、景点等进行推荐,返回推荐的具体地点。
支持如下参数:
location_keyword:表示用户想要查找的地点,必填参数,字符串类型;
poi_keyword:表示地点类型,必填参数,字符串类型;
latitude:表示UserInfo中纬度,必填参数,浮点型;
longitude:表示UserInfo中经度,必填参数,浮点型;
sort:表示排序规则,0=不排序,1=最近优先,2=最便宜优先,3=最贵优先,默认值为0,缺省参数,整型。
使用举例,需要出本插件的情况:问题是 "上海台儿庄路的特色小餐馆",参数为 {\"location_keyword\":\"上海台儿庄路\",\"poi_keyword\":\"特色小餐馆\",\"latitude\":\"30.11\",\"longitude\":\"121.45\"}。类似以下文本不出本插件:北京好玩的地方、门头沟在哪、方恒的消防栓在哪、推荐一些当地特产。
➜  ~ curl -X POST localhost:8000/chat/completions -H "Content-Type: application/json" -d '{
"model": "xxxxxx",
"stream": false,
"messages": [
{
"role": "user",
"content": "请帮我查询一下南山图书馆的位置"
}
],
"temperature": 0.95,
"top_p": 0.8,
"top_k": 10,
"repetition_penalty": 1,
"max_new_tokens": 2048,
"tools": [{
"id": "1",
"type": "function",
"function": {
"name": "LocationTool",
"description": "根据用户给出的问题查询具体位置或推荐具体地点。当问题中给出指定地点询问位置时可以返回位置信息,包含酒店、餐厅、博物馆、政府/医疗/教学机构等,但不包含市辖区、城市名称、乡镇名称。问题中可对餐厅、博物馆、政府单位、景点等进行推荐,返回推荐的具体地点。",
"parameters": {
"type": "object",
"properties": {
"location_keyword": {
"type": "string",
"description": "表示用户想要查找的地点"
},
"poi_keyword": {
"type": "string",
"description": "表示UserInfo中纬度"
},
"latitude": {
"type": "string",
"description": "表示UserInfo中纬度"
},
"longitude": {
"type": "string",
"description": "表示UserInfo中纬度"
},
"sort": {
"type": "integer",
"description": "0=不排序,1=最近优先,2=最便宜优先,3=最贵优先,默认值为0"
}
},
"required": ["location_keyword","poi_keyword","latitude","longitude"]
}
}
}]
}'
{"model":"xxxxxx","choices":[{"delta":{"role":"assistant","content":"\n当前提供了 1 个工具,分别是['LocationTool'],需求为查询南山图书馆的位置,需要调用 LocationTool 获取相关信息。","tool_calls":[{"type":"function","function":{"name":"LocationTool","arguments":"{\"latitude\": \"22.536444\", \"location_keyword\": \"南山图书馆\"}"},"index":0,"id":"call_nbkpltop1qzycolf8gw3n7zk"}]},"index":0,"finish_reason":"tool_calls","logprobs":null}],"usage":{"prompt_tokens":220,"completion_tokens":82,"total_tokens":302}}

➜ ~

可以看到响应包含

{
"tool_calls": [
{
"type": "function",
"function": {
"name": "LocationTool",
"arguments": "{\"latitude\": \"22.536444\", \"location_keyword\": \"南山图书馆\"}"
},
"index": 0,
"id": "call_nbkpltop1qzycolf8gw3n7zk"
}
]
}

调用 function call 获取结果

{
"type": "function",
"function": {
"name": "LocationTool",
"arguments": "{\"latitude\": \"22.536444\", \"location_keyword\": \"南山图书馆\"}"
},
"index": 0,
"id": "call_nbkpltop1qzycolf8gw3n7zk"
}

使用给到的参数,调用其他组件 LocationTool 方法,获取到结果。

南山区南山大道2093号

将结果给到大模型,做最终的输出

curl -X POST localhost:8000/chat/completions -H "Content-Type: application/json" -d '{
"model": "xxxxxx",
"stream": false,
"messages": [
{
"role": "user",
"content": "请帮我查询一下南山图书馆的位置"
},
{
"role":"assistant",
"content":"\n当前提供了 1 个工具,分别是[\"LocationTool\"],需求是查询南山图书馆的位置,需要调用 LocationTool 获取相关信息。","tool_calls":[{"type":"function","function":{"name":"LocationTool","arguments":"{\"latitude\": \"22.5385\", \"location_keyword\": \"南山图书馆\"}"},"index":0,"id":"1"}]
},
{
"role": "tool",
"content": "南山区南山大道2093号",
"tool_call_id": "1"
}
],
"temperature": 0.95,
"top_p": 0.8,
"top_k": 10,
"repetition_penalty": 1,
"max_new_tokens": 2048,
"tools": [{
"id": "1",
"type": "function",
"function": {
"name": "LocationTool",
"description": "根据用户给出的问题查询具体位置或推荐具体地点。当问题中给出指定地点询问位置时可以返回位置信息,包含酒店、餐厅、博物馆、政府/医疗/教学机构等,但不包含市辖区、城市名称、乡镇名称。问题中可对餐厅、博物馆、政府单位、景点等进行推荐,返回推荐的具体地点。",
"parameters": {
"type": "object",
"properties": {
"location_keyword": {
"type": "string",
"description": "表示用户想要查找的地点"
},
"poi_keyword": {
"type": "string",
"description": "表示UserInfo中纬度"
},
"latitude": {
"type": "string",
"description": "表示UserInfo中纬度"
},
"longitude": {
"type": "string",
"description": "表示UserInfo中纬度"
},
"sort": {
"type": "integer",
"description": "0=不排序,1=最近优先,2=最便宜优先,3=最贵优先,默认值为0"
}
},
"required": ["location_keyword","poi_keyword","latitude","longitude"]
}
}
}]
}'
{"model":"xxxxxx","choices":[{"message":{"role":"assistant","content":"南山图书馆的位置是南山区南山大道 2093 号。","tool_calls":[]},"index":0,"finish_reason":"stop","logprobs":null}],"usage":{"prompt_tokens":291,"completion_tokens":28,"total_tokens":319}}

最终的结果

模型回复 南山图书馆的位置是南山区南山大道 2093 号,得到了我们想要的答案。

探究本质

本质还是以训练的格式,直接发送给大模型。让大模型按照指定格式来识别。

➜  ~ curl -X POST localhost:8000/chat/completions -H "Content-Type: application/json" -d '{
"model": "xxxxxx",
"stream": false,
"messages": [
{
"role":"assistant",
"content":"<|Functions|>:\n- LocationTool:根据用户给出的问题查询具体位置或推荐具体地点。当问题中给出指定地点询问位置时可以返回位置信息,包含酒店、餐厅、博物馆、政府/医疗/教学机构等,但不包含市辖区、城市名称、乡镇名称。问题中可对餐厅、博物馆、政府单位、景点等进行推荐,返回推荐的具体地点。支持如下参数:location_keyword:表示用户 想要查找的地点,必填参数,字符串类型;poi_keyword:表示地点类型,必填参数,字符串类型;latitude:表示UserInfo中纬度,必填参数,浮点型;longitude:表示UserInfo中经度,必填 参数,浮点型;sort:表示排序规则,0=不排序,1=最近优先,2=最便宜优先,3=最贵优先,默认值为0,缺省参数,整型。比如,需要出本插件的情况:问题是 \"上海台儿庄路的特色小餐馆\" ,参数为 {\"location_keyword\":\"上海台儿庄路\",\"poi_keyword\":\"特色小餐馆\",\"latitude\":\"30.11\",\"longitude\":\"121.45\"}。类似以下文本不出本插件:北京好玩的地方、门头沟在哪、方恒的消防栓在哪、推荐一些当地特产。\n\n<|UserInfo|>:\n{\"system_time\": \"2023-09-14T19:27:42\", \"longitude\": 106.03, \"latitude\": 42.04}"
},
{
"role": "user",
"content": "请帮我查询一下南山图书馆的位置"
}
],
"temperature": 0.95,
"top_p": 0.8,
"top_k": 10,
"repetition_penalty": 1,
"max_new_tokens": 2048
}'
{"model":"xxxxxx","choices":[{"message":{"role":"assistant","content":"\n当前提供了 1 个工具,分别是['LocationTool'],需求是查询南山图书馆的位置,需要调用 LocationTool 获取相关信息。","tool_calls":[{"type":"function","function":{"name":"LocationTool","arguments":"{\"latitude\": 42.04, \"longitude\": 106.03, \"location_keyword\": \"南山图书馆\"}"},"index":0,"id":"call_uwy26bq6wvx7ud0g3anhrxhu"}]},"index":0,"finish_reason":"tool_calls","logprobs":null}],"usage":{"prompt_tokens":361,"completion_tokens":87,"total_tokens":448}}
➜  ~ curl -X POST localhost:8000/chat/completions -H "Content-Type: application/json" -d '{
"model": "xxxxxx",
"stream": false,
"messages": [
{
"role":"assistant",
"content":"<|Functions|>:\n- LocationTool:根据用户给出的问题查询具体位置或推荐具体地点。当问题中给出指定地点询问位置时可以返回位置信息,包含酒店、餐厅、博物馆、政府/医疗/教学机构等,但不包含市辖区、城市名称、乡镇名称。问题中可对餐厅、博物馆、政府单位、景点等进行推荐,返回推荐的具体地点。支持如下参数:location_keyword:表示用户 想要查找的地点,必填参数,字符串类型;poi_keyword:表示地点类型,必填参数,字符串类型;latitude:表示UserInfo中纬度,必填参数,浮点型;longitude:表示UserInfo中经度,必填 参数,浮点型;sort:表示排序规则,0=不排序,1=最近优先,2=最便宜优先,3=最贵优先,默认值为0,缺省参数,整型。比如,需要出本插件的情况:问题是 \"上海台儿庄路的特色小餐馆\" ,参数为 {\"location_keyword\":\"上海台儿庄路\",\"poi_keyword\":\"特色小餐馆\",\"latitude\":\"30.11\",\"longitude\":\"121.45\"}。类似以下文本不出本插件:北京好玩的地方、门头沟在哪、方恒的消防栓在哪、推荐一些当地特产。\n\n<|UserInfo|>:\n{\"system_time\": \"2023-09-14T19:27:42\", \"longitude\": 106.03, \"latitude\": 42.04}"
},
{
"role": "user",
"content": "请帮我查询一下南山图书馆的位置"
},
{
"role":"assistant",
"content":"<|Observation|>:南山区南山大道2093号"
}
],
"temperature": 0.95,
"top_p": 0.8,
"top_k": 10,
"repetition_penalty": 1,
"max_new_tokens": 2048
}'
{"model":"xxxxxx","choices":[{"message":{"role":"assistant","content":"为您查询到南山图书馆的地址为:南山区南山大道 2093 号。\n\n如果您想了解更多关于南山图书馆的信息,或者有其他需求,请继续提问。","tool_calls":[]},"index":0,"finish_reason":"stop","logprobs":null}],"usage":{"prompt_tokens":384,"completion_tokens":53,"total_tokens":437}}

Kubernetes Pod pending 很久问题分析及解决

· 阅读需 7 分钟
wencaiwulue
Senior DevOps / System Engineer @ bytedance

背景

在 Cloud IDE 运行过程中,会启动一个 deployment 用做工作空间,然后在 pod 中运行 ide-server。但是当性能压测的时候,出现了问题。

问题

当 ns 下存在 5w deployment 时,启动 500 个 Pod 从 create 到 running 耗时 5分30秒。不满足需求,需要优化。

现象

Pod 处于 Pending 状态,并且 node ip 是空的,也没有任何 event,说明 Pod 还没被调度起来。

feiyan-1000000000   ws-cr3ipb4uccam7di1b9u0-6dbf68f48c-z6vc6                0/1     Pending             0          2m50s   <none>            <none>      <none>           <none>
feiyan-1000000000 ws-cr3ipb4uccam7di1b9u0-6dbf68f48c-zcbqn 0/1 Pending 0 2m52s <none> <none> <none> <none>
feiyan-1000000000 ws-cr3ipb4uccam7di1b9u0-6dbf68f48c-zfz8g 0/1 Pending 0 2m49s <none> <none> <none> <none>
feiyan-1000000000 ws-cr3ipb4uccam7di1b9u0-6dbf68f48c-zml6s 0/1 Pending 0 2m50s <none> <none> <none> <none>
feiyan-1000000000 ws-cr3ipb4uccam7di1b9u0-6dbf68f48c-zmmql 0/1 Pending 0 2m52s <none> <none> <none> <none>
Name: ws-cr3ipb4uccam7di1b9u0-6dbf68f48c-zmmql
Namespace: feiyan-1000000000
Priority: 0
Service Account: default
Node: <none>
Labels: app.kubernetes.io/instance=ws-cr3ipb4uccam7di1b9u0
app.kubernetes.io/name=ws-cr3ipb4uccam7di1b9u0
plugin=code-server
pod-template-hash=6dbf68f48c
workspace=ws-cr3ipb4uccam7di1b9u0
Annotations: <none>
Status: Pending
IP:
IPs: <none>
Controlled By: ReplicaSet/ws-cr3ipb4uccam7di1b9u0-6dbf68f48c
Containers:
code-server:
Image: eps-beijing.cr.xxx.com/infcprelease/ide-server:v1.4.0-2408131650
Port: 8910/TCP
Host Port: 0/TCP
Args:
server
--config=/etc/feiyan/config.yaml
--webide-workspace-id=cr3ipb4uccam7di1b9u0
Limits:
cpu: 2
memory: 4Gi
Requests:
cpu: 500m
memory: 512Mi
Liveness: http-get http://:http/healthz%3Ftype=liveness delay=0s timeout=1s period=10s #success=1 #failure=3
Readiness: http-get http://:http/healthz%3Ftype=readiness delay=0s timeout=1s period=10s #success=1 #failure=3
Environment:
USER_ID: 189331
GROUP_ID: 334242
WS_HOME: /home/runner/code
WS_ID: cr3ipb4uccam7di1b9u0
HOME: /home/runner
Mounts:
/etc/feiyan from config (ro)
/home/runner/code from ws-path (rw)
/nix from nix (rw)
/var/run/secrets/kubernetes.io/serviceaccount from default-token-bj4kb (ro)
Volumes:
ws-path:
Type: HostPath (bare host directory volume)
Path: /shared/ws/1000000000/luo
HostPathType: Directory
config:
Type: ConfigMap (a volume populated by a ConfigMap)
Name: feiyan-config
Optional: false
nix:
Type: HostPath (bare host directory volume)
Path: /shared/nix/nix
HostPathType: Directory
default-token-bj4kb:
Type: Secret (a volume populated by a Secret)
SecretName: default-token-bj4kb
Optional: false
QoS Class: Burstable
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events: <none>

发现 event 是空的

常规检查

  • Node 资源足够。(未出现任何 node 压力)。排除资源不够的可能性。
  • 查看 Pod 状态。

查看 kube-scheduler 日志,发现调度一个 Pod 需要 700ms 左右。500 * 700ms = 350s,光是调度就需要花费350s,太慢了。

I0827 18:35:36.050146       1 trace.go:205] Trace[401520597]: "Scheduling" namespace:feiyan-1000000000,name:ws-cr48mgc00l3h770u0880-78b6b5bf5c-brl62 (27-Aug-2024 18:35:35.424) (total time: 625ms):
Trace[401520597]: ---"Prioritizing done" 625ms (18:35:00.050)
Trace[401520597]: [625.270367ms] [625.270367ms] END
I0827 18:35:36.813014 1 trace.go:205] Trace[1574660176]: "Scheduling" namespace:feiyan-1000000000,name:ws-cr48mgb1lsr0vb3dkji0-8645fd9665-wmvw9 (27-Aug-2024 18:35:36.050) (total time: 762ms):
Trace[1574660176]: ---"Prioritizing done" 762ms (18:35:00.812)
Trace[1574660176]: [762.728076ms] [762.728076ms] END
I0827 18:35:37.460196 1 trace.go:205] Trace[1439543403]: "Scheduling" namespace:feiyan-1000000000,name:ws-cr48mgb1lsr0vb3dkjm0-84c544f4b7-b4kcp (27-Aug-2024 18:35:36.813) (total time: 647ms):
Trace[1439543403]: ---"Prioritizing done" 646ms (18:35:00.460)
Trace[1439543403]: [647.020389ms] [647.020389ms] END
I0827 18:35:38.140289 1 trace.go:205] Trace[980518911]: "Scheduling" namespace:feiyan-1000000000,name:ws-cr48mg3lpur1r9npv5vg-76b6b8c9d5-5n8wb (27-Aug-2024 18:35:37.460) (total time: 679ms):
Trace[980518911]: ---"Prioritizing done" 679ms (18:35:00.140)
Trace[980518911]: [679.90811ms] [679.90811ms] END

从 kube-scheduler 入手

既然处理慢,我们就来看看做了什么。

// Schedule tries to schedule the given pod to one of the nodes in the node list.
// If it succeeds, it will return the name of the node.
// If it fails, it will return a FitError error with reasons.
// NOTE: this is an excerpt; each "..." line marks code omitted from the original function.
func (g *genericScheduler) Schedule(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) (result ScheduleResult, err error) {
// The trace is named "Scheduling" and carries the pod's namespace and name as fields.
trace := utiltrace.New("Scheduling", utiltrace.Field{Key: "namespace", Value: pod.Namespace}, utiltrace.Field{Key: "name", Value: pod.Name})
// Deferred, so it runs when Schedule returns. The 100ms argument is presumably the
// threshold above which the trace is written to the log — confirm in k8s.io/utils/trace.
defer trace.LogIfLong(100 * time.Millisecond)

...
trace.Step("Snapshotting scheduler cache and node infos done")
...
trace.Step("Computing predicates done")
...
// Last step recorded before returning; the work between the previous step and this one is omitted here.
trace.Step("Prioritizing done")

return ScheduleResult{
SuggestedHost: host,
EvaluatedNodes: len(feasibleNodes) + len(filteredNodesStatuses),
FeasibleNodes: len(feasibleNodes),
}, err
}

从日志上分析,每次调度只打印了 "Scheduling" 和 "Prioritizing done" 两行日志。对照代码,trace 中各步骤的先后顺序如下:

  • "Snapshotting scheduler cache and node infos done"
  • "Computing predicates done"
  • "Prioritizing done"
  • "Scheduling" namespace:feiyan-1000000000,name:ws-cr48meotaisig5m53nc0-584fbb955c-qpppn

打印顺序是这样的,说明前面两步不耗时,后边两步耗时久。看看从 "Computing predicates done" 到 "Prioritizing done" 之间做了什么。看了代码,是 prioritizeNodes 方法在执行,它会运行许多 plugin。

登陆 master node,修改 kube-scheduler 配置

我们先把所有的 plugin 都禁用,发现调度很快。 后来,当我们一个一个插件排除的时候,发现了罪魁祸首。PodTopologySpread

root@ncp3cnq6q0djb2ejvfct0:/etc/kubernetes/manifests# cat /etc/kubernetes/scheduler/kubescheduler-config.yaml
---
# kube-scheduler configuration (kind: KubeSchedulerConfiguration).
apiVersion: kubescheduler.config.k8s.io/v1beta1
clientConnection:
  acceptContentTypes: application/json
  burst: 100
  contentType: application/vnd.kubernetes.protobuf
  kubeconfig: /etc/kubernetes/scheduler.conf
  qps: 50
enableContentionProfiling: false
enableProfiling: false
healthzBindAddress: 0.0.0.0:10251
kind: KubeSchedulerConfiguration
leaderElection:
  leaderElect: true
  leaseDuration: 15s
  renewDeadline: 10s
  resourceLock: leases
  resourceName: kube-scheduler
  resourceNamespace: kube-system
  retryPeriod: 2s
metricsBindAddress: 0.0.0.0:10251
parallelism: 16
percentageOfNodesToScore: 0
podInitialBackoffSeconds: 1
podMaxBackoffSeconds: 10
profiles:
  - schedulerName: default-scheduler
    # Plugins enabled/disabled per extension point for this profile.
    plugins:
      queueSort:
        enabled:
          - name: Coscheduling
        disabled:
          - name: "*"
      preFilter:
        enabled:
          - name: Coscheduling
          - name: GPUShare
      filter:
        enabled:
          - name: GPUShare
      postFilter:
        enabled:
          - name: Coscheduling
      score:
        enabled:
          - name: RequestedToCapacityRatio
            weight: 4
        disabled:
          - name: NodeResourcesLeastAllocated
      permit:
        enabled:
          - name: Coscheduling
      reserve:
        enabled:
          - name: Coscheduling
          - name: GPUShare
      preBind:
        enabled:
          - name: GPUShare
      postBind:
        enabled:
          - name: Coscheduling
    # Arguments passed to the individual plugins.
    pluginConfig:
      - name: Coscheduling
        args:
          permitWaitingTimeSeconds: 10
          deniedPGExpirationTimeSeconds: 3
          kubeConfigPath: /etc/kubernetes/scheduler.conf
      - name: GPUShare
        args:
          policy: binpack
          weightOfCore: 20
          scheduleMode: index
          maxContainersPerCard: 16
      - name: RequestedToCapacityRatio
        args:
          shape:
            - utilization: 0
              score: 10
            - utilization: 100
              score: 0
          resources:
            - name: vke.volcengine.com/mgpu-core
              weight: 1
            - name: vke.volcengine.com/mgpu-memory
              weight: 4
            - name: cpu
              weight: 1
            - name: memory
              weight: 1

有意思的是,kube-scheduler 是静态 Pod,由 kubelet 拉起来的,启动方法和常规不同。

root@ncp3cnq6q0djb2ejvfct0:/etc/kubernetes/manifests# ls -alh /etc/kubernetes/manifests
total 28K
drwxr-xr-x 2 root root 4.0K Aug 29 18:17 .
drwxr-xr-x 5 root root 4.0K Aug 29 18:17 ..
-rw------- 1 root root 2.7K May 17 11:17 etcd.yaml
-rw------- 1 root root 4.8K Aug 27 21:04 kube-apiserver.yaml
-rw------- 1 root root 3.8K Aug 27 18:26 kube-controller-manager.yaml
-rw------- 1 root root 2.1K Aug 27 21:04 kube-scheduler.yaml
root@ncp3cnq6q0djb2ejvfct0:/etc/kubernetes/manifests#

当把 kube-scheduler.yaml 移出 manifests 目录,kube-scheduler pod 会自动销毁。移动回来,则会新建出来。

root@ncp3cnq6q0djb2ejvfct0:/etc/kubernetes/manifests# ls
etcd.yaml kube-apiserver.yaml kube-controller-manager.yaml kube-scheduler.yaml
root@ncp3cnq6q0djb2ejvfct0:/etc/kubernetes/manifests# kubectl get pods -n kube-system | grep sche
kube-scheduler-192.168.160.103 1/1 Running 0 80m
kube-scheduler-192.168.161.127 1/1 Running 0 76m
kube-scheduler-192.168.161.254 1/1 Running 2 106m
scheduler-controller-manager-5479cd5fbf-8bhgk 1/1 Running 0 104d
scheduler-controller-manager-5479cd5fbf-lgpqz 1/1 Running 0 104d
root@ncp3cnq6q0djb2ejvfct0:/etc/kubernetes/manifests# mv kube-scheduler.yaml ../
root@ncp3cnq6q0djb2ejvfct0:/etc/kubernetes/manifests# kubectl get pods -n kube-system | grep sche
kube-scheduler-192.168.160.103 1/1 Running 0 80m
kube-scheduler-192.168.161.254 1/1 Running 2 106m
scheduler-controller-manager-5479cd5fbf-8bhgk 1/1 Running 0 104d
scheduler-controller-manager-5479cd5fbf-lgpqz 1/1 Running 0 104d
root@ncp3cnq6q0djb2ejvfct0:/etc/kubernetes/manifests# mv ../kube-scheduler.yaml .
root@ncp3cnq6q0djb2ejvfct0:/etc/kubernetes/manifests# kubectl get pods -n kube-system | grep sche
kube-scheduler-192.168.160.103 1/1 Running 0 80m
kube-scheduler-192.168.161.127 0/1 Running 1 2s
kube-scheduler-192.168.161.254 1/1 Running 2 106m
scheduler-controller-manager-5479cd5fbf-8bhgk 1/1 Running 0 104d
scheduler-controller-manager-5479cd5fbf-lgpqz 1/1 Running 0 104d

kube-scheduler-config.jpg

效果

启动很快啦

  • 现在10s内启动500个ide,全部启动成功running耗时61s左右,之前是5分28s。
  • 10s内启动 300 个ide,全部启动成功running耗时44s左右,之前是3分22s。
ws-cr490ueon2flis42a6kg-656b5d498-wfmnm    1/1     Running   0          77s   172.16.37.222   192.168.161.191   <none>           <none>
ws-cr490ueon2flis42a6lg-74c75fc6bb-vrnsr 1/1 Running 0 77s 172.16.86.238 192.168.161.64 <none> <none>
ws-cr490ueon2flis42a6mg-7c677b94cc-ntwdk 1/1 Running 0 77s 172.16.59.121 192.168.160.206 <none> <none>
ws-cr490ueon2flis42a6og-784695b496-flskx 1/1 Running 0 77s 172.16.59.53 192.168.160.206 <none> <none>
ws-cr490ueon2flis42a6pg-5fb44dc64d-7c8lx 1/1 Running 0 77s 172.16.87.186 192.168.160.198 <none> <none>
ws-cr490ueon2flis42a6qg-5d74677688-z87qq 1/1 Running 0 76s 172.16.54.202 192.168.161.65 <none> <none>
ws-cr490ueon2flis42a73g-dc8d4cc75-tqp92 1/1 Running 0 77s 172.16.86.237 192.168.161.64 <none> <none>

LLM 大模型 Ollama 项目解读

· 阅读需 6 分钟
wencaiwulue
Senior DevOps / System Engineer @ bytedance

前言

最近在看大模型的相关知识,然后想在本地启动一个 LLM ,用来做实验,然后找到了 ollama 。体验挺好的,所以拿来研究下。

项目

在本地构建 ollama 二进制可执行文件

  • 克隆项目:git clone https://github.com/ollama/ollama.git
  • 构建依赖的 llama-server 二进制:cd ollama && ./scripts/build_darwin.sh
  • 构建 ollama 二进制:CGO_ENABLED=1 GOOS=darwin GOARCH=arm64 go build

可以看到,还有许多接口

POST   /api/pull
POST /api/generate
POST /api/chat
POST /api/embed
POST /api/embeddings
POST /api/create
POST /api/push
POST /api/copy
DELETE /api/delete
POST /api/show
POST /api/blobs/:digest
HEAD /api/blobs/:digest
GET /api/ps
POST /v1/chat/completions
POST /v1/completions
POST /v1/embeddings
GET /v1/models
GET /v1/models/:model
GET /
GET /api/tags
GET /api/version
HEAD /
HEAD /api/tags
HEAD /api/version
time=2024-08-14T16:27:00.222+08:00 level=INFO source=memory.go:309 msg="offload to metal" layers.requested=-1 layers.model=29 layers.offload=29 layers.split="" memory.available="[21.3 GiB]" memory.required.full="5.4 GiB" memory.required.partial="5.4 GiB" memory.required.kv="448.0 MiB" memory.required.allocations="[5.4 GiB]" memory.weights.total="3.9 GiB" memory.weights.repeating="3.4 GiB" memory.weights.nonrepeating="426.4 MiB" memory.graph.full="478.0 MiB" memory.graph.partial="478.0 MiB"
time=2024-08-14T16:27:00.756+08:00 level=INFO source=server.go:393 msg="starting llama server" cmd="/var/folders/30/cmv9c_5j3mq_kthx63sb1t5c0000gn/T/ollama3971698503/runners/metal/ollama_llama_server --model /Users/bytedance/.ollama/models/blobs/sha256-43f7a214e5329f672bb05404cfba1913cbb70fdaa1a17497224e1925046b0ed5 --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 29 --parallel 4 --port 61974"

仔细看命令

ollama_llama_server --model /Users/bytedance/.ollama/models/blobs/sha256-43f7a214e5329f672bb05404cfba1913cbb70fdaa1a17497224e1925046b0ed5 --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 29 --parallel 4 --port 61974

本质上使用的是 llama.cpp 提供的功能。直接使用 llama.cpp 的功能也可以实现相同的效果。

➜  bin ./llama-server --model /Users/bytedance/.ollama/models/blobs/sha256-43f7a214e5329f672bb05404cfba1913cbb70fdaa1a17497224e1925046b0ed5 --ctx-size 8192 --batch-size 512 --log-disable --n-gpu-layers 29 --parallel 4 --port 61974
INFO [ main] build info | tid="0x1fa794c00" timestamp=1723624128 build=3581 commit="06943a69"
INFO [ main] system info | tid="0x1fa794c00" timestamp=1723624128 n_threads=8 n_threads_batch=-1 total_threads=10 system_info="AVX = 0 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 1 | SVE = 0 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | "
llama_model_loader: loaded meta data with 21 key-value pairs and 339 tensors from /Users/bytedance/.ollama/models/blobs/sha256-43f7a214e5329f672bb05404cfba1913cbb70fdaa1a17497224e1925046b0ed5 (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = qwen2
llama_model_loader: - kv 1: general.name str = Qwen2-7B-Instruct
llama_model_loader: - kv 2: qwen2.block_count u32 = 28
llama_model_loader: - kv 3: qwen2.context_length u32 = 32768
llama_model_loader: - kv 4: qwen2.embedding_length u32 = 3584
llama_model_loader: - kv 5: qwen2.feed_forward_length u32 = 18944
llama_model_loader: - kv 6: qwen2.attention.head_count u32 = 28
llama_model_loader: - kv 7: qwen2.attention.head_count_kv u32 = 4
llama_model_loader: - kv 8: qwen2.rope.freq_base f32 = 1000000.000000
llama_model_loader: - kv 9: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001
llama_model_loader: - kv 10: general.file_type u32 = 2
llama_model_loader: - kv 11: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 12: tokenizer.ggml.pre str = qwen2
llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,152064] = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,152064] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 15: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv 16: tokenizer.ggml.eos_token_id u32 = 151645
llama_model_loader: - kv 17: tokenizer.ggml.padding_token_id u32 = 151643
llama_model_loader: - kv 18: tokenizer.ggml.bos_token_id u32 = 151643
llama_model_loader: - kv 19: tokenizer.chat_template str = {% for message in messages %}{% if lo...
llama_model_loader: - kv 20: general.quantization_version u32 = 2
llama_model_loader: - type f32: 141 tensors
llama_model_loader: - type q4_0: 197 tensors
llama_model_loader: - type q6_K: 1 tensors
...

打开 本地服务

llama cpp.png

可以看到,llama-server 已经内嵌了一个简单的 UI 页面,也可以切换到右上角的 New UI,有个更加美观的页面。在这里简单的页面上我们就可以和 LLM 大模型进行交流了。

llama_cpp_new_ui.png

但是在 Ollama 启动的 ollama_llama_server 中,这个 UI 被移除了。

curl 'http://localhost:61974/completion' \
-H 'Accept: text/event-stream' \
-H 'Accept-Language: en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7' \
-H 'Cache-Control: no-cache' \
-H 'Connection: keep-alive' \
-H 'Content-Type: application/json' \
-H 'Cookie: gitea_incredible=jRSMcBghtF%3A63878871dcbaf7a40498c267e6df0b786550c2b7f8ab9e1d46d610e0affc4286' \
-H 'DNT: 1' \
-H 'Origin: http://localhost:61974' \
-H 'Pragma: no-cache' \
-H 'Referer: http://localhost:61974/' \
-H 'Sec-Fetch-Dest: empty' \
-H 'Sec-Fetch-Mode: cors' \
-H 'Sec-Fetch-Site: same-origin' \
-H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36' \
-H 'sec-ch-ua: "Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"' \
-H 'sec-ch-ua-mobile: ?0' \
-H 'sec-ch-ua-platform: "macOS"' \
--data-raw '{"stream":true,"n_predict":400,"temperature":0.7,"stop":["</s>","Llama:","User:"],"repeat_last_n":256,"repeat_penalty":1.18,"penalize_nl":false,"top_k":40,"top_p":0.95,"min_p":0.05,"tfs_z":1,"typical_p":1,"presence_penalty":0,"frequency_penalty":0,"mirostat":0,"mirostat_tau":5,"mirostat_eta":0.1,"grammar":"","n_probs":0,"min_keep":0,"image_data":[],"cache_prompt":true,"api_key":"","slot_id":-1,"prompt":"This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.\n\nUser: hello\nLlama:"}'
➜  ~ curl 'http://localhost:61974/completion' \
-H 'Accept: text/event-stream' \
-H 'Accept-Language: en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7' \
-H 'Cache-Control: no-cache' \
-H 'Connection: keep-alive' \
-H 'Content-Type: application/json' \
-H 'Cookie: gitea_incredible=jRSMcBghtF%3A63878871dcbaf7a40498c267e6df0b786550c2b7f8ab9e1d46d610e0affc4286' \
-H 'DNT: 1' \
-H 'Origin: http://localhost:61974' \
-H 'Pragma: no-cache' \
-H 'Referer: http://localhost:61974/' \
-H 'Sec-Fetch-Dest: empty' \
-H 'Sec-Fetch-Mode: cors' \
-H 'Sec-Fetch-Site: same-origin' \
-H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36' \
-H 'sec-ch-ua: "Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"' \
-H 'sec-ch-ua-mobile: ?0' \
-H 'sec-ch-ua-platform: "macOS"' \
--data-raw '{"stream":true,"n_predict":400,"temperature":0.7,"stop":["</s>","Llama:","User:"],"repeat_last_n":256,"repeat_penalty":1.18,"penalize_nl":false,"top_k":40,"top_p":0.95,"min_p":0.05,"tfs_z":1,"typical_p":1,"presence_penalty":0,"frequency_penalty":0,"mirostat":0,"mirostat_tau":5,"mirostat_eta":0.1,"grammar":"","n_probs":0,"min_keep":0,"image_data":[],"cache_prompt":true,"api_key":"","slot_id":-1,"prompt":"This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.\n\nUser: hello\nLlama:"}'
data: {"content":" Hello","stop":false,"id_slot":0,"multimodal":false}

data: {"content":" there","stop":false,"id_slot":0,"multimodal":false}

data: {"content":"!","stop":false,"id_slot":0,"multimodal":false}

data: {"content":" How","stop":false,"id_slot":0,"multimodal":false}

data: {"content":" can","stop":false,"id_slot":0,"multimodal":false}

data: {"content":" I","stop":false,"id_slot":0,"multimodal":false}

data: {"content":" assist","stop":false,"id_slot":0,"multimodal":false}

data: {"content":" you","stop":false,"id_slot":0,"multimodal":false}

data: {"content":" today","stop":false,"id_slot":0,"multimodal":false}

data: {"content":"?\n\n","stop":false,"id_slot":0,"multimodal":false}

data: {"content":"","stop":false,"id_slot":0,"multimodal":false}

data: {"content":"","stop":false,"id_slot":0,"multimodal":false}

data: {"content":"","id_slot":0,"stop":true,"model":"/Users/bytedance/.ollama/models/blobs/sha256-43f7a214e5329f672bb05404cfba1913cbb70fdaa1a17497224e1925046b0ed5","tokens_predicted":12,"tokens_evaluated":47,"generation_settings":{"n_ctx":2048,"n_predict":-1,"model":"/Users/bytedance/.ollama/models/blobs/sha256-43f7a214e5329f672bb05404cfba1913cbb70fdaa1a17497224e1925046b0ed5","seed":4294967295,"temperature":0.699999988079071,"dynatemp_range":0.0,"dynatemp_exponent":1.0,"top_k":40,"top_p":0.949999988079071,"min_p":0.05000000074505806,"tfs_z":1.0,"typical_p":1.0,"repeat_last_n":256,"repeat_penalty":1.1799999475479126,"presence_penalty":0.0,"frequency_penalty":0.0,"penalty_prompt_tokens":[],"use_penalty_prompt_tokens":false,"mirostat":0,"mirostat_tau":5.0,"mirostat_eta":0.10000000149011612,"penalize_nl":false,"stop":["</s>","Llama:","User:"],"n_keep":0,"n_discard":0,"ignore_eos":false,"stream":true,"logit_bias":[],"n_probs":0,"min_keep":0,"grammar":"","samplers":["top_k","tfs_z","typical_p","top_p","min_p","temperature"]},"prompt":"This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.\n\nUser: hello\nLlama:","truncated":false,"stopped_eos":false,"stopped_word":true,"stopped_limit":false,"stopping_word":"User:","tokens_cached":58,"timings":{"prompt_n":1,"prompt_ms":1468.578,"prompt_per_token_ms":1468.578,"prompt_per_second":0.6809308051734398,"predicted_n":12,"predicted_ms":336.452,"predicted_per_token_ms":28.037666666666667,"predicted_per_second":35.66630604068337}}

➜ ~

Ollama UI/GUI

UI: open-webui

pip install open-webui && open-webui serve

open_ui.png

GUI: Hollama hollama.png

架构

ollama_arch.svg

亮点

  • 将 llama.cpp 中繁杂的操作,包装成更加简单的操作。用户不需要关心如何在本地启动项目,模型文件放在哪里,只需要选择是哪种模型,就可以直接对话。
  • 更加动态地支持更多的模型:定义了 Modelfile 文件(类似于 Dockerfile 文件),并且支持模型仓库。
  • 支持自定义 Modelfile 文件,支持导入 GGUF(GPT-Generated Unified Format) 模型文件。或者发布模型。(pull/push)
  • 更加友好的图形化页面。有丰富的 UI

Linux 上修改 Docker 镜像存储位置

· 阅读需 3 分钟
wencaiwulue
Senior DevOps / System Engineer @ bytedance

有一台 Linux 开发机,用于构建镜像或者转存镜像,但是系统盘很小,只有 120G,挂载了一块数据盘,500G 的,但是 Docker 默认的镜像存储位置在系统盘,导致系统盘空间不足,需要修改 Docker 镜像存储位置。

现状

df -h
Filesystem      Size  Used Avail Use% Mounted on
udev 32G 0 32G 0% /dev
tmpfs 6.3G 27M 6.3G 1% /run
/dev/vda1 119G 87G 27G 77% /
tmpfs 32G 3.5M 32G 1% /dev/shm
tmpfs 5.0M 0 5.0M 0% /run/lock
tmpfs 32G 0 32G 0% /sys/fs/cgroup
tmpfs 128M 4.0K 128M 1% /.syskrbonly
/dev/vdb 492G 23G 444G 5% /data00
tmpfs 6.3G 8.0K 6.3G 1% /run/user/0
tmpfs 6.3G 0 6.3G 0% /run/user/2000
tmpfs 6.3G 24K 6.3G 1% /run/user/1000
tmpfs 6.3G 0 6.3G 0% /run/user/1001

可以看到系统盘在 / 下,数据盘在 /data00 下。

➜  ~ docker info | grep "Docker Root Dir"
Docker Root Dir: /var/lib/docker
➜  ~ docker version
Client: Docker Engine - Community
Version: 24.0.7
API version: 1.43
Go version: go1.20.10
Git commit: afdd53b
Built: Thu Oct 26 09:08:20 2023
OS/Arch: linux/amd64
Context: default

Server: Docker Engine - Community
Engine:
Version: 24.0.7
API version: 1.43 (minimum version 1.12)
Go version: go1.20.10
Git commit: 311b9ff
Built: Thu Oct 26 09:08:20 2023
OS/Arch: linux/amd64
Experimental: false
containerd:
Version: 1.6.26
GitCommit: 3dd1e886e55dd695541fdcd67420c2888645a495
runc:
Version: 1.1.10
GitCommit: v1.1.10-0-g18a0cb0
docker-init:
Version: 0.19.0
GitCommit: de40ad0
➜ ~

可以看到,docker 的镜像存储位置在 /var/lib/docker,但是系统盘空间不足,需要修改。

解决

sudo mv /var/lib/docker /data00

添加配置 "data-root": "/data00/docker"

➜  ~ cat /etc/docker/daemon.json
{
"insecure-registries": ["hub.byted.org", "hub.byted.org:443"],
"live-restore": true,
"data-root": "/data00/docker"
}

重启 docker

systemctl restart docker

效果

➜  ~ docker info | grep "Docker Root Dir"
Docker Root Dir: /data00/docker

检查镜像

➜  ~ docker images
REPOSITORY TAG IMAGE ID CREATED SIZE
gcr.io/k8s-minikube/kicbase v0.0.43 619d67e74933 2 months ago 1.26GB
moby/buildkit buildx-stable-1 be698b50dea4 7 months ago 172MB
➜ ~

查看磁盘空间

➜  ~ df -h
Filesystem Size Used Avail Use% Mounted on
udev 32G 0 32G 0% /dev
tmpfs 6.3G 27M 6.3G 1% /run
/dev/vda1 119G 17G 97G 15% /
tmpfs 32G 3.6M 32G 1% /dev/shm
tmpfs 5.0M 0 5.0M 0% /run/lock
tmpfs 32G 0 32G 0% /sys/fs/cgroup
tmpfs 128M 4.0K 128M 1% /.syskrbonly
/dev/vdb 492G 89G 378G 19% /data00
tmpfs 6.3G 8.0K 6.3G 1% /run/user/0
tmpfs 6.3G 0 6.3G 0% /run/user/2000
tmpfs 6.3G 24K 6.3G 1% /run/user/1000
tmpfs 6.3G 0 6.3G 0% /run/user/1001

系统盘空间充足,数据盘空间充足。

排查公有云 Cloud IDE websocket 链接异常断开问题

· 阅读需 7 分钟
wencaiwulue
Senior DevOps / System Engineer @ bytedance

背景

之前做了一个 Cloud IDE,可以实现多人同时编辑同一个文件。在私有云上运行得好好的,但是上线到公有云后,出现了 WebSocket 异常断开的问题。 img.png

现象

同时打开两个工作空间,一个工作空间打开多个标签页,开启 network 查看网络流量。然后放置不理,过一段时间后,WebSocket 断开。通过 WebSocket 中的 pingpong 观察到多个网页的 WebSocket 基本上都是在同一时刻断开的。

  • WebSocket 断开链接都是同时发生,不是一个个顺序发生的
  • 相同工作空间多个标签页 WebSocket 断开的时刻相同
  • 不同工作空间标签页 WebSocket 断开的时刻不同

初步尝试

起初怀疑是链接超时时间的问题。但是修改了超时时间后,问题依旧存在。

apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
annotations:
nginx.ingress.kubernetes.io/proxy-connect-timeout: "120"
nginx.ingress.kubernetes.io/proxy-read-timeout: "120"
nginx.ingress.kubernetes.io/proxy-send-timeout: "120"
nginx.ingress.kubernetes.io/proxy-request-buffering: "off"

再次尝试

没有办法了,祭出大招 tcpdump

root@iv-yd6qd3t0qo5i3z3c3v05:~# ps -ef | grep ingress
systemd+ 2410639 2410439 0 Jun25 ? 00:00:00 /usr/bin/dumb-init -- /nginx-ingress-controller --publish-service=kube-system/ingress-nginx-controller --election-id=ingress-controller-leader-nginx --controller-class=k8s.io/ingress-nginx --ingress-class=nginx --configmap=kube-system/ingress-nginx-controller --validating-webhook=:8443 --validating-webhook-certificate=/usr/local/certificates/cert --validating-webhook-key=/usr/local/certificates/key --watch-ingress-without-class=true -v=10
systemd+ 2410653 2410639 0 Jun25 ? 00:08:37 /nginx-ingress-controller --publish-service=kube-system/ingress-nginx-controller --election-id=ingress-controller-leader-nginx --controller-class=k8s.io/ingress-nginx --ingress-class=nginx --configmap=kube-system/ingress-nginx-controller --validating-webhook=:8443 --validating-webhook-certificate=/usr/local/certificates/cert --validating-webhook-key=/usr/local/certificates/key --watch-ingress-without-class=true -v=10
root 2602901 2602885 0 10:24 pts/0 00:00:00 grep ingress
root@iv-yd6qd3t0qo5i3z3c3v05:~# nsenter -n -t 2410653
root@iv-yd6qd3t0qo5i3z3c3v05:~# tcpdump -i eth0 -vvv -w tcpdump.pcap

在流量经过的每个节点抓包。从而绘制出网络拓扑图。 cloudide-websocket-arch.svg 常用的 wireshark 过滤语句

  • ip.addr == 192.0.2.1 and tcp.port not in 25
  • ip.dst == 192.168.0.65 and tcp.flags.fin

使用 tcp.flags.fin 过滤出 FIN 包的链接

观察

从图中可以看出,ingress-nginx-controller 向 ide-server 和控制面的 proxy 都发送了 FIN 包,从而导致链接断开。因此定位是 ingress-nginx-controller 的问题。

表层原因

虽然找到了是 ingress-nginx-controller 的问题,但是不知道为什么会断开,于是先去检查 ingress 的配置。

➜  ~ kubectl get ingress -n shared-webapp-01
NAME CLASS HOSTS ADDRESS PORTS AGE
ide-wco5mu3vihefepmq77ap0 nginx ide.volcengine.com 192.168.1.7 80 62s
ide-wcptaca7ihefdlmf7ci4g nginx ide.volcengine.com 192.168.1.7 80 77s
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
annotations:
nginx.ingress.kubernetes.io/proxy-body-size: 1G
nginx.ingress.kubernetes.io/proxy-connect-timeout: "120"
nginx.ingress.kubernetes.io/proxy-http-version: "1.1"
nginx.ingress.kubernetes.io/proxy-read-timeout: "120"
nginx.ingress.kubernetes.io/proxy-request-buffering: "off"
nginx.ingress.kubernetes.io/proxy-send-timeout: "120"
nginx.ingress.kubernetes.io/rewrite-target: /$2
name: ide-wco5mu3vihefepmq77ap0
namespace: shared-webapp-01
spec:
ingressClassName: nginx
rules:
- host: ide.volcengine.com
http:
paths:
- backend:
service:
name: ide-wco5mu3vihefepmq77ap0
port:
name: http
path: /webapp-01/ide-wco5mu3vihefepmq77ap0(/|$)(.*)
pathType: Prefix
status:
loadBalancer:
ingress:
- ip: 192.168.1.7

一个工作空间,会创建一个 ingress。起初并没有觉着有什么问题。然后又对比了数据面和控制面的 /etc/nginx/nginx.conf ,并没有发现很明显的差异。

架构入手

后来看了一下设计文档,发现有个组件 agent 用来控制和管理 ide 生命周期的。想着会不会是因为这个组件出 bug 了,导致 websocket 被异常关闭。因此将数据面的 agent 的副本数设置为 0,然后 websocket 就稳定了。翻看了一下 agent 的代码,发现有创建/删除 ingress的操作。于是去搜了一下 ingress-nginx-controller 的 issue,发现有个 #2461,当 nginx reload 的时候,会把已经存在的 websocket 链接关闭。找到个选项 worker-shutdown-timeout,可以设置 nginx reload 的超时时间。试了之后,发现的确有效果。至此,问题的真正原因找到了。

真正原因

当 ingress 发生变化后,ingress-nginx-controller 会收集 ingress 的配置,将其转换为 nginx 的配置,然后执行 nginx -s reload。reload 的时候,先发送 HUP 信号给 master process,然后 master process 会发送消息给旧的 worker processes,通知它们关闭。但如果此时 worker processes 上还有链接正在处理,worker processes 会等待链接处理完成后再关闭。如果处理时间过长,超时后 worker processes 会强制关闭这些链接并退出。而这个超时时间就是 worker-shutdown-timeout,默认值是 240s,也就是 4 分钟。

https://nginx.org/en/docs/ngx_core_module.html#worker_shutdown_timeout

Configures a timeout for a graceful shutdown of worker processes. When the time expires, nginx will try to close all the connections currently open to facilitate shutdown.

https://nginx.org/en/docs/control.html

In order for nginx to re-read the configuration file, a HUP signal should be sent to the master process. The master process first checks the syntax validity, then tries to apply new configuration, that is, to open log files and new listen sockets. If this fails, it rolls back changes and continues to work with old configuration. If this succeeds, it starts new worker processes, and sends messages to old worker processes requesting them to shut down gracefully. Old worker processes close listen sockets and continue to service old clients. After all clients are serviced, old worker processes are shut down.

临时解决方法

增加 worker-shutdown-timeout 的值,比如 10 小时。测试后发现,可以解决问题。但是,这个值不能太大,否则旧的 worker processes 无法及时关闭,资源无法回收,最终会导致文件描述符耗尽,无法创建出新的 worker processes。

https://kubernetes.github.io/ingress-nginx/user-guide/nginx-configuration/configmap/#worker-shutdown-timeout

解决方向

  • 改架构。使用动态代理,而不是静态代理。这样就不需要 ingress 了。
events {
worker_connections 1024;
}

http {
# 定义服务列表
map $uri $service_name {
~/service/(?<service>\w+)/* $service;
default "";
}

server {
listen 80;

# 根据 URL 中的 path 转发到对应的服务
location / {
proxy_pass http://$service_name;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}
}
}
  • 不改架构。使用 higress ,阿里 MSE 的开源项目,基于 envoy 实现的动态代理,没有 reload。

使用 kubevpn dev 模式在本地开发云原生工程

· 阅读需 6 分钟
wencaiwulue
Senior DevOps / System Engineer @ bytedance

使用 KubeVPN dev 模式在本地开发项目

前言

最近在开发 WebIDE 功能,开发其中的聊天功能,原理不是很复杂,但是需要存储用户的聊天信息。因此需要使用到 MySQL 数据库,但是我们的集群现在都关闭了公网访问,开发环境的集群需要通过其他集群的网络做跳板。并且这个 MySQL 是个外置的数据库,也就是没有运行在集群中,而是在同一个 VPC 下。

诉求

能够在本地启动项目,快速开发。

简单的解决之道

在本地直接使用 docker 启动一个 MySQL,修改一下配置文件,就可以在本地启动项目了。但需要手动改配置,也不是很方便。

使用 kubevpn 在本地开发

kubevpn dev deployment/ws-co98bl9mhb1pisov4tc0 -n feiyan-1000000001 --rm --dev-image naison/kubevpn:v2.2.4 --entrypoint bash -v ~/GolandProjects/ide-server:/app -p 2345:2345 --extra-domain mysql1a1bb5fc80a6.rds.ivolces.com -it --connect-mode container --no-proxy

参数解释:

  • -n 指定工作负载所在的命名空间(namespace)。
  • --rm 自动删除本地 container。
  • --dev-image 指定在本地启动 container 的镜像,如果不指定的话,默认使用工作负载的镜像。
  • --entrypoint 指定启动命令,如果不指定,默认使用工作负载的启动命令。
  • -v 挂载磁盘。将本地目录挂载在 docker 容器内。
  • -p 指定暴露端口,将 docker 容器端口暴露在主机上。
  • --extra-domain 额外的域名。将本域名解析的 ip 加入到本地路由表。
  • -it 交互模式,并启动 tty。
  • --connect-mode,指定链接模式,可以直接在容器中链接集群网络。
  • --no-proxy 指定是否将工作负载的流量拦截到本地。

熟悉 kubectl 和 docker 的同学可以很明显的看出,这些参数都是 kubectl 以及 docker 的选项,没错,就是这样的。

启动完成后,就会自动进入到 terminal 中,这个 terminal 的网络和 k8s 集群的网络是打通的,并且环境变量、磁盘挂载也都和集群中工作负载的配置一模一样。这样就可以在这里启动项目啦~

➜  ~ kubevpn dev deployment/ws-co98bl9mhb1pisov4tc0 -n feiyan-1000000001 --rm --dev-image naison/kubevpn:v2.2.4 --entrypoint bash -v ~/GolandProjects/ide-server:/app -p 2345:2345 --extra-domain mysql1a1bb5fc80a6.rds.ivolces.com -it --connect-mode container --no-proxy
starting container connect to cluster
Created container: kubevpn_local_96e04
Wait container kubevpn_local_96e04 to be running...
Container kubevpn_local_96e04 is running now
start to connect
got cidr from cache
get cidr successfully
update ref count successfully
traffic manager already exist, reuse it
port forward ready
tunnel connected
adding route...
dns service ok
container connect to cluster successfully
tar: Removing leading `/' from member names
tar: Removing leading `/' from hard link targets
/var/folders/30/cmv9c_5j3mq_kthx63sb1t5c0000gn/T/522107804466811630:/code
tar: Removing leading `/' from member names
tar: Removing leading `/' from hard link targets
/var/folders/30/cmv9c_5j3mq_kthx63sb1t5c0000gn/T/6989754788069797291:/etc/feiyan
tar: Removing leading `/' from member names
tar: Removing leading `/' from hard link targets
/var/folders/30/cmv9c_5j3mq_kthx63sb1t5c0000gn/T/7236198221188709503:/var/run/secrets/kubernetes.io/serviceaccount
network mode is container:4edc150aceaa685763e7d380d667eb5a02eb1e69df00768f8b8825c23e93acdd
root@4edc150aceaa:/app#
root@4edc150aceaa:/app# ping -c 4 mysql1a1bb5fc80a6.rds.ivolces.com
PING mysql1a1bb5fc80a6.rds.ivolces.com (10.0.0.32) 56(84) bytes of data.
64 bytes from mysql1a1bb5fc80a6.rds.ivolces.com (10.0.0.32): icmp_seq=1 ttl=62 time=120 ms
64 bytes from mysql1a1bb5fc80a6.rds.ivolces.com (10.0.0.32): icmp_seq=2 ttl=62 time=127 ms
64 bytes from mysql1a1bb5fc80a6.rds.ivolces.com (10.0.0.32): icmp_seq=3 ttl=62 time=50.4 ms
64 bytes from mysql1a1bb5fc80a6.rds.ivolces.com (10.0.0.32): icmp_seq=4 ttl=62 time=54.9 ms

--- mysql1a1bb5fc80a6.rds.ivolces.com ping statistics ---
4 packets transmitted, 4 received, 0% packet loss, time 3017ms
rtt min/avg/max/mdev = 50.389/88.098/126.673/35.539 ms

这样就可以在此容器中启动自己的项目啦~

root@4edc150aceaa:/app/cmd# alias start='./cmd server --config /etc/feiyan/config.yaml --webide-workspace-id co98bl9mhb1pisov4tc0'
root@4edc150aceaa:/app/cmd# start
Using config file: /etc/feiyan/config.yaml
2024/04/08 19:09:41.483649 cmd.go:59: [Info] FLAG: --config="/etc/feiyan/config.yaml"
2024/04/08 19:09:41.488000 cmd.go:59: [Info] FLAG: --help="false"
2024/04/08 19:09:41.488006 cmd.go:59: [Info] FLAG: --hertz-max-request-body-size="4194304"
2024/04/08 19:09:41.488009 cmd.go:59: [Info] FLAG: --hertz-port="6789"
2024/04/08 19:09:41.488012 cmd.go:59: [Info] FLAG: --hertz-tls="false"
2024/04/08 19:09:41.488017 cmd.go:59: [Info] FLAG: --kitex-port="8888"
2024/04/08 19:09:41.488019 cmd.go:59: [Info] FLAG: --kitex-tls-enable="false"
2024/04/08 19:09:41.488021 cmd.go:59: [Info] FLAG: --log-caller-key="caller"
2024/04/08 19:09:41.488023 cmd.go:59: [Info] FLAG: --log-compress="true"
2024/04/08 19:09:41.488025 cmd.go:59: [Info] FLAG: --log-level="debug"
2024/04/08 19:09:41.488027 cmd.go:59: [Info] FLAG: --log-level-key="level"
2024/04/08 19:09:41.488029 cmd.go:59: [Info] FLAG: --log-max-age="1"
2024/04/08 19:09:41.488033 cmd.go:59: [Info] FLAG: --log-max-backups="3"
2024/04/08 19:09:41.488035 cmd.go:59: [Info] FLAG: --log-max-size="100"
2024/04/08 19:09:41.488037 cmd.go:59: [Info] FLAG: --log-message-key="msg"
2024/04/08 19:09:41.488039 cmd.go:59: [Info] FLAG: --log-path="app.log"
2024/04/08 19:09:41.488041 cmd.go:59: [Info] FLAG: --log-time-key=""
2024/04/08 19:09:41.488054 cmd.go:59: [Info] FLAG: --mysql-conn-max-idle-time="30s"
2024/04/08 19:09:41.488059 cmd.go:59: [Info] FLAG: --mysql-conn-max-life-time="1h0m0s"
2024/04/08 19:09:41.488061 cmd.go:59: [Info] FLAG: --mysql-create-batch-size="1000"
2024/04/08 19:09:41.488063 cmd.go:59: [Info] FLAG: --mysql-database="MYSQL_DB"
2024/04/08 19:09:41.488065 cmd.go:59: [Info] FLAG: --mysql-host="MYSQL_HOST"
2024/04/08 19:09:41.488067 cmd.go:59: [Info] FLAG: --mysql-max-idle-conns="1"
2024/04/08 19:09:41.488068 cmd.go:59: [Info] FLAG: --mysql-max-open-conns="100"
2024/04/08 19:09:41.488070 cmd.go:59: [Info] FLAG: --mysql-password="MYSQL_PASSWORD"
2024/04/08 19:09:41.488072 cmd.go:59: [Info] FLAG: --mysql-port="MYSQL_PORT"
2024/04/08 19:09:41.488074 cmd.go:59: [Info] FLAG: --mysql-username="MYSQL_USERNAME"
2024/04/08 19:09:41.488076 cmd.go:59: [Info] FLAG: --server-ca-file=""
2024/04/08 19:09:41.488078 cmd.go:59: [Info] FLAG: --server-cert-file=""
2024/04/08 19:09:41.488088 cmd.go:59: [Info] FLAG: --server-key-file=""
2024/04/08 19:09:41.488099 cmd.go:59: [Info] FLAG: --webide-git-ignore="[*~,*.swo,*.swp,*.swpx,*.swx,.ccls-cache]"
2024/04/08 19:09:41.488143 cmd.go:59: [Info] FLAG: --webide-log-level="debug"
2024/04/08 19:09:41.488167 cmd.go:59: [Info] FLAG: --webide-max-file-size="10Mi"
2024/04/08 19:09:41.488170 cmd.go:59: [Info] FLAG: --webide-port="8910"
2024/04/08 19:09:41.488198 cmd.go:59: [Info] FLAG: --webide-revision-period="2s"
2024/04/08 19:09:41.488203 cmd.go:59: [Info] FLAG: --webide-root-dir="/code"
2024/04/08 19:09:41.488205 cmd.go:59: [Info] FLAG: --webide-sync-period="2s"
2024/04/08 19:09:41.488207 cmd.go:59: [Info] FLAG: --webide-workspace-id="co98bl9mhb1pisov4tc0"
init db witch config &{0x400032a100}init mysql feiyanadmin:Mirrors79Bio@tcp(mysql1a1bb5fc80a6.rds.ivolces.com:3306)/feiyan?charset=utf8mb4&parseTime=True&loc=Local

Linux namespace 简介

· 阅读需 15 分钟
wencaiwulue
Senior DevOps / System Engineer @ bytedance

我们知道容器使用 namespace 以及 cgroup 来做资源隔离和限制,那么 namespace 都有哪些类型,cgroup 可以用来做什么呢?本文将和大家一起来探讨。

Linux namespace 简介

Linux Namespace是一种内核级别的隔离系统资源的方法,通过将系统全局资源放在不同的Namespace中,Linux提供了在一个单一系统上运行多个隔离的进程的能力。这意味着每个Namespace中的进程只能看到属于同一命名空间的资源,从而可以独立于其他Namespace的进程运行。这种隔离增强了系统的安全性,并为容器等技术提供了核心支持。 不同Namespace的程序,可以享有一份独立的系统资源。这有利于实现资源管理和限制,也可以避免不同服务或应用间的冲突问题。例如,每个Namespace都有自己的网络空间,这就意味着可以有多个网络接口,在每个Namespace中它们都可以叫eth0。 Linux Namespace主要有以下几种类型:

| Namespace | Flag | Isolates |
| --- | --- | --- |
| Cgroup | CLONE_NEWCGROUP | Cgroup root directory |
| IPC | CLONE_NEWIPC | System V IPC, POSIX message queues |
| Network | CLONE_NEWNET | Network devices, stacks, ports, etc. |
| Mount | CLONE_NEWNS | Mount points |
| PID | CLONE_NEWPID | Process IDs |
| Time | CLONE_NEWTIME | Boot and monotonic clocks |
| User | CLONE_NEWUSER | User and group IDs |
| UTS | CLONE_NEWUTS | Hostname and NIS domain name |

Linux man pages

PID namespace

PID namespaces用来隔离进程的ID空间,使得不同pid namespace里的进程ID可以重复且相互之间不影响。PID namespace可以嵌套,也就是说有父子关系,在当前namespace里面创建的所有新的namespace都是当前namespace的子namespace。父namespace里面可以看到所有子孙后代namespace里的进程信息,而子namespace里看不到祖先或者兄弟namespace里的进程信息。目前,PID namespace最多可以嵌套32层,由内核中的宏MAX_PID_NS_LEVEL来定义,由于ID为1的进程的特殊性,所以每个PID namespace的第一个进程的ID都是1。当这个进程运行停止后,内核将会给这个namespace里的所有其他进程发送SIGKILL信号,致使其他所有进程都停止,于是namespace被销毁掉。

➜  ~ sudo unshare --pid --mount-proc --fork /bin/sh
# ps -ef
UID PID PPID C STIME TTY TIME CMD
root 1 0 2 16:43 pts/1 00:00:00 /bin/sh
root 2 1 0 16:43 pts/1 00:00:00 ps -ef
#

unshare 命令可以创建一个隔离的名字空间,其中最常用的是用于创建 PID 名字空间的 "--pid" 参数。但是,只用 "unshare --pid /bin/bash" 创建一个新的 PID 名字空间并不够,因为 "/proc" 文件系统还是原来的,它仍然反映主名字空间的进程信息。为了使新的 PID 名字空间中的进程信息反映在 "/proc" 中,我们需要在新的 PID 名字空间中装载新的 "/proc" 文件系统。具体操作为先卸载旧的 "/proc" 文件系统,再装载新的 "/proc" 文件系统。这就是 "unshare" 命令中需要 "--mount-proc" 参数的原因。之所以使用 "--mount-proc",是为了让运行在新 PID 名字空间中的命令看到一份与其 PID 名字空间相匹配的进程列表。

在宿主机上可以看到 pid namespace 中的所有进程。 但是pid namespace看不到宿主机上的进程。

Net namespace

Linux的Network Namespace(网络命名空间)是内核的一种特性,它能够提供一种资源隔离的方式,使得在同一台主机上的不同进程可以有自己独立的网络环境。每个Network Namespace中的网络设备、IP地址、路由表、防火墙规则等都是互相隔离的。 具体来说,每个Network Namespace都有自己的:

  • 网络设备:例如eth0、eth1、lo(回环)等。
  • IPv4或IPv6协议栈:每个namespace都可以有自己的网络协议。
  • 网络端口:例如同一台主机上的不同Network Namespace可以分别监听相同的端口,互不干扰。
  • 路由和ARP表:不同Network Namespace的网络设备有各自独立的路由规则。 这就意味着在同一个Network Namespace中的进程相互之间可以通过网络进行通信,而与其他Namespace中的进程网络隔离。这有利于实现资源管理和限制,同时也可以提供更高的系统安全性。 我们平时使用的容器技术(如Docker)就是依赖于Network Namespace技术来实现网络的隔离和独立。例如,在Docker中创建一个新的容器时,Docker会创建一个新的Network Namespace,这样每个容器就可以有自己独立的网络环境。
➜  ~ sudo unshare --net --fork /bin/sh
# ping www.bing.com
connect: Network is unreachable
# ip addr
1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
#

CNI

使用 veth pair 连接两个网络命名空间并进行通信

  1. 创建两个网络命名空间:
ip netns add net0
ip netns add net1
ip netns ls

进入 net ns 中

nsenter --net=/var/run/netns/net0
ip addr
nsenter --net=/var/run/netns/net1
ip addr
  2. 创建 veth pair 并将其两端分别放入两个网络命名空间:
ip link add veth0 type veth peer name veth1
ip link set veth0 netns net0
ip link set veth1 netns net1
  3. 启动 veth pair 的两端,并为其分配 IP 地址:
ip netns exec net0 ip link set veth0 up
ip netns exec net0 ip addr add 10.0.1.1/24 dev veth0

ip netns exec net1 ip link set veth1 up
ip netns exec net1 ip addr add 10.0.1.2/24 dev veth1

现在,两个网络命名空间 net0 和 net1 中的设备 veth0 和 veth1 分别拥有 IP 地址10.0.1.1和10.0.1.2,并且能够互相通信。

  4. 测试
ip netns exec net0 ping -c2 10.0.1.2
ip netns exec net1 ping -c2 10.0.1.1

上述命令会在 net0 命名空间和 net1 命名空间中分别向对端发送 ICMP 回显请求,测试 net0 与 net1 之间的连接。

Mount namespace

Linux 的 Mount Namespace 可以为每个Namespace提供独立的文件系统挂载点视图。在一个Mount Namespace中所做的挂载和卸载操作不会影响到其他的Namespace。

➜  code ls /mnt
➜ code ls
asdfg a.sh a.shh main main.go main.go_1 main.go_back
➜ code cd ..
➜ ~ ls /mnt
➜ ~ sudo unshare --mount /bin/bash
root@n37-006-014:/data00/home/fengcaiwen# sudo mount -t tmpfs tmpfs /mnt
root@n37-006-014:/data00/home/fengcaiwen# cd /mnt/
root@n37-006-014:/mnt# ls
root@n37-006-014:/mnt# touch main.go
root@n37-006-014:/mnt# ls
main.go
root@n37-006-014:/mnt# exit
exit
➜ ~ ls /mnt
➜ ~

UTS namespace

Linux的UTS Namespace,主要用于隔离两个系统的标识符:hostname(主机名)和NIS domain name。当你创建一个新的UTS Namespace后,便可以在这个 Namespace 里面修改hostname和NIS domain name,而不会影响到其他Namespace。

➜  ~ hostname
n37-006-014
➜ ~ sudo unshare --uts --fork /bin/sh -c "hostname abc; /bin/sh"
# hostname
abc
#

User namespace

User Namespace是Linux命名空间中的一种,主要用于隔离与安全相关的标识符和属性。这主要包括用户ID和组ID,以及进程的能力。在同一台机器上,同一个进程在不同的User Namespace中可以具有不同的用户ID,组ID和能力。User Namespace的引入,使得我们可以在非特权用户的环境中运行具有root权限的程序。

root@ws-cm1agta5n77o3tcausf0-54f557b8f5-hkkjc:/app# unshare --user --map-user 12005 --map-group 12292 --fork /bin/sh
$ id
uid=12005(runner) gid=12292(runner) groups=12292(runner)
$ exit
root@ws-cm1agta5n77o3tcausf0-54f557b8f5-hkkjc:/app# unshare --user --fork /bin/sh
$ id
uid=65534(nobody) gid=65534(nogroup) groups=65534(nogroup)
$ exit
root@ws-cm1agta5n77o3tcausf0-54f557b8f5-hkkjc:/app# unshare --user --map-root-user --fork /bin/sh
# id
uid=0(root) gid=0(root) groups=0(root)
#

IPC namespace

IPC Namespace是Linux命名空间的一种,主要用于隔离进程间的通信(IPC,InterProcess Communication)资源,比如信号量(Semaphores)、消息队列(Message Queues)和共享内存段(Shared Memory Segments)。在同一台机器上,同一个进程在不同的IPC Namespace中可以具有不同的IPC资源。 TIPS: UNIX Socket被用于系统内进程间的通信(IPC),但不是被隔离在IPC Namespace中,而是在Network Namespace中进行隔离的。这意味着同一个Network Namespace内的进程可以通过UNIX Socket进行通信,但是不同Network Namespace中的进程则无法进行UNIX Socket通信。

Cgroups namespace

Cgroups(即控制组)是 Linux 内核的一个特性,用来限制、控制与隔离进程对系统资源(如 CPU、内存、磁盘 I/O、网络等)的使用。Cgroups 管理的资源可以按照层级进行划分,使得每个进程组可以有自己的资源限制。这是实现容器资源隔离的关键技术之一。 在 Kubernetes 中,Cgroups 被广泛用于实现工作负载的资源限制和隔离。例如,当你在 Pod 规范中设置 CPU 和内存限制时,Kubelet 会通过设置 Cgroups 参数来实现这些限制。 直到 Linux 内核 4.6 版本,才引入了 Cgroup 命名空间(Cgroup Namespace)。Cgroup 命名空间用于虚拟化 Cgroup 树,使得在容器内部看到的 Cgroup 层级与宿主机上的实际 Cgroup 层级不一样。通过这种方式,可以在不改变容器进程在 Cgroup 树中位置的情况下,控制容器进程对 Cgroup 树的可见性。这可以增强容器的安全性和隔离性。

➜  ~ sudo unshare --pid --mount-proc --cgroup --fork /bin/bash
root@n37-006-014:/data00/home/fengcaiwen# cd /sys/fs/cgroup/cpu
root@n37-006-014:/sys/fs/cgroup/cpu# ls
agents.slice clamav_mem cpuacct.usage_percpu_sys cpu.cfs_period_us etrace_cpu system.slice
cgroup.clone_children cpuacct.stat cpuacct.usage_percpu_user cpu.cfs_quota_us etrace_mem tao_tasks
cgroup.procs cpuacct.usage cpuacct.usage_sys cpu.idle init.scope tasks
cgroup.sane_behavior cpuacct.usage_all cpuacct.usage_user cpu.shares notify_on_release tiger
clamav_cpu cpuacct.usage_percpu cpu.cfs_burst_us cpu.stat release_agent user.slice
root@n37-006-014:/sys/fs/cgroup/cpu# cat cgroup.procs
root@n37-006-014:/sys/fs/cgroup/cpu# while : ; do : ; done &
[1] 12
root@n37-006-014:/sys/fs/cgroup/cpu# top
top - 14:26:44 up 17:36, 2 users, load average: 0.69, 0.32, 0.27
Tasks: 4 total, 2 running, 2 sleeping, 0 stopped, 0 zombie
%Cpu(s): 15.1 us, 1.4 sy, 0.0 ni, 83.4 id, 0.0 wa, 0.0 hi, 0.1 si, 0.0 st
MiB Mem : 15773.5 total, 12523.4 free, 1087.1 used, 2163.0 buff/cache
MiB Swap: 0.0 total, 0.0 free, 0.0 used. 14331.1 avail Mem

PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
12 root 20 0 6900 1828 1316 R 100.0 0.0 0:14.61 bash
1 root 20 0 2380 700 632 S 0.0 0.0 0:00.00 sh
6 root 20 0 6900 3592 3084 S 0.0 0.0 0:00.00 bash
13 root 20 0 10936 3180 2876 R 0.0 0.0 0:00.00 top



root@n37-006-014:/sys/fs/cgroup/cpu# ps -ef
UID PID PPID C STIME TTY TIME CMD
root 1 0 0 14:24 pts/1 00:00:00 /bin/sh
root 6 1 0 14:25 pts/1 00:00:00 bash
root 12 6 99 14:26 pts/1 00:00:18 bash
root 14 6 0 14:26 pts/1 00:00:00 ps -ef
root@n37-006-014:/sys/fs/cgroup/cpu# cat /sys/fs/cgroup/cpu/cpu.cfs_quota_us
-1
root@n37-006-014:/sys/fs/cgroup/cpu# ls
agents.slice clamav_mem cpuacct.usage_percpu_sys cpu.cfs_period_us etrace_cpu system.slice
cgroup.clone_children cpuacct.stat cpuacct.usage_percpu_user cpu.cfs_quota_us etrace_mem tao_tasks
cgroup.procs cpuacct.usage cpuacct.usage_sys cpu.idle init.scope tasks
cgroup.sane_behavior cpuacct.usage_all cpuacct.usage_user cpu.shares notify_on_release tiger
clamav_cpu cpuacct.usage_percpu cpu.cfs_burst_us cpu.stat release_agent user.slice
root@n37-006-014:/sys/fs/cgroup/cpu# mkdir mycgroup
root@n37-006-014:/sys/fs/cgroup/cpu# cd mycgroup/
root@n37-006-014:/sys/fs/cgroup/cpu/mycgroup# ls
cgroup.clone_children cpuacct.usage cpuacct.usage_percpu_sys cpuacct.usage_user cpu.cfs_quota_us cpu.stat
cgroup.procs cpuacct.usage_all cpuacct.usage_percpu_user cpu.cfs_burst_us cpu.idle notify_on_release
cpuacct.stat cpuacct.usage_percpu cpuacct.usage_sys cpu.cfs_period_us cpu.shares tasks
root@n37-006-014:/sys/fs/cgroup/cpu/mycgroup# pwd
/sys/fs/cgroup/cpu/mycgroup
root@n37-006-014:/sys/fs/cgroup/cpu/mycgroup# ls
cgroup.clone_children cpuacct.usage cpuacct.usage_percpu_sys cpuacct.usage_user cpu.cfs_quota_us cpu.stat
cgroup.procs cpuacct.usage_all cpuacct.usage_percpu_user cpu.cfs_burst_us cpu.idle notify_on_release
cpuacct.stat cpuacct.usage_percpu cpuacct.usage_sys cpu.cfs_period_us cpu.shares tasks
root@n37-006-014:/sys/fs/cgroup/cpu/mycgroup# echo 12 > tasks
root@n37-006-014:/sys/fs/cgroup/cpu/mycgroup# cat cgroup.procs
12
root@n37-006-014:/sys/fs/cgroup/cpu/mycgroup# echo 20000 > cpu.cfs_quota_us
root@n37-006-014:/sys/fs/cgroup/cpu/mycgroup# top
top - 14:36:59 up 17:46, 2 users, load average: 1.01, 1.08, 0.74
Tasks: 4 total, 2 running, 2 sleeping, 0 stopped, 0 zombie
%Cpu(s): 3.4 us, 0.6 sy, 0.0 ni, 96.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
MiB Mem : 15773.5 total, 12530.3 free, 1075.9 used, 2167.4 buff/cache
MiB Swap: 0.0 total, 0.0 free, 0.0 used. 14341.3 avail Mem

PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
12 root 20 0 6900 1828 1316 R 20.0 0.0 10:22.27 bash
1 root 20 0 2380 700 632 S 0.0 0.0 0:00.00 sh
6 root 20 0 6900 3688 3148 S 0.0 0.0 0:00.15 bash
48 root 20 0 10936 3244 2940 R 0.0 0.0 0:00.00 top

Time namespace

在Linux系统中,Time Namespace允许每个命名空间拥有自己的一套时间流。Linux内核5.6及以上版本才支持Time Namespace。

unshare --time /bin/bash

使用timedatectl命令来改变系统时间:

timedatectl set-time "2022-12-31 12:00:00"

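需要注意:Time Namespace 只虚拟化 CLOCK_MONOTONIC 和 CLOCK_BOOTTIME 这两个时钟(见上文表格中的 "Boot and monotonic clocks"),并不隔离系统的墙上时间(CLOCK_REALTIME)。因此 timedatectl set-time 修改的是全局的系统时间,原来的 Time Namespace 中的进程同样会看到变化;每个 Time Namespace 真正独立的是这两个时钟的偏移量,可以通过 unshare 的 --monotonic / --boottime 参数或 /proc/<pid>/timens_offsets 来设置。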

uname -r
5.4.143-2-velinux1-amd64 // 还不支持

总结

Namespace 用来做资源隔离,Cgroup 用来做资源限制。