Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

创建服务失败 #76

Open
T-ze-yu opened this issue Jul 9, 2024 · 11 comments
Open

创建服务失败 #76

T-ze-yu opened this issue Jul 9, 2024 · 11 comments

Comments

@T-ze-yu
Copy link

T-ze-yu commented Jul 9, 2024

使用kuscia api创建serving服务请求如下:
{'serving_id': 'server-uxy3cewxmizgsql7', 'serving_input_config': '{"partyConfigs": {"p169": {"serverConfig": {"featureMapping": {"ahq96f_56": "ahq96f_56", "ahq96f_57": "ahq96f_57", "ahq96f_58": "ahq96f_58", "ahq96f_59": "ahq96f_59", "ahq96f_60": "ahq96f_60", "ahq96f_61": "ahq96f_61", "ahq96f_62": "ahq96f_62", "ahq96f_63": "ahq96f_63", "ahq96f_64": "ahq96f_64", "ahq96f_65": "ahq96f_65", "ahq96f_66": "ahq96f_66", "ahq96f_67": "ahq96f_67", "ahq96f_68": "ahq96f_68", "ahq96f_69": "ahq96f_69", "ahq96f_70": "ahq96f_70", "ahq96f_71": "ahq96f_71", "ahq96f_72": "ahq96f_72", "ahq96f_73": "ahq96f_73", "ahq96f_74": "ahq96f_74", "ahq96f_75": "ahq96f_75", "ahq96f_76": "ahq96f_76", "ahq96f_77": "ahq96f_77", "ahq96f_78": "ahq96f_78", "ahq96f_79": "ahq96f_79", "ahq96f_80": "ahq96f_80", "ahq96f_81": "ahq96f_81", "ahq96f_82": "ahq96f_82", "ahq96f_83": "ahq96f_83", "ahq96f_84": "ahq96f_84", "ahq96f_85": "ahq96f_85", "ahq96f_86": "ahq96f_86", "ahq96f_87": "ahq96f_87", "ahq96f_88": "ahq96f_88", "ahq96f_89": "ahq96f_89", "ahq96f_90": "ahq96f_90", "ahq96f_91": "ahq96f_91", "ahq96f_92": "ahq96f_92", "ahq96f_93": "ahq96f_93", "ahq96f_94": "ahq96f_94", "ahq96f_95": "ahq96f_95", "ahq96f_96": "ahq96f_96", "ahq96f_97": "ahq96f_97", "ahq96f_98": "ahq96f_98", "ahq96f_99": "ahq96f_99", "ahq96f_100": "ahq96f_100", "ahq96f_101": "ahq96f_101", "ahq96f_102": "ahq96f_102", "ahq96f_103": "ahq96f_103", "ahq96f_104": "ahq96f_104", "ahq96f_105": "ahq96f_105", "ahq96f_106": "ahq96f_106", "ahq96f_107": "ahq96f_107", "ahq96f_108": "ahq96f_108", "ahq96f_109": "ahq96f_109", "ahq96f_110": "ahq96f_110"}}, "modelConfig": {"modelId": "ed0862be3dbf11ef8af500505695ad3a", "basePath": "/tmp/p169", "sourcePath": "/home/kuscia/var/storage/data/jobs/202407091409079310520/f5290062-model-package.tar.gz", "sourceType": "ST_FILE"}, "featureSourceConfig": {"csv_opts": {"file_path": "/home/kuscia/var/storage/data/predict_dataset/\u9884\u6d4b\u670d\u52a1\u6837\u672c\u6a21\u677f_20240709173348.csv", "id_name": "id12"}}, 
"channel_desc": {"protocol": "http"}}, "p170": {"serverConfig": {"featureMapping": {"wzi527_1": "wzi527_1", "wzi527_2": "wzi527_2", "wzi527_3": "wzi527_3", "wzi527_4": "wzi527_4", "wzi527_5": "wzi527_5", "wzi527_6": "wzi527_6", "wzi527_7": "wzi527_7", "wzi527_8": "wzi527_8", "wzi527_9": "wzi527_9", "wzi527_10": "wzi527_10", "wzi527_11": "wzi527_11", "wzi527_12": "wzi527_12", "wzi527_13": "wzi527_13", "wzi527_14": "wzi527_14", "wzi527_15": "wzi527_15", "wzi527_16": "wzi527_16", "wzi527_17": "wzi527_17", "wzi527_18": "wzi527_18", "wzi527_19": "wzi527_19", "wzi527_20": "wzi527_20", "wzi527_21": "wzi527_21", "wzi527_22": "wzi527_22", "wzi527_23": "wzi527_23", "wzi527_24": "wzi527_24", "wzi527_25": "wzi527_25", "wzi527_26": "wzi527_26", "wzi527_27": "wzi527_27", "wzi527_28": "wzi527_28", "wzi527_29": "wzi527_29", "wzi527_30": "wzi527_30", "wzi527_31": "wzi527_31", "wzi527_32": "wzi527_32", "wzi527_33": "wzi527_33", "wzi527_34": "wzi527_34", "wzi527_35": "wzi527_35"}}, "modelConfig": {"modelId": "ed0862be3dbf11ef8af500505695ad3a", "basePath": "/tmp/p170", "sourcePath": "/home/kuscia/var/storage/data/jobs/202407091409079310520/f5290062-model-package.tar.gz", "sourceType": "ST_FILE"}, "featureSourceConfig": {"csv_opts": {"file_path": "/home/kuscia/var/storage/data/predict_dataset/\u9884\u6d4b\u670d\u52a1(\u5ba1\u6279)\u6837\u672c\u6a21\u677f_20240709173430.csv", "id_name": "id63"}}, "channel_desc": {"protocol": "http"}}}}', 'initiator': 'p169', 'parties': [{'domain_id': 'p169', 'app_image': 'secretflow-serving-image', 'role': '', 'replicas': 1, 'update_strategy': {'type': 'RollingUpdate', 'max_surge': '25%', 'max_unavailable': '25%'}, 'resources': [{'container_name': '', 'min_cpu': '0.1', 'max_cpu': '0.1', 'min_memory': '100Mi', 'max_memory': '100Mi'}]}, {'domain_id': 'p170', 'app_image': 'secretflow-serving-image', 'role': '', 'replicas': 1, 'update_strategy': {'type': 'RollingUpdate', 'max_surge': '25%', 'max_unavailable': '25%'}, 'resources': [{'container_name': '', 
'min_cpu': '0.1', 'max_cpu': '0.1', 'min_memory': '100Mi', 'max_memory': '100Mi'}]}]}
返回如下:
{'status': {'code': 0, 'message': 'success', 'details': []}}
但是在kuscia容器的日志目录中,虽然创建了相关的日志文件,但文件内容为空,看不到serving创建的任何信息,使用该服务进行预测也失败

@T-ze-yu
Copy link
Author

T-ze-yu commented Jul 9, 2024

我还使用了serving/status/batchQuery接口去查询状态,结果如下:
{
"comments": "success",
"data": {
"data": {
"servings": [
{
"serving_id": "server-uxy3cewxmizgsql7",
"status": {
"available_parties": 0,
"create_time": "2024-07-09T09:50:02Z",
"message": "",
"party_statuses": [
{
"available_replicas": 0,
"create_time": "2024-07-09T09:50:03Z",
"domain_id": "p170",
"endpoints": [],
"replicas": 1,
"role": "",
"state": "Progressing",
"unavailable_replicas": 1,
"updatedReplicas": 1
},
{
"available_replicas": 0,
"create_time": "2024-07-09T09:50:02Z",
"domain_id": "p169",
"endpoints": [
{
"endpoint": "server-uxy3cewxmizgsql7-brpc-builtin.p169.svc:53511",
"port_name": "brpc-builtin",
"scope": "Domain"
},
{
"endpoint": "server-uxy3cewxmizgsql7-service.p169.svc:53508",
"port_name": "service",
"scope": "Domain"
},
{
"endpoint": "server-uxy3cewxmizgsql7-communication.p169.svc",
"port_name": "communication",
"scope": "Cluster"
},
{
"endpoint": "server-uxy3cewxmizgsql7-internal.p169.svc:53510",
"port_name": "internal",
"scope": "Domain"
}
],
"replicas": 1,
"role": "",
"state": "Progressing",
"unavailable_replicas": 1,
"updatedReplicas": 1
}
],
"reason": "",
"state": "Progressing",
"total_parties": 2
}
}
]
},
"status": {
"code": 0,
"details": [],
"message": "success"
}
},
"retcode": 0,
"retmsg": "success",
"statusCode": 0
}
我找不到任何报错信息,因此无法定位服务失败的原因

@zimu-yuxi
Copy link

看响应结果,创建和查询是正常的哈?现在是有什么问题吗?

@T-ze-yu
Copy link
Author

T-ze-yu commented Jul 9, 2024

日志文件是空的,只有一个文件夹,预测请求也返回404

@T-ze-yu
Copy link
Author

T-ze-yu commented Jul 9, 2024

image

@zimu-yuxi
Copy link

在kuscia容器内先执行 crictl ps -a 拿到容器id,再执行 crictl exec -it <容器id> bash 进入该容器,看下里面有没有日志

@T-ze-yu
Copy link
Author

T-ze-yu commented Jul 9, 2024

image
都是secretflow的

@T-ze-yu
Copy link
Author

T-ze-yu commented Jul 10, 2024

kubectl get deployments -A #查看确实部署了服务
NAMESPACE NAME READY UP-TO-DATE AVAILABLE AGE
p169 server-t1twa3ow8n3lm6tk 0/1 1 0 6m30s
kubectl describe pod server-t1twa3ow8n3lm6tk -n p169 #查看Pod的详细信息,包括容器状态、事件以及无法启动的原因等。
ee955958298cf2dde05369fa22a47e5b
可以看到是镜像拉取失败,将配置文件sf-serving.yaml中的镜像名和标签分别修改为"secretflow-registry.cn-hangzhou.cr.aliyuncs.com/secretflow/serving-anolis8"和"0.3.1b0"后就成功了

@T-ze-yu
Copy link
Author

T-ze-yu commented Jul 10, 2024

疑问:我尝试填写本地已有的serving镜像名和标签,但程序仍然会通过docker pull去拉取镜像,这样就不得不依赖网络。可以修改相关设置,让它先检查本地是否已有该镜像吗?

@zimu-yuxi
Copy link

kubectl get deployment 名称 -o yaml 看一下

@T-ze-yu
Copy link
Author

T-ze-yu commented Oct 15, 2024

kubectl get deployment 名称 -o yaml 看一下

image

@T-ze-yu
Copy link
Author

T-ze-yu commented Oct 15, 2024

kubectl get deployment 名称 -o yaml

[root@mpc-middle-autonomy-p42 kuscia]# kubectl get deployment -n p42 -o yaml
apiVersion: v1
items:

  - apiVersion: apps/v1
    kind: Deployment
    metadata:
    annotations:
    deployment.kubernetes.io/revision: "1"
    kuscia.secretflow/initiator: p42
    creationTimestamp: "2024-10-15T03:51:10Z"
    generation: 1
    labels:
    kuscia.secretflow/app-type: serving
    kuscia.secretflow/communication-role-client: "true"
    kuscia.secretflow/communication-role-server: "true"
    kuscia.secretflow/controller: KusciaDeployment
    kuscia.secretflow/deployment-name: server-are3sv4giytnufe0
    kuscia.secretflow/kd-name: server-are3sv4giytnufe0
    kuscia.secretflow/kd-uid: 206bbcbf-f895-4669-bc87-abcd55be6a5d
    kuscia.secretflow/owner_namespace: cross-domain
    name: server-are3sv4giytnufe0
    namespace: p42
    resourceVersion: "3010"
    uid: e437d93e-cba1-4ed1-b52e-244070fec078
    spec:
    progressDeadlineSeconds: 600
    replicas: 1
    revisionHistoryLimit: 10
    selector:
    matchLabels:
    kuscia.secretflow/app-type: serving
    kuscia.secretflow/communication-role-client: "true"
    kuscia.secretflow/communication-role-server: "true"
    kuscia.secretflow/controller: KusciaDeployment
    kuscia.secretflow/deployment-name: server-are3sv4giytnufe0
    kuscia.secretflow/kd-name: server-are3sv4giytnufe0
    kuscia.secretflow/kd-uid: 206bbcbf-f895-4669-bc87-abcd55be6a5d
    kuscia.secretflow/owner_namespace: cross-domain
    strategy:
    rollingUpdate:
    maxSurge: 25%
    maxUnavailable: 25%
    type: RollingUpdate
    template:
    metadata:
    annotations:
    kuscia.secretflow/config-template-volumes: config-template
    creationTimestamp: null
    labels:
    kuscia.secretflow/app-type: serving
    kuscia.secretflow/communication-role-client: "true"
    kuscia.secretflow/communication-role-server: "true"
    kuscia.secretflow/controller: KusciaDeployment
    kuscia.secretflow/deployment-name: server-are3sv4giytnufe0
    kuscia.secretflow/kd-name: server-are3sv4giytnufe0
    kuscia.secretflow/kd-uid: 206bbcbf-f895-4669-bc87-abcd55be6a5d
    kuscia.secretflow/owner_namespace: cross-domain
    spec:
    automountServiceAccountToken: false
    containers:
    - command:
    - sh
    - -c
    - ./secretflow_serving --flagfile=conf/gflags.conf --config_mode=kuscia
    --serving_config_file=/etc/kuscia/serving-config.conf
    env:
    - name: KUSCIA_DOMAIN_ID
    value: p42
    - name: CLUSTER_DEFINE
    value: '{"parties":[{"name":"p42", "role":"", "services":[{"portName":"service",
    "endpoints":["server-are3sv4giytnufe0-service.p42.svc:26030"]}, {"portName":"communication",
    "endpoints":["server-are3sv4giytnufe0-communication.p42.svc"]}, {"portName":"internal",
    "endpoints":["server-are3sv4giytnufe0-internal.p42.svc:26028"]}, {"portName":"brpc-builtin",
    "endpoints":["server-are3sv4giytnufe0-brpc-builtin.p42.svc:26029"]}]},
    {"name":"p41", "role":"", "services":[{"portName":"communication", "endpoints":["server-are3sv4giytnufe0-communication.p41.svc"]}]}],
    "selfPartyIdx":0, "selfEndpointIdx":0}'
    - name: ALLOCATED_PORTS
    value: '{"ports":[{"name":"communication", "port":26027, "scope":"Cluster",
    "protocol":"HTTP"}, {"name":"internal", "port":26028, "scope":"Domain",
    "protocol":"HTTP"}, {"name":"brpc-builtin", "port":26029, "scope":"Domain",
    "protocol":"HTTP"}, {"name":"service", "port":26030, "scope":"Domain",
    "protocol":"HTTP"}]}'
    - name: INPUT_CONFIG
    value: '{"partyConfigs": {"p42": {"serverConfig": {"featureMapping": {"dz6v67_11":
    "dz6v67_11", "dz6v67_12": "dz6v67_12", "dz6v67_13": "dz6v67_13", "dz6v67_14":
    "dz6v67_14", "dz6v67_15": "dz6v67_15", "dz6v67_16": "dz6v67_16", "dz6v67_17":
    "dz6v67_17", "dz6v67_18": "dz6v67_18", "dz6v67_19": "dz6v67_19", "dz6v67_20":
    "dz6v67_20"}}, "modelConfig": {"modelId": "a3d17b368aa811efaa5d0050568306e6",
    "basePath": "/tmp/p42", "sourcePath": "/home/kuscia/var/storage/data/jobs/202410151144040603430/20241015114954651334-7825ad3e-model-package.tar.gz",
    "sourceType": "ST_FILE"}, "featureSourceConfig": {"csv_opts": {"file_path":
    "/home/kuscia/var/storage/data/predict_dataset/\u9884\u6d4b\u670d\u52a1\u6837\u672c\u6a21\u677f_20241015115217.csv",
    "id_name": "id2"}}, "channel_desc": {"protocol": "http"}}, "p41": {"serverConfig":
    {"featureMapping": {"dz6v67_1": "dz6v67_1", "dz6v67_2": "dz6v67_2",
    "dz6v67_3": "dz6v67_3", "dz6v67_4": "dz6v67_4", "dz6v67_5": "dz6v67_5",
    "dz6v67_6": "dz6v67_6", "dz6v67_7": "dz6v67_7", "dz6v67_8": "dz6v67_8",
    "dz6v67_9": "dz6v67_9", "dz6v67_10": "dz6v67_10"}}, "modelConfig": {"modelId":
    "a3d17b368aa811efaa5d0050568306e6", "basePath": "/tmp/p41", "sourcePath":
    "/home/kuscia/var/storage/data/jobs/202410151144040603430/20241015114954651334-7825ad3e-model-package.tar.gz",
    "sourceType": "ST_FILE"}, "featureSourceConfig": {"csv_opts": {"file_path":
    "/home/kuscia/var/storage/data/predict_dataset/\u9884\u6d4b\u670d\u52a1(\u5ba1\u6279)\u6837\u672c\u6a21\u677f_20241015115234.csv",
    "id_name": "id45"}}, "channel_desc": {"protocol": "http"}}}}'
    - name: KUSCIA_PORT_COMMUNICATION_NUMBER
    value: "26027"
    - name: KUSCIA_PORT_INTERNAL_NUMBER
    value: "26028"
    - name: KUSCIA_PORT_BRPC_BUILTIN_NUMBER
    value: "26029"
    - name: KUSCIA_PORT_SERVICE_NUMBER
    value: "26030"
    - name: SERVING_ID
    value: server-are3sv4giytnufe0
    image: secretflow-registry.cn-hangzhou.cr.aliyuncs.com/secretflow/serving-anolis8:0.4.0b0
    imagePullPolicy: IfNotPresent
    livenessProbe:
    failureThreshold: 3
    httpGet:
    path: /health
    port: brpc-builtin
    scheme: HTTP
    periodSeconds: 10
    successThreshold: 1
    timeoutSeconds: 1
    name: secretflow
    ports:
    - containerPort: 26030
    name: service
    protocol: TCP
    - containerPort: 26027
    name: communication
    protocol: TCP
    - containerPort: 26028
    name: internal
    protocol: TCP
    - containerPort: 26029
    name: brpc-builtin
    protocol: TCP
    readinessProbe:
    failureThreshold: 3
    httpGet:
    path: /health
    port: brpc-builtin
    scheme: HTTP
    periodSeconds: 10
    successThreshold: 1
    timeoutSeconds: 1
    resources:
    limits:
    cpu: 100m
    memory: 100Mi
    requests:
    cpu: 100m
    memory: 100Mi
    startupProbe:
    failureThreshold: 30
    httpGet:
    path: /health
    port: brpc-builtin
    scheme: HTTP
    periodSeconds: 10
    successThreshold: 1
    timeoutSeconds: 1
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: FallbackToLogsOnError
    volumeMounts:
    - mountPath: /etc/kuscia/serving-config.conf
    name: config-template
    subPath: serving-config.conf
    workingDir: /root/sf_serving
    dnsPolicy: ClusterFirst
    nodeSelector:
    kuscia.secretflow/namespace: p42
    restartPolicy: Always
    schedulerName: kuscia-scheduler
    securityContext: {}
    terminationGracePeriodSeconds: 30
    tolerations:
    - effect: NoSchedule
    key: kuscia.secretflow/agent
    operator: Exists
    volumes:
    - configMap:
    defaultMode: 420
    name: server-are3sv4giytnufe0-configtemplate
    name: config-template
    status:
      conditions:
      - lastTransitionTime: "2024-10-15T03:51:10Z"
        lastUpdateTime: "2024-10-15T03:51:10Z"
        message: Deployment does not have minimum availability.
        reason: MinimumReplicasUnavailable
        status: "False"
        type: Available
      - lastTransitionTime: "2024-10-15T03:51:10Z"
        lastUpdateTime: "2024-10-15T03:51:10Z"
        message: ReplicaSet "server-are3sv4giytnufe0-6f89bc49fb" is progressing.
        reason: ReplicaSetUpdated
        status: "True"
        type: Progressing
      observedGeneration: 1
      replicas: 1
      unavailableReplicas: 1
      updatedReplicas: 1
kind: List
metadata:
  resourceVersion: ""

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants