This section describes how to train a model by calling ModelArts APIs.
The process for creating a training job using PyTorch is as follows:
URI: GET https://{ma_endpoint}/v2/{project_id}/training-job-flavors?flavor_type=CPU
Request header: X-Auth-Token →MIIZmgYJKoZIhvcNAQcCoIIZizCCGYcCAQExDTALBglghkgBZQMEAgEwgXXXXXX...
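Set the parameters in the request (ma_endpoint, project_id, and flavor_type) based on site requirements. If the request is successful, a response similar to the following is returned: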
{
"total_count": 2,
"flavors": [
{
"flavor_id": "modelarts.vm.cpu.2u",
"flavor_name": "Computing CPU(2U) instance",
"flavor_type": "CPU",
"billing": {
"code": "modelarts.vm.cpu.2u",
"unit_num": 1
},
"flavor_info": {
"max_num": 1,
"cpu": {
"arch": "x86",
"core_num": 2
},
"memory": {
"size": 8,
"unit": "GB"
},
"disk": {
"size": 50,
"unit": "GB"
}
}
},
{
"flavor_id": "modelarts.vm.cpu.8u",
"flavor_name": "Computing CPU(8U) instance",
"flavor_type": "CPU",
"billing": {
"code": "modelarts.vm.cpu.8u",
"unit_num": 1
},
"flavor_info": {
"max_num": 16,
"cpu": {
"arch": "x86",
"core_num": 8
},
"memory": {
"size": 32,
"unit": "GB"
},
"disk": {
"size": 50,
"unit": "GB"
}
}
}
]
}
URI: GET https://{ma_endpoint}/v2/{project_id}/job/training-job-engines
Request header:
X-Auth-Token→MIIZmgYJKoZIhvcNAQcCoIIZizCCGYcCAQExDTALBglghkgBZQMEAgEwgXXXXXX...
Content-Type →application/json
Set the bold parameters based on site requirements.
{
"total": 28,
"items": [
......
{
"engine_id": "mindspore_1.6.0-cann_5.0.3.6-py_3.7-euler_2.8.3-aarch64",
"engine_name": "Powered-Engine",
"engine_version": "mindspore_1.6.0-cann_5.0.3.6-py_3.7-euler_2.8.3-aarch64",
"v1_compatible": false,
"run_user": "1000",
"image_info": {
"cpu_image_url": "",
"gpu_image_url": "atelier/mindspore_1_6_0:train",
"image_version": "mindspore_1.6.0-cann_5.0.3.6-py_3.7-euler_2.8.3-aarch64-snt9-roma-20211231193205-33131ee"
}
},
......
{
"engine_id": "pytorch_1.8.0-cuda_10.2-py_3.7-ubuntu_18.04-x86_64",
"engine_name": "PyTorch",
"engine_version": "pytorch_1.8.0-cuda_10.2-py_3.7-ubuntu_18.04-x86_64",
"tags": [
{
"key": "auto_search",
"value": "True"
}
],
"v1_compatible": false,
"run_user": "1102",
"image_info": {
"cpu_image_url": "aip/pytorch_1_8:train",
"gpu_image_url": "aip/pytorch_1_8:train",
"image_version": "pytorch_1.8.0-cuda_10.2-py_3.7-ubuntu_18.04-x86_64-20210912152543-1e0838d"
}
},
......
{
"engine_id": "tensorflow_2.1.0-cuda_10.1-py_3.7-ubuntu_18.04-x86_64",
"engine_name": "TensorFlow",
"engine_version": "tensorflow_2.1.0-cuda_10.1-py_3.7-ubuntu_18.04-x86_64",
"tags": [
{
"key": "auto_search",
"value": "True"
}
],
"v1_compatible": false,
"run_user": "1102",
"image_info": {
"cpu_image_url": "aip/tensorflow_2_1:train",
"gpu_image_url": "aip/tensorflow_2_1:train",
"image_version": "tensorflow_2.1.0-cuda_10.1-py_3.7-ubuntu_18.04-x86_64-20210912152543-1e0838d"
}
},
......
]
}
Select the engine flavor required for creating a training job based on the engine_name and engine_version fields, and record the field values. This section uses the PyTorch engine as an example to describe how to create a job. In this example, the engine_name value is PyTorch, and the engine_version value is pytorch_1.8.0-cuda_10.2-py_3.7-ubuntu_18.04-x86_64.
URI: POST https://{ma_endpoint}/v2/{project_id}/algorithms
Request header:
X-Auth-Token→MIIZmgYJKoZIhvcNAQcCoIIZizCCGYcCAQExDTALBglghkgBZQMEAgEwgXXXXXX...
Content-Type →application/json
Set the bold parameters based on site requirements.
Request body:
{
"metadata": {
"name": "test-pytorch-cpu",
"description": "test pytorch job in cpu in mode gloo"
},
"job_config": {
"boot_file": "/cnnorth4-job-test-v2/pytorch/fast_example/code/cpu/test-pytorch.py",
"code_dir": "/cnnorth4-job-test-v2/pytorch/fast_example/code/cpu/",
"engine": {
"engine_name": "PyTorch",
"engine_version": "pytorch_1.8.0-cuda_10.2-py_3.7-ubuntu_18.04-x86_64"
},
"inputs": [{
"name": "data_url",
"description": "Data source 1"
}],
"outputs": [{
"name": "train_url",
"description": "Output data 1"
}],
"parameters": [{
"name": "dist",
"description": "",
"value": "False",
"constraint": {
"editable": true,
"required": false,
"sensitive": false,
"type": "Boolean",
"valid_range": [],
"valid_type": "None"
}
},
{
"name": "world_size",
"description": "",
"value": "1",
"constraint": {
"editable": true,
"required": false,
"sensitive": false,
"type": "Integer",
"valid_range": [],
"valid_type": "None"
}
}
],
"parameters_customization": true
},
"resource_requirements": []
}
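Set the parameters in the request (ma_endpoint, project_id, and the request body fields) based on site requirements. If the request is successful, a response similar to the following is returned: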
{
"metadata": {
"id": "01c399ae-8593-4ef5-9e4d-085950aacde1",
"name": "test-pytorch-cpu",
"description": "test pytorch job in cpu in mode gloo",
"create_time": 1641890623262,
"workspace_id": "0",
"ai_project": "default-ai-project",
"user_name": "",
"domain_id": "0659fbf6de00109b0ff1c01fc037d240",
"source": "custom",
"api_version": "",
"is_valid": true,
"state": "",
"size": 4790,
"tags": null,
"attr_list": null,
"version_num": 0,
"update_time": 0
},
"share_info": {},
"job_config": {
"code_dir": "/cnnorth4-job-test-v2/pytorch/fast_example/code/cpu/",
"boot_file": "/cnnorth4-job-test-v2/pytorch/fast_example/code/cpu/test-pytorch.py",
"parameters": [
{
"name": "dist",
"description": "",
"i18n_description": null,
"value": "False",
"constraint": {
"type": "Boolean",
"editable": true,
"required": false,
"sensitive": false,
"valid_type": "None",
"valid_range": []
}
},
{
"name": "world_size",
"description": "",
"i18n_description": null,
"value": "1",
"constraint": {
"type": "Integer",
"editable": true,
"required": false,
"sensitive": false,
"valid_type": "None",
"valid_range": []
}
}
],
"parameters_customization": true,
"inputs": [
{
"name": "data_url",
"description": "Data source 1"
}
],
"outputs": [
{
"name": "train_url",
"description": "Output data 1"
}
],
"engine": {
"engine_id": "pytorch_1.8.0-cuda_10.2-py_3.7-ubuntu_18.04-x86_64",
"engine_name": "PyTorch",
"engine_version": "pytorch_1.8.0-cuda_10.2-py_3.7-ubuntu_18.04-x86_64",
"tags": [
{
"key": "auto_search",
"value": "True"
}
],
"v1_compatible": false,
"run_user": "1102",
"image_info": {
"cpu_image_url": "aip/pytorch_1_8:train",
"gpu_image_url": "aip/pytorch_1_8:train",
"image_version": "pytorch_1.8.0-cuda_10.2-py_3.7-ubuntu_18.04-x86_64-20210912152543-1e0838d"
}
},
"code_tree": {
"name": "cpu/",
"children": [
{
"name": "test-pytorch.py"
}
]
}
},
"resource_requirements": [],
"advanced_config": {}
}
Record the value of id (the algorithm ID, a UUID consisting of 32 hexadecimal digits) in the metadata field for subsequent steps.
URI: POST https://{ma_endpoint}/v2/{project_id}/training-jobs
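Request header:
X-Auth-Token→MIIZmgYJKoZIhvcNAQcCoIIZizCCGYcCAQExDTALBglghkgBZQMEAgEwgXXXXXX...
Content-Type →application/json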
Set the bold parameters based on site requirements.
Request body:
{
"kind": "job",
"metadata": {
"name": "test-pytorch-cpu01",
"description": "test pytorch work cpu in mode gloo"
},
"algorithm": {
"id": "01c399ae-8593-4ef5-9e4d-085950aacde1",
"parameters": [{
"name": "dist",
"value": "False"
},
{
"name": "world_size",
"value": "1"
}
],
"inputs": [{
"name": "data_url",
"remote": {
"obs": {
"obs_url": "/cnnorth4-job-test-v2/pytorch/fast_example/data/"
}
}
}],
"outputs": [{
"name": "train_url",
"remote": {
"obs": {
"obs_url": "/cnnorth4-job-test-v2/pytorch/fast_example/outputs/"
}
}
}]
},
"spec": {
"resource": {
"flavor_id": "modelarts.vm.cpu.8u",
"node_count": 1
},
"log_export_path": {
"obs_url": "/cnnorth4-job-test-v2/pytorch/fast_example/log/"
}
}
}
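Set the parameters in the request (ma_endpoint, project_id, and the request body fields) based on site requirements. If the request is successful, a response similar to the following is returned. Record the value of id (the training job ID) in the metadata field for subsequent steps.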
{
"kind": "job",
"metadata": {
"id": "66ff6991-fd66-40b6-8101-0829a46d3731",
"name": "test-pytorch-cpu01",
"description": "test pytorch work cpu in mode gloo",
"create_time": 1641892642625,
"workspace_id": "0",
"ai_project": "default-ai-project",
"user_name": "",
"annotations": {
"job_template": "Template DL",
"key_task": "worker"
}
},
"status": {
"phase": "Creating",
"secondary_phase": "Creating",
"duration": 0,
"start_time": 0,
"node_count_metrics": null,
"tasks": [
"worker-0"
]
},
"algorithm": {
"id": "01c399ae-8593-4ef5-9e4d-085950aacde1",
"name": "test-pytorch-cpu",
"code_dir": "/cnnorth4-job-test-v2/pytorch/fast_example/code/cpu/",
"boot_file": "/cnnorth4-job-test-v2/pytorch/fast_example/code/cpu/test-pytorch.py",
"parameters": [
{
"name": "dist",
"description": "",
"i18n_description": null,
"value": "False",
"constraint": {
"type": "Boolean",
"editable": true,
"required": false,
"sensitive": false,
"valid_type": "None",
"valid_range": []
}
},
{
"name": "world_size",
"description": "",
"i18n_description": null,
"value": "1",
"constraint": {
"type": "Integer",
"editable": true,
"required": false,
"sensitive": false,
"valid_type": "None",
"valid_range": []
}
}
],
"parameters_customization": true,
"inputs": [
{
"name": "data_url",
"description": "Data source 1",
"local_dir": "/home/ma-user/modelarts/inputs/data_url_0",
"remote": {
"obs": {
"obs_url": "/cnnorth4-job-test-v2/pytorch/fast_example/data/"
}
}
}
],
"outputs": [
{
"name": "train_url",
"description": "Output data 1",
"local_dir": "/home/ma-user/modelarts/outputs/train_url_0",
"remote": {
"obs": {
"obs_url": "/cnnorth4-job-test-v2/pytorch/fast_example/outputs/"
}
},
"mode": "upload_periodically",
"period": 30
}
],
"engine": {
"engine_id": "pytorch_1.8.0-cuda_10.2-py_3.7-ubuntu_18.04-x86_64",
"engine_name": "PyTorch",
"engine_version": "pytorch_1.8.0-cuda_10.2-py_3.7-ubuntu_18.04-x86_64",
"usage": "training",
"support_groups": "public",
"tags": [
{
"key": "auto_search",
"value": "True"
}
],
"v1_compatible": false,
"run_user": "1102"
}
},
"spec": {
"resource": {
"flavor_id": "modelarts.vm.cpu.8u",
"flavor_name": "Computing CPU(8U) instance",
"node_count": 1,
"flavor_detail": {
"flavor_type": "CPU",
"billing": {
"code": "modelarts.vm.cpu.8u",
"unit_num": 1
},
"flavor_info": {
"cpu": {
"arch": "x86",
"core_num": 8
},
"memory": {
"size": 32,
"unit": "GB"
},
"disk": {
"size": 50,
"unit": "GB"
}
}
}
},
"log_export_path": {
"obs_url": "/cnnorth4-job-test-v2/pytorch/fast_example/log/"
},
"is_hosted_log": true
}
}
URI: GET https://{ma_endpoint}/v2/{project_id}/training-jobs/{training_job_id}
Request header: X-Auth-Token →MIIZmgYJKoZIhvcNAQcCoIIZizCCGYcCAQExDTALBglghkgBZQMEAgEwgXXXXXX...
Set the following parameter based on site requirements:
Set training_job_id to the training job ID returned when the training job was created (the id value in the metadata field of the creation response).
{
"kind": "job",
"metadata": {
"id": "66ff6991-fd66-40b6-8101-0829a46d3731",
"name": "test-pytorch-cpu01",
"description": "test pytorch work cpu in mode gloo",
"create_time": 1641892642625,
"workspace_id": "0",
"ai_project": "default-ai-project",
"user_name": "hwstaff_z00424192",
"annotations": {
"job_template": "Template DL",
"key_task": "worker"
}
},
"status": {
"phase": "Running",
"secondary_phase": "Running",
"duration": 268000,
"start_time": 1641892655000,
"node_count_metrics": [
[
1641892645000,
0
],
[
1641892654000,
0
],
[
1641892655000,
1
],
[
1641892922000,
1
],
[
1641892923000,
1
]
],
"tasks": [
"worker-0"
]
},
"algorithm": {
"id": "01c399ae-8593-4ef5-9e4d-085950aacde1",
"name": "test-pytorch-cpu",
"code_dir": "/cnnorth4-job-test-v2/pytorch/fast_example/code/cpu/",
"boot_file": "/cnnorth4-job-test-v2/pytorch/fast_example/code/cpu/test-pytorch.py",
"parameters": [
{
"name": "dist",
"description": "",
"i18n_description": null,
"value": "False",
"constraint": {
"type": "Boolean",
"editable": true,
"required": false,
"sensitive": false,
"valid_type": "None",
"valid_range": []
}
},
{
"name": "world_size",
"description": "",
"i18n_description": null,
"value": "1",
"constraint": {
"type": "Integer",
"editable": true,
"required": false,
"sensitive": false,
"valid_type": "None",
"valid_range": []
}
}
],
"parameters_customization": true,
"inputs": [
{
"name": "data_url",
"description": "Data source 1",
"local_dir": "/home/ma-user/modelarts/inputs/data_url_0",
"remote": {
"obs": {
"obs_url": "/cnnorth4-job-test-v2/pytorch/fast_example/data/"
}
}
}
],
"outputs": [
{
"name": "train_url",
"description": "Output data 1",
"local_dir": "/home/ma-user/modelarts/outputs/train_url_0",
"remote": {
"obs": {
"obs_url": "/cnnorth4-job-test-v2/pytorch/fast_example/outputs/"
}
},
"mode": "upload_periodically",
"period": 30
}
],
"engine": {
"engine_id": "pytorch_1.8.0-cuda_10.2-py_3.7-ubuntu_18.04-x86_64",
"engine_name": "PyTorch",
"engine_version": "pytorch_1.8.0-cuda_10.2-py_3.7-ubuntu_18.04-x86_64",
"usage": "training",
"support_groups": "public",
"tags": [
{
"key": "auto_search",
"value": "True"
}
],
"v1_compatible": false,
"run_user": "1102"
}
},
"spec": {
"resource": {
"flavor_id": "modelarts.vm.cpu.8u",
"flavor_name": "Computing CPU(8U) instance",
"node_count": 1,
"flavor_detail": {
"flavor_type": "CPU",
"billing": {
"code": "modelarts.vm.cpu.8u",
"unit_num": 1
},
"flavor_info": {
"cpu": {
"arch": "x86",
"core_num": 8
},
"memory": {
"size": 32,
"unit": "GB"
},
"disk": {
"size": 50,
"unit": "GB"
}
}
}
},
"log_export_path": {
"obs_url": "/cnnorth4-job-test-v2/pytorch/fast_example/log/"
},
"is_hosted_log": true
}
}
You can learn about the details of the training job from the response. The phase value in the status field is Running, indicating that the training job is running.
URI format: GET https://{ma_endpoint}/v2/{project_id}/training-jobs/{training_job_id}/tasks/{task_id}/logs/url
Request header:
X-Auth-Token→MIIZmgYJKoZIhvcNAQcCoIIZizCCGYcCAQExDTALBglghkgBZQMEAgEwgXXXXXX...
Content-Type→text/plain
Set the following parameters based on site requirements:
The returned field indicates the OBS path of logs. You can copy the value to the browser to view the result.
URI format: GET https://{ma_endpoint}/v2/{project_id}/training-jobs/{training_job_id}/metrics/{task_id}
Request header: X-Auth-Token →MIIZmgYJKoZIhvcNAQcCoIIZizCCGYcCAQExDTALBglghkgBZQMEAgEwgXXXXXX...
Set the bold parameters based on site requirements.
{
"metrics": [
{
"metric": "cpuUsage",
"value": [
-1,
-1,
28.622,
35.053,
39.988,
40.069,
40.082,
40.094
]
},
{
"metric": "memUsage",
"value": [
-1,
-1,
0.544,
0.641,
0.736,
0.737,
0.738,
0.739
]
},
{
"metric": "npuUtil",
"value": [
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1
]
},
{
"metric": "npuMemUsage",
"value": [
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1
]
},
{
"metric": "gpuUtil",
"value": [
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1
]
},
{
"metric": "gpuMemUsage",
"value": [
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1
]
}
]
}
You can view the metrics such as the CPU usage.
URI: DELETE https://{ma_endpoint}/v2/{project_id}/training-jobs/{training_job_id}
Request header: X-Auth-Token →MIIZmgYJKoZIhvcNAQcCoIIZizCCGYcCAQExDTALBglghkgBZQMEAgEwgXXXXXX...
Set the bold parameters based on site requirements.