This section describes how to train a model on ModelArts by calling a series of APIs.
The process for creating a training job using the TensorFlow framework is as follows:
URI format: POST https://{iam_endpoint}/v3/auth/tokens
Request header: Content-Type → application/json
{
"auth": {
"identity": {
"methods": ["password"],
"password": {
"user": {
"name": "username",
"password": "**********",
"domain": {
"name": "domainname"
}
}
}
},
"scope": {
"project": {
"name": ""
}
}
}
}
Response header: X-Subject-Token → MIIZmgYJKoZIhvcNAQcCoIIZizCCGYcCAQExDTALBglghkgBZQMEAgEwgXXXXXX...
URI format: GET https://{ma_endpoint}/v1/{project_id}/job/resource-specs?job_type=train
Request header: X-Auth-Token → MIIZmgYJKoZIhvcNAQcCoIIZizCCGYcCAQExDTALBglghkgBZQMEAgEwgXXXXXX...
{
"specs": [
......
{
"spec_id": 7,
"core": "2",
"cpu": "8",
"gpu_num": 0,
"gpu_type": "",
"spec_code": "modelarts.vm.cpu.2u",
"unit_num": 1,
"max_num": 1,
"storage": "",
"interface_type": 1,
"no_resource": false
},
{
"spec_id": 27,
"core": "8",
"cpu": "32",
"gpu_num": 0,
"gpu_type": "",
"spec_code": "modelarts.vm.cpu.8u",
"unit_num": 1,
"max_num": 1,
"storage": "",
"interface_type": 1,
"no_resource": false
}
],
"is_success": true,
"spec_total_count": 5
}
URI format: GET https://{ma_endpoint}/v1/{project_id}/job/ai-engines?job_type=train
Request header: X-Auth-Token → MIIZmgYJKoZIhvcNAQcCoIIZizCCGYcCAQExDTALBglghkgBZQMEAgEwgXXXXXX...
{
"engines": [
{
"engine_type": 1,
"engine_name": "TensorFlow",
"engine_id": 3,
"engine_version": "TF-1.8.0-python2.7"
},
{
"engine_type": 1,
"engine_name": "TensorFlow",
"engine_id": 4,
"engine_version": "TF-1.8.0-python3.6"
},
......
{
"engine_type": 9,
"engine_name": "XGBoost-Sklearn",
"engine_id": 100,
"engine_version": "XGBoost-0.80-Sklearn-0.18.1-python3.6"
}
],
"is_success": true
}
Select the engine required for the training job based on the engine_name and engine_version fields, and record its engine_id. This section creates a job with the TensorFlow engine TF-1.8.0-python3.6, so the recorded engine_id is 4.
URI format: POST https://{ma_endpoint}/v1/{project_id}/training-jobs
{
"job_name": "jobtest_TF",
"job_desc": "using TensorFlow for handwritten digit recognition",
"config": {
"worker_server_num": 1,
"parameter": [],
"flavor": {
"code": "modelarts.vm.cpu.8u"
},
"train_url": "/test-modelarts/mnist-model/output/",
"engine_id": 4,
"app_url": "/test-modelarts/mnist-tensorflow-code/",
"boot_file_url": "/test-modelarts/mnist-tensorflow-code/train_mnist_tf.py",
"data_source": [
{
"type": "obs",
"data_url": "/test-modelarts/dataset-mnist/"
}
]
},
"notification": {
"topic_urn": "",
"events": []
},
"workspace_id": "0"
}
{
"version_name": "V0001",
"job_name": "jobtest_TF",
"create_time": 1609121837000,
"job_id": 567524,
"resource_id": "jobaedef089",
"version_id": 1108482,
"is_success": true,
"status": 1
}
URI format: GET https://{ma_endpoint}/v1/{project_id}/training-jobs/567524/versions/1108482
Request header: X-Auth-Token → MIIZmgYJKoZIhvcNAQcCoIIZizCCGYcCAQExDTALBglghkgBZQMEAgEwgXXXXXX...
{
"dataset_name": null,
"duration": 1326,
"spec_code": "modelarts.vm.cpu.8u",
"parameter": [],
"start_time": 1609121913000,
"model_outputs": [],
"engine_name": "TensorFlow",
"error_result": null,
"gpu_type": "",
"user_frame_image": null,
"gpu": null,
"dataset_id": null,
"nas_mount_path": null,
"task_summary": {},
"max_num": 1,
"model_metric_list": "{}",
"is_zombie": null,
"flavor_code": "modelarts.vm.cpu.8u",
"gpu_num": 0,
"train_url": "/test-modelarts/mnist-model/output/",
"engine_type": 1,
"job_name": "jobtest_TF",
"nas_type": "efs",
"outputs": null,
"job_id": 567524,
"data_url": "/test-modelarts/dataset-mnist/",
"log_url": null,
"boot_file_url": "/test-modelarts/mnist-tensorflow-code/train_mnist_tf.py",
"volumes": null,
"dataset_version_id": null,
"algorithm_id": null,
"worker_server_num": 1,
"pool_type": "SYSTEM_DEFINED",
"autosearch_config": null,
"job_desc": "using TensorFlow for handwritten digit recognition",
"inputs": null,
"model_id": null,
"dataset_version_name": null,
"pool_name": "hec-train-pub-cpu",
"engine_version": "TF-1.8.0-python3.6",
"system_metric_list": {
"recvBytesRate": [
"0",
"0"
],
"cpuUsage": [
"0",
"0"
],
"sendBytesRate": [
"0",
"0"
],
"memUsage": [
"0",
"0"
],
"gpuUtil": [
"0",
"0"
],
"gpuMemUsage": [
"0",
"0"
],
"interval": 1,
"diskWriteRate": [
"0",
"0"
],
"diskReadRate": [
"0",
"0"
]
},
"retrain_model_id": null,
"version_name": "V0001",
"pod_version": "1.8.0-cp36",
"engine_id": 4,
"status": 10,
"cpu": "32",
"user_image_url": null,
"spec_id": 27,
"is_success": true,
"storage": "",
"nas_share_addr": null,
"version_id": 1108482,
"no_resource": false,
"user_command": null,
"resource_id": "jobaedef089",
"core": "8",
"npu_info": null,
"app_url": "/test-modelarts/mnist-tensorflow-code/",
"data_source": [
{
"type": "obs",
"data_url": "/test-modelarts/dataset-mnist/"
}
],
"pre_version_id": null,
"create_time": 1609121837000,
"job_type": 1,
"pool_id": "pool7d1e384a"
}
The response contains the version details of the training job. The value of status is 10, indicating that the training job has completed successfully.
URI format: GET https://{ma_endpoint}/v1/{project_id}/training-jobs/567524/versions/1108482/log/file-names
Request header: X-Auth-Token → MIIZmgYJKoZIhvcNAQcCoIIZizCCGYcCAQExDTALBglghkgBZQMEAgEwgXXXXXX...
Set the placeholder fields in braces (for example, {ma_endpoint} and {project_id}) based on the site requirements.
{
"is_success": true,
"log_file_list": [
"job-jobtest-tf.0"
]
}
Only one log file named job-jobtest-tf.0 exists.
URI format: GET https://{ma_endpoint}/v1/{project_id}/training-jobs/567524/versions/1108482/aom-log?log_file=job-jobtest-tf.0&lines=8&order=desc
Request header: X-Auth-Token → MIIZmgYJKoZIhvcNAQcCoIIZizCCGYcCAQExDTALBglghkgBZQMEAgEwgXXXXXX...
{
"start_line": "1609121886518240330",
"lines": 8,
"is_success": true,
"end_line": "1609121900042593083",
"content": "Done exporting!\n\n[Modelarts Service Log]Training completed.\n\n[ModelArts Service Log]modelarts-pipe: will create log file /tmp/log/jobtest_TF.log\n\n[ModelArts Service Log]modelarts-pipe: will create log file /tmp/log/jobtest_TF.log\n\n[ModelArts Service Log]modelarts-pipe: will write log file /tmp/log/jobtest_TF.log\n\n[ModelArts Service Log]modelarts-pipe: param for max log length: 1073741824\n\n[ModelArts Service Log]modelarts-pipe: param for whether exit on overflow: 0\n\n[ModelArts Service Log]modelarts-pipe: total length: 23303\n"
}
URI format: DELETE https://{ma_endpoint}/v1/{project_id}/training-jobs/567524
Request header: X-Auth-Token → MIIZmgYJKoZIhvcNAQcCoIIZizCCGYcCAQExDTALBglghkgBZQMEAgEwgXXXXXX...
Set the placeholder fields in braces (for example, {ma_endpoint} and {project_id}) based on the site requirements.
{
"is_success": true
}