diff --git a/docs/modelarts/best-practice/ALL_META.TXT.json b/docs/modelarts/best-practice/ALL_META.TXT.json new file mode 100644 index 00000000..e510b10b --- /dev/null +++ b/docs/modelarts/best-practice/ALL_META.TXT.json @@ -0,0 +1,479 @@ +[ + { + "uri":"modelarts_10_0150.html", + "node_id":"en-us_topic_0000001679516496.xml", + "product_code":"", + "code":"1", + "des":"HUAWEI CLOUD Help Center presents technical documents to help you quickly get started with HUAWEI CLOUD services. The technical documents include Service Overview, Price Details, Purchase Guide, User Guide, API Reference, Best Practices, FAQs, and Videos.", + "doc_type":"", + "kw":"Permissions Management", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"Permissions Management", + "githuburl":"" + }, + { + "uri":"modelarts_24_0078.html", + "node_id":"en-us_topic_0000001679679248.xml", + "product_code":"modelarts", + "code":"2", + "des":"ModelArts allows you to configure fine-grained permissions for refined management of resources and permissions. This is commonly used by large enterprises, but it is comp", + "doc_type":"usermanual", + "kw":"Basic Concepts,Permissions Management,Best Practices", + "search_title":"", + "metedata":[ + { + "prodname":"modelarts", + "opensource":"true", + "documenttype":"usermanual", + "IsBot":"Yes", + "IsMulti":"Yes" + } + ], + "title":"Basic Concepts", + "githuburl":"" + }, + { + "uri":"modelarts_24_0079.html", + "node_id":"en-us_topic_0000001727798129.xml", + "product_code":"", + "code":"3", + "des":"HUAWEI CLOUD Help Center presents technical documents to help you quickly get started with HUAWEI CLOUD services. The technical documents include Service Overview, Price Details, Purchase Guide, User Guide, API Reference, Best Practices, FAQs, and Videos.", + "doc_type":"", + "kw":"Permission Management Mechanisms", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"Permission Management Mechanisms", + "githuburl":"" + }, + { + "uri":"modelarts_24_0080.html", + "node_id":"en-us_topic_0000001679679160.xml", + "product_code":"", + "code":"4", + "des":"This section describes the IAM permission configurations for all ModelArts functions.If no fine-grained authorization policy is configured for a user created by the admin", + "doc_type":"", + "kw":"IAM,Permission Management Mechanisms,Best Practices", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"IAM", + "githuburl":"" + }, + { + "uri":"modelarts_24_0081.html", + "node_id":"en-us_topic_0000001727718241.xml", + "product_code":"", + "code":"5", + "des":"Function Dependency PoliciesWhen using ModelArts to develop algorithms or manage training jobs, you are required to use other Cloud services. For example, before submitti", + "doc_type":"", + "kw":"Agencies and Dependencies,Permission Management Mechanisms,Best Practices", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"Agencies and Dependencies", + "githuburl":"" + }, + { + "uri":"modelarts_24_0082.html", + "node_id":"en-us_topic_0000001679679168.xml", + "product_code":"", + "code":"6", + "des":"ModelArts allows you to create multiple workspaces to develop algorithms and manage and deploy models for different service objectives. 
In this way, the development outpu", + "doc_type":"", + "kw":"Workspace,Permission Management Mechanisms,Best Practices", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"Workspace", + "githuburl":"" + }, + { + "uri":"modelarts_24_0084.html", + "node_id":"en-us_topic_0000001727718221.xml", + "product_code":"", + "code":"7", + "des":"HUAWEI CLOUD Help Center presents technical documents to help you quickly get started with HUAWEI CLOUD services. The technical documents include Service Overview, Price Details, Purchase Guide, User Guide, API Reference, Best Practices, FAQs, and Videos.", + "doc_type":"", + "kw":"Configuration Practices in Typical Scenarios", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"Configuration Practices in Typical Scenarios", + "githuburl":"" + }, + { + "uri":"modelarts_24_0085.html", + "node_id":"en-us_topic_0000001727798133.xml", + "product_code":"", + "code":"8", + "des":"Certain ModelArts functions require access to Object Storage Service (OBS), Software Repository for Container (SWR), and Intelligent EdgeFabric (IEF). Before using ModelA", + "doc_type":"", + "kw":"Assigning Permissions to Individual Users for Using ModelArts,Configuration Practices in Typical Sce", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"Assigning Permissions to Individual Users for Using ModelArts", + "githuburl":"" + }, + { + "uri":"modelarts_24_0086.html", + "node_id":"en-us_topic_0000001679679164.xml", + "product_code":"", + "code":"9", + "des":"HUAWEI CLOUD Help Center presents technical documents to help you quickly get started with HUAWEI CLOUD services. The technical documents include Service Overview, Price Details, Purchase Guide, User Guide, API Reference, Best Practices, FAQs, and Videos.", + "doc_type":"", + "kw":"Assigning Basic Permissions for Using ModelArts", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"Assigning Basic Permissions for Using ModelArts", + "githuburl":"" + }, + { + "uri":"modelarts_10_0062.html", + "node_id":"en-us_topic_0000001727798125.xml", + "product_code":"", + "code":"10", + "des":"Certain ModelArts functions require the permission to access other services. This section describes how to assign specific permissions to IAM users when they use ModelArt", + "doc_type":"", + "kw":"Scenarios,Assigning Basic Permissions for Using ModelArts,Best Practices", + "search_title":"", + "metedata":[ + { + "opensource":"true", + "IsMulti":"No", + "IsBot":"Yes" + } + ], + "title":"Scenarios", + "githuburl":"" + }, + { + "uri":"modelarts_24_0089.html", + "node_id":"en-us_topic_0000001679838896.xml", + "product_code":"", + "code":"11", + "des":"Multiple IAM users can be created under a tenant user, and the permissions of the IAM users are managed by group. This section describes how to create a user group and IA", + "doc_type":"", + "kw":"Step 1 Create a User Group and Add Users to the User Group,Assigning Basic Permissions for Using Mod", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"Step 1 Create a User Group and Add Users to the User Group", + "githuburl":"" + }, + { + "uri":"modelarts_24_0090.html", + "node_id":"en-us_topic_0000001727798145.xml", + "product_code":"", + "code":"12", + "des":"An IAM user can use cloud services such as ModelArts and OBS only after they are assigned with permissions from the tenant user. 
This section describes how to assign the ", + "doc_type":"", + "kw":"Step 2 Assigning Permissions for Using Cloud Services,Assigning Basic Permissions for Using ModelArt", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"Step 2 Assigning Permissions for Using Cloud Services", + "githuburl":"" + }, + { + "uri":"modelarts_24_0091.html", + "node_id":"en-us_topic_0000001679838984.xml", + "product_code":"", + "code":"13", + "des":"After assigning IAM permissions, configure ModelArts access authorization for IAM users on the ModelArts page so that ModelArts can access dependent services such as OBS,", + "doc_type":"", + "kw":"Step 3 Configure Agent-based ModelArts Access Authorization for the User,Assigning Basic Permissions", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"Step 3 Configure Agent-based ModelArts Access Authorization for the User", + "githuburl":"" + }, + { + "uri":"modelarts_24_0092.html", + "node_id":"en-us_topic_0000001727798137.xml", + "product_code":"", + "code":"14", + "des":"It takes 15 to 30 minutes for the permissions configured in 4 to take effect. Therefore, wait for 30 minutes after the configuration and then verify the configuration.Log", + "doc_type":"", + "kw":"Step 4 Verify User Permissions,Assigning Basic Permissions for Using ModelArts,Best Practices", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"Step 4 Verify User Permissions", + "githuburl":"" + }, + { + "uri":"modelarts_24_0093.html", + "node_id":"en-us_topic_0000001727718225.xml", + "product_code":"", + "code":"15", + "des":"In small- and medium-sized teams, administrators need to globally control ModelArts resources, and developers only need to focus on their own instances. By default, a dev", + "doc_type":"", + "kw":"Separately Assigning Permissions to Administrators and Developers,Configuration Practices in Typical", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"Separately Assigning Permissions to Administrators and Developers", + "githuburl":"" + }, + { + "uri":"modelarts_24_0095.html", + "node_id":"en-us_topic_0000001679679152.xml", + "product_code":"", + "code":"16", + "des":"Any IAM user granted with the listAllNotebooks and listUsers permissions can click View all on the notebook page to view the instances of all users in the current IAM pro", + "doc_type":"", + "kw":"Viewing the Notebook Instances of All IAM Users Under One Tenant Account,Configuration Practices in ", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"Viewing the Notebook Instances of All IAM Users Under One Tenant Account", + "githuburl":"" + }, + { + "uri":"modelarts_24_0096.html", + "node_id":"en-us_topic_0000001727798225.xml", + "product_code":"modelarts", + "code":"17", + "des":"You can use Cloud Shell provided by the ModelArts console to log in to a running training container.You can use Cloud Shell to log in to a running training container usin", + "doc_type":"usermanual", + "kw":"Logging In to a Training Container Using Cloud Shell,Configuration Practices in Typical Scenarios,Be", + "search_title":"", + "metedata":[ + { + "prodname":"modelarts", + "IsMulti":"No", + "IsBot":"No", + "opensource":"true", + "documenttype":"usermanual" + } + ], + "title":"Logging In to a Training Container Using Cloud Shell", + "githuburl":"" + }, + { + "uri":"modelarts_24_0097.html", + "node_id":"en-us_topic_0000001727718321.xml", + "product_code":"", + "code":"18", + "des":"This section describes how to control the ModelArts permissions of a user so that the user is not 
allowed to use a public resource pool to create training jobs, create no", + "doc_type":"", + "kw":"Prohibiting a User from Using a Public Resource Pool,Configuration Practices in Typical Scenarios,Be", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"Prohibiting a User from Using a Public Resource Pool", + "githuburl":"" + }, + { + "uri":"modelarts_10_0001.html", + "node_id":"en-us_topic_0000001727355869.xml", + "product_code":"", + "code":"19", + "des":"ModelArts provides ExeML for service developers, freeing you from model development and parameter tuning. With ExeML, you can finish an AI development project in just thr", + "doc_type":"", + "kw":"Huawei Cloud Mascot Detection (Using ExeML for Object Detection),Best Practices", + "search_title":"", + "metedata":[ + { + "opensource":"true" + } + ], + "title":"Huawei Cloud Mascot Detection (Using ExeML for Object Detection)", + "githuburl":"" + }, + { + "uri":"modelarts_10_0002.html", + "node_id":"en-us_topic_0000001727435941.xml", + "product_code":"", + "code":"20", + "des":"Banks often predict whether customers would be interested in a time deposit based on their characteristics, including the age, work type, marital status, education backgr", + "doc_type":"", + "kw":"Bank Deposit Prediction (Using ExeML for Predictive Analytics),Best Practices", + "search_title":"", + "metedata":[ + { + "opensource":"true", + "IsMulti":"No", + "IsBot":"Yes" + } + ], + "title":"Bank Deposit Prediction (Using ExeML for Predictive Analytics)", + "githuburl":"" + }, + { + "uri":"modelarts_10_0080.html", + "node_id":"en-us_topic_0000001679356792.xml", + "product_code":"", + "code":"21", + "des":"This section describes how to modify a local custom algorithm to train and deploy models on ModelArts.This case describes how to use PyTorch 1.8 to recognize handwritten ", + "doc_type":"", + "kw":"Using a Custom Algorithm to Build a Handwritten Digit Recognition Model,Best Practices", + "search_title":"", + "metedata":[ + { + "opensource":"true", + "IsMulti":"No", + "IsBot":"Yes" + } + ], + "title":"Using a Custom Algorithm to Build a Handwritten Digit Recognition Model", + "githuburl":"" + }, + { + "uri":"develop-modelarts-0143.html", + "node_id":"en-us_topic_0000001679516552.xml", + "product_code":"modelarts", + "code":"22", + "des":"This section describes how to create an image and use it for training on ModelArts. The AI engine used in the image is horovod_0.22.1-pytorch_1.8.1, and the resources use", + "doc_type":"usermanual", + "kw":"Example: Creating a Custom Image for Training (Horovod-PyTorch and GPUs),Best Practices", + "search_title":"", + "metedata":[ + { + "prodname":"modelarts", + "documenttype":"usermanual" + } + ], + "title":"Example: Creating a Custom Image for Training (Horovod-PyTorch and GPUs)", + "githuburl":"" + }, + { + "uri":"develop-modelarts-0144.html", + "node_id":"en-us_topic_0000001679516596.xml", + "product_code":"modelarts", + "code":"23", + "des":"This section describes how to create an image and use it for training on ModelArts. 
The AI engine used in the image is MindSpore, and the resources used for training are ", + "doc_type":"usermanual", + "kw":"Example: Creating a Custom Image for Training (MindSpore and GPUs),Best Practices", + "search_title":"", + "metedata":[ + { + "prodname":"modelarts", + "documenttype":"usermanual" + } + ], + "title":"Example: Creating a Custom Image for Training (MindSpore and GPUs)", + "githuburl":"" + }, + { + "uri":"develop-modelarts-0145.html", + "node_id":"en-us_topic_0000001727355837.xml", + "product_code":"modelarts", + "code":"24", + "des":"This section describes how to create an image and use it for training on ModelArts. The AI engine used in the image is TensorFlow, and the resources used for training are", + "doc_type":"usermanual", + "kw":"Example: Creating a Custom Image for Training (TensorFlow and GPUs),Best Practices", + "search_title":"", + "metedata":[ + { + "prodname":"modelarts", + "documenttype":"usermanual" + } + ], + "title":"Example: Creating a Custom Image for Training (TensorFlow and GPUs)", + "githuburl":"" + }, + { + "uri":"modelarts_10_0072.html", + "node_id":"en-us_topic_0000001727355817.xml", + "product_code":"modelarts", + "code":"25", + "des":"If you want to use an AI engine that is not supported by ModelArts, create a custom image for the engine, import the image to ModelArts, and use the image to create AI ap", + "doc_type":"usermanual", + "kw":"Creating a Custom Image and Using It to Create an AI Application,Best Practices", + "search_title":"", + "metedata":[ + { + "prodname":"modelarts", + "documenttype":"usermanual" + } + ], + "title":"Creating a Custom Image and Using It to Create an AI Application", + "githuburl":"" + }, + { + "uri":"modelarts_04_0203.html", + "node_id":"en-us_topic_0000001679516500.xml", + "product_code":"", + "code":"26", + "des":"This section describes how to enable an inference service to access the Internet.An inference service accesses the Internet in the following scenarios:After an image is i", + "doc_type":"", + "kw":"Enabling an Inference Service to Access the Internet,Best Practices", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"Enabling an Inference Service to Access the Internet", + "githuburl":"" + }, + { + "uri":"modelarts_04_0204.html", + "node_id":"en-us_topic_0000001727355769.xml", + "product_code":"", + "code":"27", + "des":"The end-to-end O&M of ModelArts inference services involves the entire AI process including algorithm development, service O&M, and service running.End-to-End O&M Process", + "doc_type":"", + "kw":"End-to-End O&M of Inference Services,Best Practices", + "search_title":"", + "metedata":[ + { + + } + ], + "title":"End-to-End O&M of Inference Services", + "githuburl":"" + } +] \ No newline at end of file diff --git a/docs/modelarts/best-practice/CLASS.TXT.json b/docs/modelarts/best-practice/CLASS.TXT.json new file mode 100644 index 00000000..d68f1f18 --- /dev/null +++ b/docs/modelarts/best-practice/CLASS.TXT.json @@ -0,0 +1,245 @@ +[ + { + "desc":"HUAWEI CLOUD Help Center presents technical documents to help you quickly get started with HUAWEI CLOUD services. The technical documents include Service Overview, Price Details, Purchase Guide, User Guide, API Reference, Best Practices, FAQs, and Videos.", + "product_code":"", + "title":"Permissions Management", + "uri":"modelarts_10_0150.html", + "doc_type":"", + "p_code":"", + "code":"1" + }, + { + "desc":"ModelArts allows you to configure fine-grained permissions for refined management of resources and permissions. 
This is commonly used by large enterprises, but it is comp", + "product_code":"modelarts", + "title":"Basic Concepts", + "uri":"modelarts_24_0078.html", + "doc_type":"usermanual", + "p_code":"1", + "code":"2" + }, + { + "desc":"HUAWEI CLOUD Help Center presents technical documents to help you quickly get started with HUAWEI CLOUD services. The technical documents include Service Overview, Price Details, Purchase Guide, User Guide, API Reference, Best Practices, FAQs, and Videos.", + "product_code":"", + "title":"Permission Management Mechanisms", + "uri":"modelarts_24_0079.html", + "doc_type":"", + "p_code":"1", + "code":"3" + }, + { + "desc":"This section describes the IAM permission configurations for all ModelArts functions.If no fine-grained authorization policy is configured for a user created by the admin", + "product_code":"", + "title":"IAM", + "uri":"modelarts_24_0080.html", + "doc_type":"", + "p_code":"3", + "code":"4" + }, + { + "desc":"Function Dependency PoliciesWhen using ModelArts to develop algorithms or manage training jobs, you are required to use other Cloud services. For example, before submitti", + "product_code":"", + "title":"Agencies and Dependencies", + "uri":"modelarts_24_0081.html", + "doc_type":"", + "p_code":"3", + "code":"5" + }, + { + "desc":"ModelArts allows you to create multiple workspaces to develop algorithms and manage and deploy models for different service objectives. In this way, the development outpu", + "product_code":"", + "title":"Workspace", + "uri":"modelarts_24_0082.html", + "doc_type":"", + "p_code":"3", + "code":"6" + }, + { + "desc":"HUAWEI CLOUD Help Center presents technical documents to help you quickly get started with HUAWEI CLOUD services. The technical documents include Service Overview, Price Details, Purchase Guide, User Guide, API Reference, Best Practices, FAQs, and Videos.", + "product_code":"", + "title":"Configuration Practices in Typical Scenarios", + "uri":"modelarts_24_0084.html", + "doc_type":"", + "p_code":"1", + "code":"7" + }, + { + "desc":"Certain ModelArts functions require access to Object Storage Service (OBS), Software Repository for Container (SWR), and Intelligent EdgeFabric (IEF). Before using ModelA", + "product_code":"", + "title":"Assigning Permissions to Individual Users for Using ModelArts", + "uri":"modelarts_24_0085.html", + "doc_type":"", + "p_code":"7", + "code":"8" + }, + { + "desc":"HUAWEI CLOUD Help Center presents technical documents to help you quickly get started with HUAWEI CLOUD services. The technical documents include Service Overview, Price Details, Purchase Guide, User Guide, API Reference, Best Practices, FAQs, and Videos.", + "product_code":"", + "title":"Assigning Basic Permissions for Using ModelArts", + "uri":"modelarts_24_0086.html", + "doc_type":"", + "p_code":"7", + "code":"9" + }, + { + "desc":"Certain ModelArts functions require the permission to access other services. This section describes how to assign specific permissions to IAM users when they use ModelArt", + "product_code":"", + "title":"Scenarios", + "uri":"modelarts_10_0062.html", + "doc_type":"", + "p_code":"9", + "code":"10" + }, + { + "desc":"Multiple IAM users can be created under a tenant user, and the permissions of the IAM users are managed by group. 
This section describes how to create a user group and IA", + "product_code":"", + "title":"Step 1 Create a User Group and Add Users to the User Group", + "uri":"modelarts_24_0089.html", + "doc_type":"", + "p_code":"9", + "code":"11" + }, + { + "desc":"An IAM user can use cloud services such as ModelArts and OBS only after they are assigned with permissions from the tenant user. This section describes how to assign the ", + "product_code":"", + "title":"Step 2 Assigning Permissions for Using Cloud Services", + "uri":"modelarts_24_0090.html", + "doc_type":"", + "p_code":"9", + "code":"12" + }, + { + "desc":"After assigning IAM permissions, configure ModelArts access authorization for IAM users on the ModelArts page so that ModelArts can access dependent services such as OBS,", + "product_code":"", + "title":"Step 3 Configure Agent-based ModelArts Access Authorization for the User", + "uri":"modelarts_24_0091.html", + "doc_type":"", + "p_code":"9", + "code":"13" + }, + { + "desc":"It takes 15 to 30 minutes for the permissions configured in 4 to take effect. Therefore, wait for 30 minutes after the configuration and then verify the configuration.Log", + "product_code":"", + "title":"Step 4 Verify User Permissions", + "uri":"modelarts_24_0092.html", + "doc_type":"", + "p_code":"9", + "code":"14" + }, + { + "desc":"In small- and medium-sized teams, administrators need to globally control ModelArts resources, and developers only need to focus on their own instances. By default, a dev", + "product_code":"", + "title":"Separately Assigning Permissions to Administrators and Developers", + "uri":"modelarts_24_0093.html", + "doc_type":"", + "p_code":"7", + "code":"15" + }, + { + "desc":"Any IAM user granted with the listAllNotebooks and listUsers permissions can click View all on the notebook page to view the instances of all users in the current IAM pro", + "product_code":"", + "title":"Viewing the Notebook Instances of All IAM Users Under One Tenant Account", + "uri":"modelarts_24_0095.html", + "doc_type":"", + "p_code":"7", + "code":"16" + }, + { + "desc":"You can use Cloud Shell provided by the ModelArts console to log in to a running training container.You can use Cloud Shell to log in to a running training container usin", + "product_code":"modelarts", + "title":"Logging In to a Training Container Using Cloud Shell", + "uri":"modelarts_24_0096.html", + "doc_type":"usermanual", + "p_code":"7", + "code":"17" + }, + { + "desc":"This section describes how to control the ModelArts permissions of a user so that the user is not allowed to use a public resource pool to create training jobs, create no", + "product_code":"", + "title":"Prohibiting a User from Using a Public Resource Pool", + "uri":"modelarts_24_0097.html", + "doc_type":"", + "p_code":"7", + "code":"18" + }, + { + "desc":"ModelArts provides ExeML for service developers, freeing you from model development and parameter tuning. 
With ExeML, you can finish an AI development project in just thr", + "product_code":"", + "title":"Huawei Cloud Mascot Detection (Using ExeML for Object Detection)", + "uri":"modelarts_10_0001.html", + "doc_type":"", + "p_code":"", + "code":"19" + }, + { + "desc":"Banks often predict whether customers would be interested in a time deposit based on their characteristics, including the age, work type, marital status, education backgr", + "product_code":"", + "title":"Bank Deposit Prediction (Using ExeML for Predictive Analytics)", + "uri":"modelarts_10_0002.html", + "doc_type":"", + "p_code":"", + "code":"20" + }, + { + "desc":"This section describes how to modify a local custom algorithm to train and deploy models on ModelArts.This case describes how to use PyTorch 1.8 to recognize handwritten ", + "product_code":"", + "title":"Using a Custom Algorithm to Build a Handwritten Digit Recognition Model", + "uri":"modelarts_10_0080.html", + "doc_type":"", + "p_code":"", + "code":"21" + }, + { + "desc":"This section describes how to create an image and use it for training on ModelArts. The AI engine used in the image is horovod_0.22.1-pytorch_1.8.1, and the resources use", + "product_code":"modelarts", + "title":"Example: Creating a Custom Image for Training (Horovod-PyTorch and GPUs)", + "uri":"develop-modelarts-0143.html", + "doc_type":"usermanual", + "p_code":"", + "code":"22" + }, + { + "desc":"This section describes how to create an image and use it for training on ModelArts. The AI engine used in the image is MindSpore, and the resources used for training are ", + "product_code":"modelarts", + "title":"Example: Creating a Custom Image for Training (MindSpore and GPUs)", + "uri":"develop-modelarts-0144.html", + "doc_type":"usermanual", + "p_code":"", + "code":"23" + }, + { + "desc":"This section describes how to create an image and use it for training on ModelArts. 
The AI engine used in the image is TensorFlow, and the resources used for training are", + "product_code":"modelarts", + "title":"Example: Creating a Custom Image for Training (TensorFlow and GPUs)", + "uri":"develop-modelarts-0145.html", + "doc_type":"usermanual", + "p_code":"", + "code":"24" + }, + { + "desc":"If you want to use an AI engine that is not supported by ModelArts, create a custom image for the engine, import the image to ModelArts, and use the image to create AI ap", + "product_code":"modelarts", + "title":"Creating a Custom Image and Using It to Create an AI Application", + "uri":"modelarts_10_0072.html", + "doc_type":"usermanual", + "p_code":"", + "code":"25" + }, + { + "desc":"This section describes how to enable an inference service to access the Internet.An inference service accesses the Internet in the following scenarios:After an image is i", + "product_code":"", + "title":"Enabling an Inference Service to Access the Internet", + "uri":"modelarts_04_0203.html", + "doc_type":"", + "p_code":"", + "code":"26" + }, + { + "desc":"The end-to-end O&M of ModelArts inference services involves the entire AI process including algorithm development, service O&M, and service running.End-to-End O&M Process", + "product_code":"", + "title":"End-to-End O&M of Inference Services", + "uri":"modelarts_04_0204.html", + "doc_type":"", + "p_code":"", + "code":"27" + } +] \ No newline at end of file diff --git a/docs/modelarts/best-practice/PARAMETERS.txt b/docs/modelarts/best-practice/PARAMETERS.txt new file mode 100644 index 00000000..6da8d5f0 --- /dev/null +++ b/docs/modelarts/best-practice/PARAMETERS.txt @@ -0,0 +1,3 @@ +version="" +language="en-us" +type="" \ No newline at end of file diff --git a/docs/modelarts/best-practice/develop-modelarts-0143.html b/docs/modelarts/best-practice/develop-modelarts-0143.html new file mode 100644 index 00000000..ca646e2b --- /dev/null +++ b/docs/modelarts/best-practice/develop-modelarts-0143.html @@ -0,0 +1,496 @@ + + +
This section describes how to create an image and use it for training on ModelArts. The AI engine used in the image is horovod_0.22.1-pytorch_1.8.1, and the resources used for training are GPUs.
+This section applies only to training jobs of the new version.
+In this example, you write a Dockerfile on a Linux x86_64 server running Ubuntu 18.04 to create a custom image.
+Create a container image with the following configurations and use the image to create a CPU- or GPU-powered training job on ModelArts:
+Before using a custom image to create a training job, you need to be familiar with Docker and have development experience.
+ +Create a bucket and folders in OBS for storing the sample dataset and training code. Table 1 lists the folders to be created. Replace the bucket name and folder names in the example with actual names.
+For details about how to create an OBS bucket and folder, see Creating a Bucket and Creating a Folder.
+Ensure that the OBS directory you use and ModelArts are in the same region.
+ + +Obtain training scripts pytorch_synthetic_benchmark.py and run_mpi.sh and upload them to obs://test-modelarts/horovod/demo-code/ in the OBS bucket.
+pytorch_synthetic_benchmark.py is as follows:
+import argparse +import torch.backends.cudnn as cudnn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data.distributed +from torchvision import models +import horovod.torch as hvd +import timeit +import numpy as np + +# Benchmark settings +parser = argparse.ArgumentParser(description='PyTorch Synthetic Benchmark', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--fp16-allreduce', action='store_true', default=False, + help='use fp16 compression during allreduce') + +parser.add_argument('--model', type=str, default='resnet50', + help='model to benchmark') +parser.add_argument('--batch-size', type=int, default=32, + help='input batch size') + +parser.add_argument('--num-warmup-batches', type=int, default=10, + help='number of warm-up batches that don\'t count towards benchmark') +parser.add_argument('--num-batches-per-iter', type=int, default=10, + help='number of batches per benchmark iteration') +parser.add_argument('--num-iters', type=int, default=10, + help='number of benchmark iterations') + +parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') + +parser.add_argument('--use-adasum', action='store_true', default=False, + help='use adasum algorithm to do reduction') + +args = parser.parse_args() +args.cuda = not args.no_cuda and torch.cuda.is_available() + +hvd.init() + +if args.cuda: + # Horovod: pin GPU to local rank. + torch.cuda.set_device(hvd.local_rank()) + +cudnn.benchmark = True + +# Set up standard model. +model = getattr(models, args.model)() + +# By default, Adasum doesn't need scaling up learning rate. +lr_scaler = hvd.size() if not args.use_adasum else 1 + +if args.cuda: + # Move model to GPU. + model.cuda() + # If using GPU Adasum allreduce, scale learning rate by local_size. + if args.use_adasum and hvd.nccl_built(): + lr_scaler = hvd.local_size() + +optimizer = optim.SGD(model.parameters(), lr=0.01 * lr_scaler) + +# Horovod: (optional) compression algorithm. +compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none + +# Horovod: wrap optimizer with DistributedOptimizer. +optimizer = hvd.DistributedOptimizer(optimizer, + named_parameters=model.named_parameters(), + compression=compression, + op=hvd.Adasum if args.use_adasum else hvd.Average) + +# Horovod: broadcast parameters & optimizer state. 
+hvd.broadcast_parameters(model.state_dict(), root_rank=0) +hvd.broadcast_optimizer_state(optimizer, root_rank=0) + +# Set up fixed fake data +data = torch.randn(args.batch_size, 3, 224, 224) +target = torch.LongTensor(args.batch_size).random_() % 1000 +if args.cuda: + data, target = data.cuda(), target.cuda() + + +def benchmark_step(): + optimizer.zero_grad() + output = model(data) + loss = F.cross_entropy(output, target) + loss.backward() + optimizer.step() + + +def log(s, nl=True): + if hvd.rank() != 0: + return + print(s, end='\n' if nl else '') + + +log('Model: %s' % args.model) +log('Batch size: %d' % args.batch_size) +device = 'GPU' if args.cuda else 'CPU' +log('Number of %ss: %d' % (device, hvd.size())) + +# Warm-up +log('Running warmup...') +timeit.timeit(benchmark_step, number=args.num_warmup_batches) + +# Benchmark +log('Running benchmark...') +img_secs = [] +for x in range(args.num_iters): + time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter) + img_sec = args.batch_size * args.num_batches_per_iter / time + log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device)) + img_secs.append(img_sec) + +# Results +img_sec_mean = np.mean(img_secs) +img_sec_conf = 1.96 * np.std(img_secs) +log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) +log('Total img/sec on %d %s(s): %.1f +-%.1f' % + (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf))+
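+Before building the image, you can sanity-check the benchmark script locally. This is a minimal sketch, assuming PyTorch and Horovod are already installed on the local machine; --no-cuda keeps the run on CPU, and all options used are defined in the script above:
+# Single-process CPU smoke test of the benchmark script
+horovodrun -np 1 python pytorch_synthetic_benchmark.py --no-cuda --num-iters 1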
run_mpi.sh is as follows:
+#!/bin/bash +MY_HOME=/home/ma-user + +MY_SSHD_PORT=${MY_SSHD_PORT:-"36666"} + +MY_MPI_BTL_TCP_IF=${MY_MPI_BTL_TCP_IF:-"eth0,bond0"} + +MY_TASK_INDEX=${MA_TASK_INDEX:-${VC_TASK_INDEX:-${VK_TASK_INDEX}}} + +MY_MPI_SLOTS=${MY_MPI_SLOTS:-"${MA_NUM_GPUS}"} + +MY_MPI_TUNE_FILE="${MY_HOME}/env_for_user_process" + +if [ -z ${MY_MPI_SLOTS} ]; then + echo "[run_mpi] MY_MPI_SLOTS is empty, set it be 1" + MY_MPI_SLOTS="1" +fi + +printf "MY_HOME: ${MY_HOME}\nMY_SSHD_PORT: ${MY_SSHD_PORT}\nMY_MPI_BTL_TCP_IF: ${MY_MPI_BTL_TCP_IF}\nMY_TASK_INDEX: ${MY_TASK_INDEX}\nMY_MPI_SLOTS: ${MY_MPI_SLOTS}\n" + +env | grep -E '^MA_|^AWS_|^S3_|^PATH|^VC_WORKER_|^SCC|^CRED' | grep -v '=$' > ${MY_MPI_TUNE_FILE} +# add -x to each line +sed -i 's/^/-x /' ${MY_MPI_TUNE_FILE} + +sed -i "s|{{MY_SSHD_PORT}}|${MY_SSHD_PORT}|g" ${MY_HOME}/etc/ssh/sshd_config + +# start sshd service +bash -c "$(which sshd) -f ${MY_HOME}/etc/ssh/sshd_config" + +# confirm the sshd is up +netstat -anp | grep LIS | grep ${MY_SSHD_PORT} + +if [ $MY_TASK_INDEX -eq 0 ]; then + # generate the hostfile of mpi + for ((i=0; i<$MA_NUM_HOSTS; i++)) + do + eval hostname=${MA_VJ_NAME}-${MA_TASK_NAME}-${i}.${MA_VJ_NAME} + echo "[run_mpi] hostname: ${hostname}" + + ip="" + while [ -z "$ip" ]; do + ip=$(ping -c 1 ${hostname} | grep "PING" | sed -E 's/PING .* .([0-9.]+). .*/\1/g') + sleep 1 + done + echo "[run_mpi] resolved ip: ${ip}" + + # test the sshd is up + while : + do + if [ cat < /dev/null >/dev/tcp/${ip}/${MY_SSHD_PORT} ]; then + break + fi + sleep 1 + done + + echo "[run_mpi] the sshd of ip ${ip} is up" + + echo "${ip} slots=$MY_MPI_SLOTS" >> ${MY_HOME}/hostfile + done + + printf "[run_mpi] hostfile:\n`cat ${MY_HOME}/hostfile`\n" +fi + +RET_CODE=0 + +if [ $MY_TASK_INDEX -eq 0 ]; then + + echo "[run_mpi] start exec command time: "$(date +"%Y-%m-%d-%H:%M:%S") + + np=$(( ${MA_NUM_HOSTS} * ${MY_MPI_SLOTS} )) + + echo "[run_mpi] command: mpirun -np ${np} -hostfile ${MY_HOME}/hostfile -mca plm_rsh_args \"-p ${MY_SSHD_PORT}\" -tune ${MY_MPI_TUNE_FILE} ... $@" + + # execute mpirun at worker-0 + # mpirun + mpirun \ + -np ${np} \ + -hostfile ${MY_HOME}/hostfile \ + -mca plm_rsh_args "-p ${MY_SSHD_PORT}" \ + -tune ${MY_MPI_TUNE_FILE} \ + -bind-to none -map-by slot \ + -x NCCL_DEBUG=INFO -x NCCL_SOCKET_IFNAME=${MY_MPI_BTL_TCP_IF} -x NCCL_SOCKET_FAMILY=AF_INET \ + -x HOROVOD_MPI_THREADS_DISABLE=1 \ + -x LD_LIBRARY_PATH \ + -mca pml ob1 -mca btl ^openib -mca plm_rsh_no_tree_spawn true \ + "$@" + + RET_CODE=$? + + if [ $RET_CODE -ne 0 ]; then + echo "[run_mpi] exec command failed, exited with $RET_CODE" + else + echo "[run_mpi] exec command successfully, exited with $RET_CODE" + fi + + # stop 1...N worker by killing the sleep proc + sed -i '1d' ${MY_HOME}/hostfile + if [ `cat ${MY_HOME}/hostfile | wc -l` -ne 0 ]; then + echo "[run_mpi] stop 1 to (N - 1) worker by killing the sleep proc" + + sed -i 's/${MY_MPI_SLOTS}/1/g' ${MY_HOME}/hostfile + printf "[run_mpi] hostfile:\n`cat ${MY_HOME}/hostfile`\n" + + mpirun \ + --hostfile ${MY_HOME}/hostfile \ + --mca btl_tcp_if_include ${MY_MPI_BTL_TCP_IF} \ + --mca plm_rsh_args "-p ${MY_SSHD_PORT}" \ + -x PATH -x LD_LIBRARY_PATH \ + pkill sleep \ + > /dev/null 2>&1 + fi + + echo "[run_mpi] exit time: "$(date +"%Y-%m-%d-%H:%M:%S") +else + echo "[run_mpi] the training log is in worker-0" + sleep 365d + echo "[run_mpi] exit time: "$(date +"%Y-%m-%d-%H:%M:%S") +fi + +exit $RET_CODE+
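+run_mpi.sh starts sshd on every node, has worker-0 build an MPI hostfile, and then hands everything after the script name to mpirun via "$@". The boot command of the training job therefore wraps the real training command. A sketch, assuming the code directory obs://test-modelarts/horovod/demo-code/ is downloaded to ${MA_JOB_DIR}/demo-code on each training node:
+# Hypothetical boot command for the training job
+bash ${MA_JOB_DIR}/demo-code/run_mpi.sh python ${MA_JOB_DIR}/demo-code/pytorch_synthetic_benchmark.py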
Obtain a Linux x86_64 server running Ubuntu 18.04. Either an ECS or your local PC will do.
+Create a container image with the following configurations and use the image to create a training job on ModelArts:
+This section describes how to write a Dockerfile to create a custom image.
+The following uses Linux x86_64 as an example to describe how to obtain a Docker installation package. For details about how to install Docker, see the official Docker documentation. Run the following command to install Docker:
+curl -fsSL get.docker.com -o get-docker.sh +sh get-docker.sh+
If the docker images command runs successfully, Docker is already installed. In that case, skip this step.
+docker version | grep -A 1 Engine+
Engine: + Version: 18.09.0+
Use Docker Engine 18.09.0 or later to create a custom image.
+mkdir -p context+
[global] +index-url = https://repo.huaweicloud.com/repository/pypi/simple +trusted-host = repo.huaweicloud.com +timeout = 120+
To obtain pip.conf, go to Huawei Mirrors (https://mirrors.huaweicloud.com/home) and search for pypi.
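+To confirm that the configuration is picked up (inside the image, or on any machine where this pip.conf is in place), a quick sketch:
+pip config list
+# global.index-url='https://repo.huaweicloud.com/repository/pypi/simple'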
+Download horovod-0.22.1.tar.gz from https://pypi.org/project/horovod/0.22.1/#files.
+Download the following .whl files from https://download.pytorch.org/whl/torch_stable.html.
+The URL-encoded form of the plus sign (+) is %2B. When searching for files on the preceding websites, replace the plus sign (+) in the file name with %2B, for example, torch-1.8.1%2Bcu111-cp37-cp37m-linux_x86_64.whl.
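+To generate the encoded file name instead of editing it by hand, a one-line sketch (assuming a local python3 is available):
+python3 -c 'from urllib.parse import quote; print(quote("torch-1.8.1+cu111-cp37-cp37m-linux_x86_64.whl"))'
+# torch-1.8.1%2Bcu111-cp37-cp37m-linux_x86_64.whl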
+Download Miniconda3-py37_4.12.0-Linux-x86_64.sh from https://repo.anaconda.com/miniconda/Miniconda3-py37_4.12.0-Linux-x86_64.sh.
+# The server on which the container image is created must access the Internet. + +# Base container image at https://github.com/NVIDIA/nvidia-docker/wiki/CUDA +# +# https://docs.docker.com/develop/develop-images/multistage-build/#use-multi-stage-builds +# require Docker Engine >= 17.05 +# +# builder stage +FROM nvidia/cuda:11.1.1-devel-ubuntu18.04 AS builder + +# Install CMake obtained from Huawei Mirrors. +RUN cp -a /etc/apt/sources.list /etc/apt/sources.list.bak && \ + sed -i "s@http://.*archive.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list && \ + sed -i "s@http://.*security.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list && \ + echo > /etc/apt/apt.conf.d/00skip-verify-peer.conf "Acquire { https::Verify-Peer false }" && \ + apt-get update && \ + apt-get install -y build-essential cmake g++-7 && \ + apt-get clean && \ + mv /etc/apt/sources.list.bak /etc/apt/sources.list && \ + rm /etc/apt/apt.conf.d/00skip-verify-peer.conf + +# The default user of the base container image is root. +# USER root + +# Use the PyPI configuration obtained from Huawei Mirrors. +RUN mkdir -p /root/.pip/ +COPY pip.conf /root/.pip/pip.conf + +# Copy the installation files to the /tmp directory in the base container image. +COPY Miniconda3-py37_4.12.0-Linux-x86_64.sh /tmp +COPY torch-1.8.1+cu111-cp37-cp37m-linux_x86_64.whl /tmp +COPY torchvision-0.9.1+cu111-cp37-cp37m-linux_x86_64.whl /tmp +COPY torchaudio-0.8.1-cp37-cp37m-linux_x86_64.whl /tmp +COPY openmpi-3.0.0-bin.tar.gz /tmp +COPY horovod-0.22.1.tar.gz /tmp + +# https://conda.io/projects/conda/en/latest/user-guide/install/linux.html#installing-on-linux +# Install Miniconda3 in the /home/ma-user/miniconda3 directory of the base container image. +RUN bash /tmp/Miniconda3-py37_4.12.0-Linux-x86_64.sh -b -p /home/ma-user/miniconda3 + +# Install the Open MPI 3.0.0 file obtained from Horovod v0.22.1. +# https://github.com/horovod/horovod/blob/v0.22.1/docker/horovod/Dockerfile +# https://github.com/horovod/horovod/files/1596799/openmpi-3.0.0-bin.tar.gz +RUN cd /usr/local && \ + tar -zxf /tmp/openmpi-3.0.0-bin.tar.gz && \ + ldconfig && \ + mpirun --version + +# Environment variables required for building Horovod with PyTorch +ENV HOROVOD_NCCL_INCLUDE=/usr/include \ + HOROVOD_NCCL_LIB=/usr/lib/x86_64-linux-gnu \ + HOROVOD_MPICXX_SHOW="/usr/local/openmpi/bin/mpicxx -show" \ + HOROVOD_GPU_OPERATIONS=NCCL \ + HOROVOD_WITH_PYTORCH=1 + +# Install the .whl files using default Miniconda3 Python environment /home/ma-user/miniconda3/bin/pip. +RUN cd /tmp && \ + /home/ma-user/miniconda3/bin/pip install --no-cache-dir \ + /tmp/torch-1.8.1+cu111-cp37-cp37m-linux_x86_64.whl \ + /tmp/torchvision-0.9.1+cu111-cp37-cp37m-linux_x86_64.whl \ + /tmp/torchaudio-0.8.1-cp37-cp37m-linux_x86_64.whl + +# Build and install horovod-0.22.1.tar.gz using default Miniconda3 Python environment /home/ma-user/miniconda3/bin/pip. +RUN cd /tmp && \ + /home/ma-user/miniconda3/bin/pip install --no-cache-dir \ + /tmp/horovod-0.22.1.tar.gz + +# Create the container image. +FROM nvidia/cuda:11.1.1-runtime-ubuntu18.04 + +COPY MLNX_OFED_LINUX-5.4-3.5.8.0-ubuntu18.04-x86_64.tgz /tmp + +# Install the vim, cURL, net-tools, MLNX_OFED, and SSH tools obtained from Huawei Mirrors. 
+RUN cp -a /etc/apt/sources.list /etc/apt/sources.list.bak && \ + sed -i "s@http://.*archive.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list && \ + sed -i "s@http://.*security.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list && \ + echo > /etc/apt/apt.conf.d/00skip-verify-peer.conf "Acquire { https::Verify-Peer false }" && \ + apt-get update && \ + apt-get install -y vim curl net-tools iputils-ping libfile-find-rule-perl-perl \ + openssh-client openssh-server && \ + ssh -V && \ + mkdir -p /run/sshd && \ + # mlnx ofed + apt-get install -y python libfuse2 dpatch libnl-3-dev autoconf libnl-route-3-dev pciutils libnuma1 libpci3 m4 libelf1 debhelper automake graphviz bison lsof kmod libusb-1.0-0 swig libmnl0 autotools-dev flex chrpath libltdl-dev && \ + cd /tmp && \ + tar -xvf MLNX_OFED_LINUX-5.4-3.5.8.0-ubuntu18.04-x86_64.tgz && \ + MLNX_OFED_LINUX-5.4-3.5.8.0-ubuntu18.04-x86_64/mlnxofedinstall --user-space-only --basic --without-fw-update -q && \ + cd - && \ + rm -rf /tmp/* && \ + apt-get clean && \ + mv /etc/apt/sources.list.bak /etc/apt/sources.list && \ + rm /etc/apt/apt.conf.d/00skip-verify-peer.conf + +# Install the Open MPI 3.0.0 file obtained from Horovod v0.22.1. +# https://github.com/horovod/horovod/blob/v0.22.1/docker/horovod/Dockerfile +# https://github.com/horovod/horovod/files/1596799/openmpi-3.0.0-bin.tar.gz +COPY openmpi-3.0.0-bin.tar.gz /tmp +RUN cd /usr/local && \ + tar -zxf /tmp/openmpi-3.0.0-bin.tar.gz && \ + ldconfig && \ + mpirun --version + +# Add user ma-user (UID = 1000, GID = 100). +# A user group whose GID is 100 exists in the basic container image. User ma-user can directly run the following command: +RUN useradd -m -d /home/ma-user -s /bin/bash -g 100 -u 1000 ma-user + +# Copy the /home/ma-user/miniconda3 directory from the builder stage to the directory with the same name in the current container image. +COPY --chown=ma-user:100 --from=builder /home/ma-user/miniconda3 /home/ma-user/miniconda3 + +# Configure the default user and working directory of the container image. +USER ma-user +WORKDIR /home/ma-user + +# Configure sshd to support SSH password-free login. +RUN MA_HOME=/home/ma-user && \ + # setup sshd dir + mkdir -p ${MA_HOME}/etc && \ + ssh-keygen -f ${MA_HOME}/etc/ssh_host_rsa_key -N '' -t rsa && \ + mkdir -p ${MA_HOME}/etc/ssh ${MA_HOME}/var/run && \ + # setup sshd config (listen at {{MY_SSHD_PORT}} port) + echo "Port {{MY_SSHD_PORT}}\n\ +HostKey ${MA_HOME}/etc/ssh_host_rsa_key\n\ +AuthorizedKeysFile ${MA_HOME}/.ssh/authorized_keys\n\ +PidFile ${MA_HOME}/var/run/sshd.pid\n\ +StrictModes no\n\ +UsePAM no" > ${MA_HOME}/etc/ssh/sshd_config && \ + # generate ssh key + ssh-keygen -t rsa -f ${MA_HOME}/.ssh/id_rsa -P '' && \ + cat ${MA_HOME}/.ssh/id_rsa.pub >> ${MA_HOME}/.ssh/authorized_keys && \ + # disable ssh host key checking for all hosts + echo "Host *\n\ + StrictHostKeyChecking no" > ${MA_HOME}/.ssh/config + +# Configure the preset environment variables of the container image. +# Set PYTHONUNBUFFERED to 1 to prevent log loss. +ENV PATH=/home/ma-user/miniconda3/bin:$PATH \ + PYTHONUNBUFFERED=1+
For details about how to write a Dockerfile, see the official Docker documentation.
+Go to https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/, click Download, set Version to 5.4-3.5.8.0-LTS, OS Distribution Version to Ubuntu 18.04, and Architecture to x86_64, and download MLNX_OFED_LINUX-5.4-3.5.8.0-ubuntu18.04-x86_64.tgz.
+Download openmpi-3.0.0-bin.tar.gz from https://github.com/horovod/horovod/files/1596799/openmpi-3.0.0-bin.tar.gz.
+context +├── Dockerfile +├── MLNX_OFED_LINUX-5.4-3.5.8.0-ubuntu18.04-x86_64.tgz +├── Miniconda3-py37_4.12.0-Linux-x86_64.sh +├── horovod-0.22.1.tar.gz +├── openmpi-3.0.0-bin.tar.gz +├── pip.conf +├── torch-1.8.1+cu111-cp37-cp37m-linux_x86_64.whl +├── torchaudio-0.8.1-cp37-cp37m-linux_x86_64.whl +└── torchvision-0.9.1+cu111-cp37-cp37m-linux_x86_64.whl+
docker build . -t horovod-pytorch:0.22.1-1.8.1-ofed-cuda11.1
Successfully tagged horovod-pytorch:0.22.1-1.8.1-ofed-cuda11.1+
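+Optionally, you can verify the image locally before pushing it. A minimal sketch (the expected output is an assumption based on the installed wheel):
+docker run --rm horovod-pytorch:0.22.1-1.8.1-ofed-cuda11.1 python -c "import torch; print(torch.__version__)"
+# 1.8.1+cu111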
# Replace the region ID, domain, and organization name (deep-learning) with the actual values. +sudo docker tag horovod-pytorch:0.22.1-1.8.1-ofed-cuda11.1 swr.{region-id}.{domain}/deep-learning/horovod-pytorch:0.22.1-1.8.1-ofed-cuda11.1+
# Replace the region ID, domain, and organization name (deep-learning) with the actual values. +sudo docker push swr.{region-id}.{domain}/deep-learning/horovod-pytorch:0.22.1-1.8.1-ofed-cuda11.1+
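+docker push succeeds only after you log in to SWR. Obtain the actual login command from the SWR console (Generate Login Command); the sketch below only illustrates its shape, and every placeholder is hypothetical:
+# Hypothetical login command - copy the real one from the SWR console
+sudo docker login -u {region-id}@{access-key} -p {login-key} swr.{region-id}.{domain}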
After you submit the job creation request, the system automatically performs backend operations such as downloading the container image and code directory and running the boot command. A training job takes a while to run, from tens of minutes to several hours, depending on the service logic and the selected resources. After the training job is executed, logs similar to the following are output.
+This section describes how to create an image and use it for training on ModelArts. The AI engine used in the image is MindSpore, and the resources used for training are GPUs.
+This section applies only to training jobs of the new version.
+In this example, you write a Dockerfile on a Linux x86_64 server running Ubuntu 18.04 to create a custom image.
+Create a container image with the following configurations and use the image to create a GPU-powered training job on ModelArts:
+Before using a custom image to create a training job, you need to be familiar with Docker and have development experience.
+ +Create a bucket and folders in OBS for storing the sample dataset and training code. Table 1 lists the folders to be created. Replace the bucket name and folder names in the example with actual names.
+For details, see Creating a Bucket and Creating a Folder.
+Ensure that the OBS bucket and ModelArts are in the same region.
+Folder | Description
+---|---
+obs://test-modelarts/mindspore-gpu/resnet/ | Stores the training script.
+obs://test-modelarts/mindspore-gpu/cifar-10-batches-bin/ | Stores dataset files.
+obs://test-modelarts/mindspore-gpu/output/ | Stores training output files.
+obs://test-modelarts/mindspore-gpu/log/ | Stores training log files.
Go to http://www.cs.toronto.edu/~kriz/cifar.html, download the CIFAR-10 binary version (suitable for C programs), decompress it, and upload the decompressed data to obs://test-modelarts/mindspore-gpu/cifar-10-batches-bin/ in the OBS bucket.
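+A minimal sketch of fetching and unpacking the dataset locally (the direct link to the binary version on that page is cifar-10-binary.tar.gz):
+curl -O http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
+tar -xzf cifar-10-binary.tar.gz   # unpacks into cifar-10-batches-bin/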
+Obtain the ResNet file and script run_mpi.sh and upload them to obs://test-modelarts/mindspore-gpu/resnet/ in the OBS bucket.
+Download the ResNet file from https://gitee.com/mindspore/models/tree/r1.8/official/cv/resnet.
+run_mpi.sh is as follows:
+#!/bin/bash +MY_HOME=/home/ma-user + +MY_SSHD_PORT=${MY_SSHD_PORT:-"36666"} + +MY_MPI_BTL_TCP_IF=${MY_MPI_BTL_TCP_IF:-"eth0,bond0"} + +MY_TASK_INDEX=${MA_TASK_INDEX:-${VC_TASK_INDEX:-${VK_TASK_INDEX}}} + +MY_MPI_SLOTS=${MY_MPI_SLOTS:-"${MA_NUM_GPUS}"} + +MY_MPI_TUNE_FILE="${MY_HOME}/env_for_user_process" + +if [ -z ${MY_MPI_SLOTS} ]; then + echo "[run_mpi] MY_MPI_SLOTS is empty, set it be 1" + MY_MPI_SLOTS="1" +fi + +printf "MY_HOME: ${MY_HOME}\nMY_SSHD_PORT: ${MY_SSHD_PORT}\nMY_MPI_BTL_TCP_IF: ${MY_MPI_BTL_TCP_IF}\nMY_TASK_INDEX: ${MY_TASK_INDEX}\nMY_MPI_SLOTS: ${MY_MPI_SLOTS}\n" + +env | grep -E '^MA_|^AWS_|^S3_|^PATH|^VC_WORKER_|^SCC|^CRED' | grep -v '=$' > ${MY_MPI_TUNE_FILE} +# add -x to each line +sed -i 's/^/-x /' ${MY_MPI_TUNE_FILE} + +sed -i "s|{{MY_SSHD_PORT}}|${MY_SSHD_PORT}|g" ${MY_HOME}/etc/ssh/sshd_config + +# start sshd service +bash -c "$(which sshd) -f ${MY_HOME}/etc/ssh/sshd_config" + +# confirm the sshd is up +netstat -anp | grep LIS | grep ${MY_SSHD_PORT} + +if [ $MY_TASK_INDEX -eq 0 ]; then + # generate the hostfile of mpi + for ((i=0; i<$MA_NUM_HOSTS; i++)) + do + eval hostname=${MA_VJ_NAME}-${MA_TASK_NAME}-${i}.${MA_VJ_NAME} + echo "[run_mpi] hostname: ${hostname}" + + ip="" + while [ -z "$ip" ]; do + ip=$(ping -c 1 ${hostname} | grep "PING" | sed -E 's/PING .* .([0-9.]+). .*/\1/g') + sleep 1 + done + echo "[run_mpi] resolved ip: ${ip}" + + # test the sshd is up + while : + do + if [ cat < /dev/null >/dev/tcp/${ip}/${MY_SSHD_PORT} ]; then + break + fi + sleep 1 + done + + echo "[run_mpi] the sshd of ip ${ip} is up" + + echo "${ip} slots=$MY_MPI_SLOTS" >> ${MY_HOME}/hostfile + done + + printf "[run_mpi] hostfile:\n`cat ${MY_HOME}/hostfile`\n" +fi + +RET_CODE=0 + +if [ $MY_TASK_INDEX -eq 0 ]; then + + echo "[run_mpi] start exec command time: "$(date +"%Y-%m-%d-%H:%M:%S") + + np=$(( ${MA_NUM_HOSTS} * ${MY_MPI_SLOTS} )) + + echo "[run_mpi] command: mpirun -np ${np} -hostfile ${MY_HOME}/hostfile -mca plm_rsh_args \"-p ${MY_SSHD_PORT}\" -tune ${MY_MPI_TUNE_FILE} ... $@" + + # execute mpirun at worker-0 + # mpirun + mpirun \ + -np ${np} \ + -hostfile ${MY_HOME}/hostfile \ + -mca plm_rsh_args "-p ${MY_SSHD_PORT}" \ + -tune ${MY_MPI_TUNE_FILE} \ + -bind-to none -map-by slot \ + -x NCCL_DEBUG=INFO -x NCCL_SOCKET_IFNAME=${MY_MPI_BTL_TCP_IF} -x NCCL_SOCKET_FAMILY=AF_INET \ + -x HOROVOD_MPI_THREADS_DISABLE=1 \ + -x LD_LIBRARY_PATH \ + -mca pml ob1 -mca btl ^openib -mca plm_rsh_no_tree_spawn true \ + "$@" + + RET_CODE=$? + + if [ $RET_CODE -ne 0 ]; then + echo "[run_mpi] exec command failed, exited with $RET_CODE" + else + echo "[run_mpi] exec command successfully, exited with $RET_CODE" + fi + + # stop 1...N worker by killing the sleep proc + sed -i '1d' ${MY_HOME}/hostfile + if [ `cat ${MY_HOME}/hostfile | wc -l` -ne 0 ]; then + echo "[run_mpi] stop 1 to (N - 1) worker by killing the sleep proc" + + sed -i 's/${MY_MPI_SLOTS}/1/g' ${MY_HOME}/hostfile + printf "[run_mpi] hostfile:\n`cat ${MY_HOME}/hostfile`\n" + + mpirun \ + --hostfile ${MY_HOME}/hostfile \ + --mca btl_tcp_if_include ${MY_MPI_BTL_TCP_IF} \ + --mca plm_rsh_args "-p ${MY_SSHD_PORT}" \ + -x PATH -x LD_LIBRARY_PATH \ + pkill sleep \ + > /dev/null 2>&1 + fi + + echo "[run_mpi] exit time: "$(date +"%Y-%m-%d-%H:%M:%S") +else + echo "[run_mpi] the training log is in worker-0" + sleep 365d + echo "[run_mpi] exit time: "$(date +"%Y-%m-%d-%H:%M:%S") +fi + +exit $RET_CODE+
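+Like the Horovod example, run_mpi.sh simply wraps the real training command passed to it as arguments. A hypothetical boot command follows; the path layout and the train.py arguments are illustrative assumptions, so use the options the ResNet scripts actually require:
+# Hypothetical boot command - adjust paths and arguments to your job
+bash ${MA_JOB_DIR}/resnet/run_mpi.sh python ${MA_JOB_DIR}/resnet/train.py \
+    --device_target=GPU --data_path=${MA_JOB_DIR}/cifar-10-batches-bin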
After the upload, obs://test-modelarts/mindspore-gpu/resnet/ contains the ResNet files and run_mpi.sh.
+Obtain a Linux x86_64 server running Ubuntu 18.04. Either an ECS or your local PC will do.
+Create a container image with the following configurations and use the image to create a training job on ModelArts:
+This section describes how to write a Dockerfile to create a custom image.
+The following uses Linux x86_64 as an example to describe how to obtain a Docker installation package. For details about how to install Docker, see the official Docker documentation. Run the following command to install Docker:
+curl -fsSL get.docker.com -o get-docker.sh +sh get-docker.sh+
If the docker images command runs successfully, Docker is already installed. In that case, skip this step.
+docker version | grep -A 1 Engine+
Engine: + Version: 18.09.0+
Use Docker Engine 18.09.0 or later to create a custom image.
+mkdir -p context+
[global] +index-url = https://repo.huaweicloud.com/repository/pypi/simple +trusted-host = repo.huaweicloud.com +timeout = 120+
To obtain pip.conf, go to Huawei Mirrors (https://mirrors.huaweicloud.com/home) and search for pypi.
+Download Miniconda3-py37_4.12.0-Linux-x86_64.sh from https://repo.anaconda.com/miniconda/Miniconda3-py37_4.12.0-Linux-x86_64.sh.
+# The server on which the container image is created must access the Internet. + +# Base container image at https://github.com/NVIDIA/nvidia-docker/wiki/CUDA +# +# https://docs.docker.com/develop/develop-images/multistage-build/#use-multi-stage-builds +# require Docker Engine >= 17.05 +# +# builder stage +FROM nvidia/cuda:11.1.1-devel-ubuntu18.04 AS builder + +# The default user of the base container image is root. +# USER root + +# Use the PyPI configuration obtained from Huawei Mirrors. +RUN mkdir -p /root/.pip/ +COPY pip.conf /root/.pip/pip.conf + +# Copy the installation files to the /tmp directory in the base container image. +COPY Miniconda3-py37_4.12.0-Linux-x86_64.sh /tmp +COPY mindspore_gpu-1.8.1-cp37-cp37m-linux_x86_64.whl /tmp + +# https://conda.io/projects/conda/en/latest/user-guide/install/linux.html#installing-on-linux +# Install Miniconda3 in the /home/ma-user/miniconda3 directory of the base container image. +RUN bash /tmp/Miniconda3-py37_4.12.0-Linux-x86_64.sh -b -p /home/ma-user/miniconda3 + +# Install the whl file using default Miniconda3 Python environment /home/ma-user/miniconda3/bin/pip. +RUN cd /tmp && \ + /home/ma-user/miniconda3/bin/pip install --no-cache-dir \ + /tmp/mindspore_gpu-1.8.1-cp37-cp37m-linux_x86_64.whl \ + easydict PyYAML + +# Create the container image. +FROM nvidia/cuda:11.1.1-cudnn8-runtime-ubuntu18.04 + +COPY MLNX_OFED_LINUX-5.4-3.5.8.0-ubuntu18.04-x86_64.tgz /tmp + +# Install the vim, cURL, net-tools, MLNX_OFED, and SSH tools obtained from Huawei Mirrors. +RUN cp -a /etc/apt/sources.list /etc/apt/sources.list.bak && \ + sed -i "s@http://.*archive.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list && \ + sed -i "s@http://.*security.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list && \ + echo > /etc/apt/apt.conf.d/00skip-verify-peer.conf "Acquire { https::Verify-Peer false }" && \ + apt-get update && \ + apt-get install -y vim curl net-tools iputils-ping libfile-find-rule-perl-perl \ + openssh-client openssh-server && \ + ssh -V && \ + mkdir -p /run/sshd && \ + # mlnx ofed + apt-get install -y python libfuse2 dpatch libnl-3-dev autoconf libnl-route-3-dev pciutils libnuma1 libpci3 m4 libelf1 debhelper automake graphviz bison lsof kmod libusb-1.0-0 swig libmnl0 autotools-dev flex chrpath libltdl-dev && \ + cd /tmp && \ + tar -xvf MLNX_OFED_LINUX-5.4-3.5.8.0-ubuntu18.04-x86_64.tgz && \ + MLNX_OFED_LINUX-5.4-3.5.8.0-ubuntu18.04-x86_64/mlnxofedinstall --user-space-only --basic --without-fw-update -q && \ + cd - && \ + rm -rf /tmp/* && \ + apt-get clean && \ + mv /etc/apt/sources.list.bak /etc/apt/sources.list && \ + rm /etc/apt/apt.conf.d/00skip-verify-peer.conf + +# Install the Open MPI 3.0.0 file obtained from Horovod v0.22.1. +# https://github.com/horovod/horovod/blob/v0.22.1/docker/horovod/Dockerfile +# https://github.com/horovod/horovod/files/1596799/openmpi-3.0.0-bin.tar.gz +COPY openmpi-3.0.0-bin.tar.gz /tmp +RUN cd /usr/local && \ + tar -zxf /tmp/openmpi-3.0.0-bin.tar.gz && \ + ldconfig && \ + mpirun --version + +# Add user ma-user (UID = 1000, GID = 100). +# A user group whose GID is 100 exists in the basic container image. User ma-user can directly run the following command: +RUN useradd -m -d /home/ma-user -s /bin/bash -g 100 -u 1000 ma-user + +# Copy the /home/ma-user/miniconda3 directory from the builder stage to the directory with the same name in the current container image. 
+COPY --chown=ma-user:100 --from=builder /home/ma-user/miniconda3 /home/ma-user/miniconda3 + +# Configure the default user and working directory of the container image. +USER ma-user +WORKDIR /home/ma-user + +# Configure sshd to support SSH password-free login. +RUN MA_HOME=/home/ma-user && \ + # setup sshd dir + mkdir -p ${MA_HOME}/etc && \ + ssh-keygen -f ${MA_HOME}/etc/ssh_host_rsa_key -N '' -t rsa && \ + mkdir -p ${MA_HOME}/etc/ssh ${MA_HOME}/var/run && \ + # setup sshd config (listen at {{MY_SSHD_PORT}} port) + echo "Port {{MY_SSHD_PORT}}\n\ +HostKey ${MA_HOME}/etc/ssh_host_rsa_key\n\ +AuthorizedKeysFile ${MA_HOME}/.ssh/authorized_keys\n\ +PidFile ${MA_HOME}/var/run/sshd.pid\n\ +StrictModes no\n\ +UsePAM no" > ${MA_HOME}/etc/ssh/sshd_config && \ + # generate ssh key + ssh-keygen -t rsa -f ${MA_HOME}/.ssh/id_rsa -P '' && \ + cat ${MA_HOME}/.ssh/id_rsa.pub >> ${MA_HOME}/.ssh/authorized_keys && \ + # disable ssh host key checking for all hosts + echo "Host *\n\ + StrictHostKeyChecking no" > ${MA_HOME}/.ssh/config + +# Configure the preset environment variables of the container image. +# Set PYTHONUNBUFFERED to 1 to prevent log loss. +ENV PATH=/home/ma-user/miniconda3/bin:$PATH \ + LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH \ + PYTHONUNBUFFERED=1+
For details about how to write a Dockerfile, see the official Docker documentation.
+Go to https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/, click Download, set Version to 5.4-3.5.8.0-LTS, OS Distribution Version to Ubuntu 18.04, and Architecture to x86_64, and download MLNX_OFED_LINUX-5.4-3.5.8.0-ubuntu18.04-x86_64.tgz.
+Download openmpi-3.0.0-bin.tar.gz from https://github.com/horovod/horovod/files/1596799/openmpi-3.0.0-bin.tar.gz.
+context +├── Dockerfile +├── MLNX_OFED_LINUX-5.4-3.5.8.0-ubuntu18.04-x86_64.tgz +├── Miniconda3-py37_4.12.0-Linux-x86_64.sh +├── mindspore_gpu-1.8.1-cp37-cp37m-linux_x86_64.whl +├── openmpi-3.0.0-bin.tar.gz +└── pip.conf+
docker build . -t mindspore:1.8.1-ofed-cuda11.1
Successfully tagged mindspore:1.8.1-ofed-cuda11.1
# Replace {region-id}, {domain}, and the organization name deep-learning with the actual values.
sudo docker tag mindspore:1.8.1-ofed-cuda11.1 swr.{region-id}.{domain}/deep-learning/mindspore:1.8.1-ofed-cuda11.1
# Replace {region-id}, {domain}, and the organization name deep-learning with the actual values.
sudo docker push swr.{region-id}.{domain}/deep-learning/mindspore:1.8.1-ofed-cuda11.1
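Note that docker push succeeds only after the Docker client has logged in to SWR. A minimal sketch is shown below; the actual login command, including a temporary login key, should be generated on the SWR console (Generate Login Command), and all values here are placeholders:

# Placeholder values; obtain the real login command from the SWR console.
sudo docker login -u {region-id}@{access-key} -p {login-key} swr.{region-id}.{domain}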
After you submit the job creation request, the system automatically performs backend operations, such as downloading the container image and code directory and running the boot command. A training job takes a certain period of time to run, from tens of minutes to several hours, depending on the service logic and selected resources. After the training job is executed, logs similar to the following are output.
+This section describes how to create an image and use it for training on ModelArts. The AI engine used in the image is TensorFlow, and the resources used for training are GPUs.
+This section applies only to training jobs of the new version.
+In this example, write a Dockerfile to create a custom image on a Linux x86_64 server running Ubuntu 18.04.
+Create a container image with the following configurations and use the image to create a GPU-powered training job on ModelArts:
+Before using a custom image to create a training job, you need to be familiar with Docker and have development experience.
+ +Create a bucket and folders in OBS for storing the sample dataset and training code. Table 1 lists the folders to be created. Replace the bucket name and folder names in the example with actual names.
+For details, see Creating a Bucket and Creating a Folder.
Ensure that OBS and ModelArts are in the same region.
+ + +Download mnist.npz from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz, and upload it to obs://test-modelarts/tensorflow/data/ in the OBS bucket.
+Obtain the training script mnist.py and upload it to obs://test-modelarts/tensorflow/code/ in the OBS bucket.
+mnist.py is as follows:
import argparse
import tensorflow as tf

parser = argparse.ArgumentParser(description='TensorFlow quick start')
parser.add_argument('--data_url', type=str, default="./Data", help='path where the dataset is saved')
args = parser.parse_args()

mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data(args.data_url)
x_train, x_test = x_train / 255.0, x_test / 255.0

model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10)
])

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer='adam',
              loss=loss_fn,
              metrics=['accuracy'])

model.fit(x_train, y_train, epochs=5)
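To sanity-check the script before uploading it (assuming TensorFlow 2.x is installed locally and mnist.npz has been downloaded), you can run it the same way the training boot command will later, for example:

# Hypothetical absolute path; on ModelArts, --data_url is populated from the OBS input you configure.
python mnist.py --data_url /absolute/path/to/mnist.npz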
Obtain a Linux x86_64 server running Ubuntu 18.04. Either an ECS or your local PC will do.
+Create a container image with the following configurations and use the image to create a training job on ModelArts:
+This section describes how to write a Dockerfile to create a custom image.
The following uses Linux x86_64 as an example to describe how to obtain the Docker installation package. For more details about how to install Docker, see the official Docker documentation. Run the following commands to install Docker:
curl -fsSL get.docker.com -o get-docker.sh
sh get-docker.sh
If the docker images command can be executed, Docker has been installed. In this case, skip this step.
docker version | grep -A 1 Engine
Engine:
  Version: 18.09.0
Use the Docker engine of the preceding version or later to create a custom image.
mkdir -p context
[global]
index-url = https://repo.huaweicloud.com/repository/pypi/simple
trusted-host = repo.huaweicloud.com
timeout = 120
To obtain pip.conf, go to Huawei Mirrors at https://mirrors.huaweicloud.com/home and search for pypi.
+Download tensorflow_gpu-2.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl from https://pypi.org/project/tensorflow-gpu/2.10.0/#files.
+Download Miniconda3-py37_4.12.0-Linux-x86_64.sh from https://repo.anaconda.com/miniconda/Miniconda3-py37_4.12.0-Linux-x86_64.sh.
+# The server on which the container image is created must access the Internet. + +# Base container image at https://github.com/NVIDIA/nvidia-docker/wiki/CUDA +# +# https://docs.docker.com/develop/develop-images/multistage-build/#use-multi-stage-builds +# require Docker Engine >= 17.05 +# +# builder stage +FROM nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu18.04 AS builder + +# The default user of the base container image is root. +# USER root + +# Use the PyPI configuration obtained from Huawei Mirrors. +RUN mkdir -p /root/.pip/ +COPY pip.conf /root/.pip/pip.conf + +# Copy the installation files to the /tmp directory in the base container image. +COPY Miniconda3-py37_4.12.0-Linux-x86_64.sh /tmp +COPY tensorflow_gpu-2.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl /tmp + +# https://conda.io/projects/conda/en/latest/user-guide/install/linux.html#installing-on-linux +# Install Miniconda3 in the /home/ma-user/miniconda3 directory of the base container image. +RUN bash /tmp/Miniconda3-py37_4.12.0-Linux-x86_64.sh -b -p /home/ma-user/miniconda3 + +# Install the whl file using default Miniconda3 Python environment /home/ma-user/miniconda3/bin/pip. +RUN cd /tmp && \ + /home/ma-user/miniconda3/bin/pip install --no-cache-dir \ + /tmp/tensorflow_gpu-2.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + +RUN cd /tmp && \ + /home/ma-user/miniconda3/bin/pip install --no-cache-dir keras==2.10.0 + +# Create the container image. +FROM nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu18.04 + +COPY MLNX_OFED_LINUX-5.4-3.5.8.0-ubuntu18.04-x86_64.tgz /tmp + +# Install the vim, cURL, net-tools, and MLNX_OFED tools obtained from Huawei Mirrors. +RUN cp -a /etc/apt/sources.list /etc/apt/sources.list.bak && \ + sed -i "s@http://.*archive.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list && \ + sed -i "s@http://.*security.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list && \ + echo > /etc/apt/apt.conf.d/00skip-verify-peer.conf "Acquire { https::Verify-Peer false }" && \ + apt-get update && \ + apt-get install -y vim curl net-tools iputils-ping && \ + # mlnx ofed + apt-get install -y python libfuse2 dpatch libnl-3-dev autoconf libnl-route-3-dev pciutils libnuma1 libpci3 m4 libelf1 debhelper automake graphviz bison lsof kmod libusb-1.0-0 swig libmnl0 autotools-dev flex chrpath libltdl-dev && \ + cd /tmp && \ + tar -xvf MLNX_OFED_LINUX-5.4-3.5.8.0-ubuntu18.04-x86_64.tgz && \ + MLNX_OFED_LINUX-5.4-3.5.8.0-ubuntu18.04-x86_64/mlnxofedinstall --user-space-only --basic --without-fw-update -q && \ + cd - && \ + rm -rf /tmp/* && \ + apt-get clean && \ + mv /etc/apt/sources.list.bak /etc/apt/sources.list && \ + rm /etc/apt/apt.conf.d/00skip-verify-peer.conf + +# Add user ma-user (UID = 1000, GID = 100). +# A user group whose GID is 100 exists in the basic container image. User ma-user can directly run the following command: +RUN useradd -m -d /home/ma-user -s /bin/bash -g 100 -u 1000 ma-user + +# Copy the /home/ma-user/miniconda3 directory from the builder stage to the directory with the same name in the current container image. +COPY --chown=ma-user:100 --from=builder /home/ma-user/miniconda3 /home/ma-user/miniconda3 + +# Configure the default user and working directory of the container image. +USER ma-user +WORKDIR /home/ma-user + +# Configure the preset environment variables of the container image. +# Set PYTHONUNBUFFERED to 1 to prevent log loss. 
ENV PATH=/home/ma-user/miniconda3/bin:$PATH \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH \
    PYTHONUNBUFFERED=1
For details about how to write a Dockerfile, see the official Docker documentation.
Go to https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/, click Download, set Version to 5.4-3.5.8.0-LTS, OS Distribution Version to Ubuntu 18.04, and Architecture to x86_64, and download MLNX_OFED_LINUX-5.4-3.5.8.0-ubuntu18.04-x86_64.tgz.
context
├── Dockerfile
├── MLNX_OFED_LINUX-5.4-3.5.8.0-ubuntu18.04-x86_64.tgz
├── Miniconda3-py37_4.12.0-Linux-x86_64.sh
├── pip.conf
└── tensorflow_gpu-2.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
docker build . -t tensorflow:2.10.0-ofed-cuda11.2
Successfully tagged tensorflow:2.10.0-ofed-cuda11.2
# Replace {region-id}, {domain}, and the organization name deep-learning with the actual values.
sudo docker tag tensorflow:2.10.0-ofed-cuda11.2 swr.{region-id}.{domain}/deep-learning/tensorflow:2.10.0-ofed-cuda11.2
# Replace {region-id}, {domain}, and the organization name deep-learning with the actual values.
sudo docker push swr.{region-id}.{domain}/deep-learning/tensorflow:2.10.0-ofed-cuda11.2
After you submit the job creation request, the system automatically performs backend operations, such as downloading the container image and code directory and running the boot command. A training job takes a certain period of time to run, from tens of minutes to several hours, depending on the service logic and selected resources. After the training job is executed, logs similar to the following are output.
+This section describes how to enable an inference service to access the Internet.
+An inference service accesses the Internet in the following scenarios:
+Use the algorithm on the instance where the inference service is deployed to access the Internet.
+Interconnecting a VPC enables the ModelArts resource pool to exchange data with your VPC.
Before installing the forward proxy, purchase an ECS running the latest Ubuntu image and bind an EIP to it. Then log in to the ECS to install and configure the Squid forward proxy.
curl -sSL https://get.daocloud.io/docker | sh
docker pull ubuntu/squid
Create a host directory:
mkdir -p /etc/squid/
Add the whitelist.conf configuration file, which lists the addresses allowed to be accessed, for example:
.apig.cn-east-3.huaweicloudapis.com
Add the squid.conf configuration file, which includes the following:
# An ACL named 'whitelist', read from the whitelist file (Squid expects double quotes around file paths)
acl whitelist dstdomain "/etc/squid/whitelist.conf"

# Allow whitelisted URLs through
http_access allow whitelist

# Block the rest
http_access deny all

# Default port
http_port 3128
Set the permissions on the host directory and configuration files:
chmod -R 640 /etc/squid
docker run -d --name squid -e TZ=UTC -v /etc/squid:/etc/squid -p 3128:3128 ubuntu/squid:latest
docker exec -it squid bash
root@{container_id}:/# squid -k reconfigure
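Before wiring the proxy into your inference code, you can verify it from another host in the same VPC. A quick sketch (the IP address and subdomain are placeholders; the second request should be denied by Squid, typically with HTTP 403):

# Whitelisted domain: the request should pass through the proxy.
curl -x http://{proxy_server_private_ip}:3128 https://test.apig.cn-east-3.huaweicloudapis.com
# Non-whitelisted domain: Squid should deny the request.
curl -x http://{proxy_server_private_ip}:3128 https://www.example.com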
In the code, specify the private IP address and port of the proxy server, as shown in the following:
proxies = {
    "http": "http://{proxy_server_private_ip}:3128",
    "https": "http://{proxy_server_private_ip}:3128"
}
The following figure shows how to obtain the private IP address of a server.
+In the inference code, use the service URL to send a service request, for example:
https://e8a048ce25136addbbac23ce6132a.apig.cn-east-3.huaweicloudapis.com
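Putting the proxy settings and the service URL together, a minimal Python sketch follows (the URL, token header, and payload are placeholders; adapt them to your service's actual authentication mode):

import requests

proxies = {
    "http": "http://{proxy_server_private_ip}:3128",
    "https": "http://{proxy_server_private_ip}:3128"
}

# Placeholder service URL and credential.
url = "https://{service_id}.apig.cn-east-3.huaweicloudapis.com"
headers = {"X-Auth-Token": "{token}"}

resp = requests.post(url, json={"data": "..."}, headers=headers, proxies=proxies, timeout=30)
print(resp.status_code, resp.text)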
The end-to-end O&M of ModelArts inference services involves the entire AI process including algorithm development, service O&M, and service running.
+End-to-End O&M Process
+During the entire O&M process, service request failures and high resource usage are monitored. When the resource usage threshold is reached, the system will send an alarm notification to you.
+Advantages
End-to-end service O&M enables you to easily check the service running status at both peak and off-peak hours and monitor the health of real-time services in real time.
+Constraints
+End-to-end service O&M applies only to real-time services because Cloud Eye does not monitor batch or edge inference services.
+This section uses an occupant safety algorithm in travel as an example to describe how to use ModelArts for process-based service deployment and update, as well as automatic service O&M and monitoring.
+After the configuration, choose Cloud Service Monitoring > ModelArts in the navigation pane on the left to view the requests and resource usage of the real-time service.
+When an alarm is triggered based on the monitored data, the object who has subscribed to the target topic will receive a message notification.
+ModelArts provides ExeML for service developers, freeing you from model development and parameter tuning. With ExeML, you can finish an AI development project in just three steps, including data labeling, auto training, and service deployment.
+As an example of object detection, this section describes how to detect Yunbao, the mascot of HUAWEI CLOUD, to help you quickly get started with ModelArts. By using the built-in Yunbao image dataset, the system automatically trains and generates a detection model, and deploys the generated model as a real-time service. After the deployment is completed, you can use the real-time service to identify whether an input image contains Yunbao.
+Before you start, carefully complete the preparations described in Preparations. To use ExeML to build a model, perform the following steps:
On the ModelArts management console, choose Data Management > Datasets in the navigation pane on the left. On the displayed page, click Service Authorization to apply for authorization. If you log in with an account, a dialog box is displayed asking you to accept the authorization; accept it. If you log in as an IAM user (member account), your master account or a user with admin permissions grants the authorization to you.
ModelArts provides a sample Yunbao dataset named Yunbao-Data-Custom, which this example uses to build a model. Perform the following operations to upload the dataset to the OBS directory test-modelarts/dataset-yunbao created during preparations. The OBS bucket name test-modelarts is for reference only; replace it with your own bucket name.
+If you want to use your own dataset, skip this step, upload the dataset to the OBS folder, and select this directory in Step 2: Create an Object Detection Project.
+The obtained dataset has two directories: eval and train. The data stored in train is used for model training, and the data stored in eval is used for model prediction.
+The Yunbao dataset has two directories: eval and train. Select the data in the train directory for training. If the upper-layer directory of train is selected, an error message is displayed, indicating that OBS has invalid data. As a result, the project will fail to be created.
+For an object detection project, labeling data is to locate an object in an image and assign a label to the object. The labeled data is used for model training. In the Yunbao dataset, part of data has been labeled. You can label the unlabeled data for trial use.
+Data source synchronization is automatically performed when you create an ExeML project. Data source synchronization takes a certain period of time. If the synchronization fails, you can click Synchronize Data Source to manually execute the synchronization.
If you select free specifications, you do not need to set Auto Stop, because the node is automatically stopped after one hour.
+After the model is deployed, you can test the service using an image.
After the prediction is completed, the label name yunbao, location coordinates, and confidence score are displayed in the prediction result pane on the right. In the prediction result, detection_boxes indicates the location of the object, and detection_scores indicates the confidence score for yunbao.
+If the model accuracy does not meet your expectation, add images on the Label Data tab page, label the images, and train and deploy the model again.
A running real-time service keeps consuming resources. If you do not need to use the real-time service, click Stop in the Version Manager pane to stop the service and avoid unnecessary billing. If you want to use the service again, click Start.
Banks often predict whether customers will be interested in a time deposit based on customer characteristics, including age, occupation, marital status, education background, housing loans, and personal loans.
+Now, you can use the ExeML function on ModelArts to predict whether a customer would be interested in the time deposit. The process of using ExeML is as follows:
+In this example, the dataset is from the Machine Learning Repository of UCI. For details about the dataset, see Bank Marketing Data Set. Table 1 and Table 2 describe the parameters and sample data of the dataset. You can obtain the dataset from GitHub and upload it to OBS.
Table 1 Dataset parameters

| Parameter | Meaning | Type | Description |
|---|---|---|---|
| attr_1 | Age | Int | Age of the customer |
| attr_2 | Occupation | String | Occupation of the customer |
| attr_3 | Marital status | String | Marital status of the customer |
| attr_4 | Education background | String | Education background of the customer |
| attr_5 | Real estate | String | Real estate of the customer |
| attr_6 | Loan | String | Loan of the customer |
| attr_7 | Deposit | String | Deposit of the customer |
Table 2 Sample data

| attr_1 | attr_2 | attr_3 | attr_4 | attr_5 | attr_6 | attr_7 |
|---|---|---|---|---|---|---|
| 31 | blue-collar | married | secondary | yes | no | no |
| 41 | management | married | tertiary | yes | yes | no |
| 38 | technician | single | secondary | yes | no | no |
| 39 | technician | single | secondary | yes | no | yes |
| 39 | blue-collar | married | secondary | yes | no | no |
| 39 | services | single | unknown | yes | no | no |
For successful prediction, make sure that the label column is attr_7 and that the data in the label column is of the discrete type.
+The training takes a certain period of time. If you close or exit the page, the system continues training until it is complete.
+The following shows the test code. As shown in Figure 7, the prediction result is "predict": "no", indicating that the customer will not apply for a deposit.
{
    "data": {
        "count": 1,
        "req_data": [
            {
                "attr_1": "34",
                "attr_2": "blue-collar",
                "attr_3": "single",
                "attr_4": "tertiary",
                "attr_5": "no",
                "attr_6": "no"
            }
        ]
    }
}
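If you prefer calling the deployed real-time service from code instead of the console test tab, the following is a hedged sketch using the payload above (the endpoint URL and token are placeholders):

import requests

# Placeholder endpoint and credential of the deployed real-time service.
url = "https://{service_id}.apig.{region-id}.huaweicloudapis.com"
headers = {"X-Auth-Token": "{token}"}

payload = {
    "data": {
        "count": 1,
        "req_data": [{
            "attr_1": "34", "attr_2": "blue-collar", "attr_3": "single",
            "attr_4": "tertiary", "attr_5": "no", "attr_6": "no"
        }]
    }
}

resp = requests.post(url, json=payload, headers=headers, timeout=30)
print(resp.json())  # expect a result like {"predict": "no"}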
Certain ModelArts functions require the permission to access other services. This section describes how to assign specific permissions to IAM users when they use ModelArts.
+The permissions of IAM users are controlled by their tenant user. Logging in as a tenant user, you can assign permissions to the target user group through IAM. Then, the permissions are assigned to all members in the user group. The following authorization list uses the system-defined policies of ModelArts and other services as an example.
| Target Service | Description | IAM Permission | Mandatory |
|---|---|---|---|
| ModelArts | Assign permissions to IAM users for using ModelArts. Users with the ModelArts CommonOperations permission can only use resources, but cannot create, update, or delete any dedicated resource pool. You are advised to assign this permission to IAM users. | ModelArts CommonOperations | Yes |
| ModelArts | Users with the ModelArts FullAccess permission have all access permissions, including creating, updating, and deleting dedicated resource pools. Exercise caution when selecting this option. | ModelArts FullAccess | No. Select either ModelArts FullAccess or ModelArts CommonOperations. |
| Object Storage Service (OBS) | Assign permissions to IAM users for using OBS. ModelArts data management, development environments, training jobs, and model deployment require OBS for forwarding data. | OBS OperateAccess | Yes |
| Software Repository for Container (SWR) | Assign permissions to IAM users for using SWR. ModelArts custom images require the SWR FullAccess permission. | SWR OperateAccess | Yes |
| Key Management Service (KMS) | To use remote SSH of ModelArts notebook, IAM users require KMS authorization. | KMS CMKFullAccess | No |
| Intelligent EdgeFabric (IEF) | Assign permissions to IAM users for using IEF. Tenant administrator permissions are required so that ModelArts edge services depending on IEF can be used. | Tenant Administrator | No |
| Cloud Eye | Assign permissions to IAM users for using Cloud Eye. Using Cloud Eye, you can view the running statuses of ModelArts real-time services and AI application loads, and set monitoring alarms. | CES FullAccess | No |
| Simple Message Notification (SMN) | Assign permissions to IAM users for using SMN. SMN is used together with Cloud Eye. | SMN FullAccess | No |
| Virtual Private Cloud (VPC) | During the creation of a dedicated resource pool for ModelArts, IAM users require VPC permissions to customize networks. | VPC FullAccess | No |
| Scalable File Service (SFS) | - | SFS Turbo FullAccess, SFS FullAccess | - |
If you want to use an AI engine that is not supported by ModelArts, create a custom image for the engine, import the image to ModelArts, and use the image to create AI applications. This section describes how to use a custom image to create an AI application and deploy the application as a real-time service.
+The process is as follows:
This section uses a Linux x86_64 host as an example. You can purchase an ECS of the same specifications or use an existing local host to create a custom image.
curl -fsSL get.docker.com -o get-docker.sh
sh get-docker.sh
docker pull ubuntu:18.04
self-define-images/
    --Dockerfile
    --test_app.py
FROM ubuntu:18.04
# Configure the HUAWEI CLOUD source and install Python 3, pip, and Flask.
RUN cp -a /etc/apt/sources.list /etc/apt/sources.list.bak && \
    sed -i "s@http://.*security.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list && \
    sed -i "s@http://.*archive.ubuntu.com@http://repo.huaweicloud.com@g" /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y python3 python3-pip && \
    pip3 install --trusted-host repo.huaweicloud.com -i https://repo.huaweicloud.com/repository/pypi/simple Flask

# Copy the application code to the image.
COPY test_app.py /opt/test_app.py

# Specify the boot command of the image.
CMD python3 /opt/test_app.py
from flask import Flask, request
import json
app = Flask(__name__)

@app.route('/greet', methods=['POST'])
def say_hello_func():
    print("----------- in hello func ----------")
    data = json.loads(request.get_data(as_text=True))
    print(data)
    username = data['name']
    rsp_msg = 'Hello, {}!'.format(username)
    return json.dumps({"response": rsp_msg}, indent=4)

@app.route('/goodbye', methods=['GET'])
def say_goodbye_func():
    print("----------- in goodbye func ----------")
    return '\nGoodbye!\n'

@app.route('/', methods=['POST'])
def default_func():
    print("----------- in default func ----------")
    data = json.loads(request.get_data(as_text=True))
    return '\n called default func !\n {} \n'.format(str(data))

# host must be "0.0.0.0", port must be 8080
if __name__ == '__main__':
    app.run(host="0.0.0.0", port=8080)
docker build -t test:v1 .
docker run -it -p 8080:8080 test:v1
curl -X POST -H "Content-Type: application/json" --data '{"name":"Tom"}' 127.0.0.1:8080/
curl -X POST -H "Content-Type: application/json" --data '{"name":"Tom"}' 127.0.0.1:8080/greet
curl -X GET 127.0.0.1:8080/goodbye
If information similar to the following is displayed, the function verification is successful.
[{
    "url": "/",
    "method": "post",
    "request": {
        "Content-type": "application/json"
    },
    "response": {
        "Content-type": "application/json"
    }
},
{
    "url": "/greet",
    "method": "post",
    "request": {
        "Content-type": "application/json"
    },
    "response": {
        "Content-type": "application/json"
    }
},
{
    "url": "/goodbye",
    "method": "get",
    "request": {
        "Content-type": "application/json"
    },
    "response": {
        "Content-type": "application/json"
    }
}]
This section describes how to modify a local custom algorithm to train and deploy models on ModelArts.
+This case describes how to use PyTorch 1.8 to recognize handwritten digit images. An official MNIST dataset is used in this case.
Through this case, you can learn how to run training jobs, deploy an inference model, and perform prediction on ModelArts.
+Before performing the following operations, complete necessary operations. For details, see Preparations.
+Authorized User: Select All users.
+Agency: Select Add agency.
+Permissions: Select Common User.
An MNIST dataset downloaded from the MNIST official website is used in this case. Ensure that all four files are downloaded.
+If you are asked to enter the login information after you click the MNIST official website link, copy and paste this link in the address box of your browser: http://yann.lecun.com/exdb/mnist/
Login information is required when you open the link in HTTPS mode, but not when you open it in HTTP mode.
+In this case, ModelArts provides the training script, inference script, and inference configuration file.
When pasting the code, save it to a new .py file with UTF-8 encoding. Otherwise, the error message "SyntaxError: 'gbk' codec can't decode byte 0xa4 in position 324: illegal multibyte sequence" may be displayed.
+Create the training script train.py on the local host. The content is as follows:
+# base on https://github.com/pytorch/examples/blob/main/mnist/main.py + +from __future__ import print_function + +import os +import gzip +import codecs +import argparse +from typing import IO, Union + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchvision import datasets, transforms +from torch.optim.lr_scheduler import StepLR + +import shutil + + +# Define a network model. +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + + +# Train the model. Set the model to the training mode, load the training data, calculate the loss function, and perform gradient descent. +def train(args, model, device, train_loader, optimizer, epoch): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + data, target = data.to(device), target.to(device) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_loader.dataset), + 100. * batch_idx / len(train_loader), loss.item())) + if args.dry_run: + break + + +# Validate the model. Set the model to the validation mode, load the validation data, and calculate the loss function and accuracy. +def test(model, device, test_loader): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + test_loss += F.nll_loss(output, target, reduction='sum').item() + pred = output.argmax(dim=1, keepdim=True) + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + + print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_loader.dataset), + 100. * correct / len(test_loader.dataset))) + + +# The following is PyTorch MNIST. +# https://github.com/pytorch/vision/blob/v0.9.0/torchvision/datasets/mnist.py +def get_int(b: bytes) -> int: + return int(codecs.encode(b, 'hex'), 16) + + +def open_maybe_compressed_file(path: Union[str, IO]) -> Union[IO, gzip.GzipFile]: + """Return a file object that possibly decompresses 'path' on the fly. + Decompression occurs when argument `path` is a string and ends with '.gz' or '.xz'. 
+ """ + if not isinstance(path, torch._six.string_classes): + return path + if path.endswith('.gz'): + return gzip.open(path, 'rb') + if path.endswith('.xz'): + return lzma.open(path, 'rb') + return open(path, 'rb') + + +SN3_PASCALVINCENT_TYPEMAP = { + 8: (torch.uint8, np.uint8, np.uint8), + 9: (torch.int8, np.int8, np.int8), + 11: (torch.int16, np.dtype('>i2'), 'i2'), + 12: (torch.int32, np.dtype('>i4'), 'i4'), + 13: (torch.float32, np.dtype('>f4'), 'f4'), + 14: (torch.float64, np.dtype('>f8'), 'f8') +} + + +def read_sn3_pascalvincent_tensor(path: Union[str, IO], strict: bool = True) -> torch.Tensor: + """Read a SN3 file in "Pascal Vincent" format (Lush file 'libidx/idx-io.lsh'). + Argument may be a filename, compressed filename, or file object. + """ + # read + with open_maybe_compressed_file(path) as f: + data = f.read() + # parse + magic = get_int(data[0:4]) + nd = magic % 256 + ty = magic // 256 + assert 1 <= nd <= 3 + assert 8 <= ty <= 14 + m = SN3_PASCALVINCENT_TYPEMAP[ty] + s = [get_int(data[4 * (i + 1): 4 * (i + 2)]) for i in range(nd)] + parsed = np.frombuffer(data, dtype=m[1], offset=(4 * (nd + 1))) + assert parsed.shape[0] == np.prod(s) or not strict + return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s) + + +def read_label_file(path: str) -> torch.Tensor: + with open(path, 'rb') as f: + x = read_sn3_pascalvincent_tensor(f, strict=False) + assert(x.dtype == torch.uint8) + assert(x.ndimension() == 1) + return x.long() + + +def read_image_file(path: str) -> torch.Tensor: + with open(path, 'rb') as f: + x = read_sn3_pascalvincent_tensor(f, strict=False) + assert(x.dtype == torch.uint8) + assert(x.ndimension() == 3) + return x + + +def extract_archive(from_path, to_path): + to_path = os.path.join(to_path, os.path.splitext(os.path.basename(from_path))[0]) + with open(to_path, "wb") as out_f, gzip.GzipFile(from_path) as zip_f: + out_f.write(zip_f.read()) +# The above is pytorch mnist. 
+# --- end + + +# Raw MNIST dataset processing +def convert_raw_mnist_dataset_to_pytorch_mnist_dataset(data_url): + """ + raw + + {data_url}/ + train-images-idx3-ubyte.gz + train-labels-idx1-ubyte.gz + t10k-images-idx3-ubyte.gz + t10k-labels-idx1-ubyte.gz + + processed + + {data_url}/ + train-images-idx3-ubyte.gz + train-labels-idx1-ubyte.gz + t10k-images-idx3-ubyte.gz + t10k-labels-idx1-ubyte.gz + MNIST/raw + train-images-idx3-ubyte + train-labels-idx1-ubyte + t10k-images-idx3-ubyte + t10k-labels-idx1-ubyte + MNIST/processed + training.pt + test.pt + """ + resources = [ + "train-images-idx3-ubyte.gz", + "train-labels-idx1-ubyte.gz", + "t10k-images-idx3-ubyte.gz", + "t10k-labels-idx1-ubyte.gz" + ] + + pytorch_mnist_dataset = os.path.join(data_url, 'MNIST') + + raw_folder = os.path.join(pytorch_mnist_dataset, 'raw') + processed_folder = os.path.join(pytorch_mnist_dataset, 'processed') + + os.makedirs(raw_folder, exist_ok=True) + os.makedirs(processed_folder, exist_ok=True) + + print('Processing...') + + for f in resources: + extract_archive(os.path.join(data_url, f), raw_folder) + + training_set = ( + read_image_file(os.path.join(raw_folder, 'train-images-idx3-ubyte')), + read_label_file(os.path.join(raw_folder, 'train-labels-idx1-ubyte')) + ) + test_set = ( + read_image_file(os.path.join(raw_folder, 't10k-images-idx3-ubyte')), + read_label_file(os.path.join(raw_folder, 't10k-labels-idx1-ubyte')) + ) + with open(os.path.join(processed_folder, 'training.pt'), 'wb') as f: + torch.save(training_set, f) + with open(os.path.join(processed_folder, 'test.pt'), 'wb') as f: + torch.save(test_set, f) + + print('Done!') + + +def main(): + # Define the preset running parameters of the training job. + parser = argparse.ArgumentParser(description='PyTorch MNIST Example') + + parser.add_argument('--data_url', type=str, default=False, + help='mnist dataset path') + parser.add_argument('--train_url', type=str, default=False, + help='mnist model path') + + parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') + parser.add_argument('--epochs', type=int, default=14, metavar='N', + help='number of epochs to train (default: 14)') + parser.add_argument('--lr', type=float, default=1.0, metavar='LR', + help='learning rate (default: 1.0)') + parser.add_argument('--gamma', type=float, default=0.7, metavar='M', + help='Learning rate step gamma (default: 0.7)') + parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') + parser.add_argument('--dry-run', action='store_true', default=False, + help='quickly check a single pass') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait before logging training status') + parser.add_argument('--save-model', action='store_true', default=True, + help='For Saving the current Model') + args = parser.parse_args() + + use_cuda = not args.no_cuda and torch.cuda.is_available() + + torch.manual_seed(args.seed) + + # Set whether to use GPU or CPU to run the algorithm. 
+ device = torch.device("cuda" if use_cuda else "cpu") + + train_kwargs = {'batch_size': args.batch_size} + test_kwargs = {'batch_size': args.test_batch_size} + if use_cuda: + cuda_kwargs = {'num_workers': 1, + 'pin_memory': True, + 'shuffle': True} + train_kwargs.update(cuda_kwargs) + test_kwargs.update(cuda_kwargs) + + # Define the data preprocessing method. + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ]) + + # Convert the raw MNIST dataset to a PyTorch MNIST dataset. + convert_raw_mnist_dataset_to_pytorch_mnist_dataset(args.data_url) + + # Create a training dataset and a validation dataset. + dataset1 = datasets.MNIST(args.data_url, train=True, download=False, + transform=transform) + dataset2 = datasets.MNIST(args.data_url, train=False, download=False, + transform=transform) + + # Create iterators for the training dataset and the validation dataset. + train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs) + test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) + + # Initialize the neural network model and copy the model to the compute device. + model = Net().to(device) + # Define the training optimizer and learning rate for gradient descent calculation. + optimizer = optim.Adadelta(model.parameters(), lr=args.lr) + scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) + + # Train the neural network and perform validation in each epoch. + for epoch in range(1, args.epochs + 1): + train(args, model, device, train_loader, optimizer, epoch) + test(model, device, test_loader) + scheduler.step() + + # Save the model and make it adapted to the ModelArts inference model package specifications. + if args.save_model: + + # Create the model directory in the path specified in train_url. + model_path = os.path.join(args.train_url, 'model') + os.makedirs(model_path, exist_ok = True) + + # Save the model to the model directory based on the ModelArts inference model package specifications. + torch.save(model.state_dict(), os.path.join(model_path, 'mnist_cnn.pt')) + + # Copy the inference code and configuration file to the model directory. + the_path_of_current_file = os.path.dirname(__file__) + shutil.copyfile(os.path.join(the_path_of_current_file, 'infer/customize_service.py'), os.path.join(model_path, 'customize_service.py')) + shutil.copyfile(os.path.join(the_path_of_current_file, 'infer/config.json'), os.path.join(model_path, 'config.json')) + +if __name__ == '__main__': + main()+
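Before uploading train.py, you can smoke-test it locally. A sketch, assuming PyTorch 1.8 and torchvision are installed, the four MNIST .gz files sit in ./mnist-data, and the infer/ folder with customize_service.py and config.json sits next to train.py:

# Local paths are examples only; --dry-run stops training after a single batch for a quick check.
python train.py --data_url ./mnist-data --train_url ./mnist-output --epochs 1 --dry-run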
Create the inference script customize_service.py on the local host. The content is as follows:
+import os +import log +import json + +import torch.nn.functional as F +import torch.nn as nn +import torch +import torchvision.transforms as transforms + +import numpy as np +from PIL import Image + +from model_service.pytorch_model_service import PTServingBaseService + +logger = log.getLogger(__name__) + +# Define model preprocessing. +infer_transformation = transforms.Compose([ + transforms.Resize(28), + transforms.CenterCrop(28), + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) +]) + +# Model inference service +class PTVisionService(PTServingBaseService): + + def __init__(self, model_name, model_path): + # Call the constructor of the parent class. + super(PTVisionService, self).__init__(model_name, model_path) + + # Call the customized function to load the model. + self.model = Mnist(model_path) + + # Load labels. + self.label = [0,1,2,3,4,5,6,7,8,9] + + # Receive the request data and convert it to the input format acceptable to the model. + def _preprocess(self, data): + preprocessed_data = {} + for k, v in data.items(): + input_batch = [] + for file_name, file_content in v.items(): + with Image.open(file_content) as image1: + # Gray processing + image1 = image1.convert("L") + if torch.cuda.is_available(): + input_batch.append(infer_transformation(image1).cuda()) + else: + input_batch.append(infer_transformation(image1)) + input_batch_var = torch.autograd.Variable(torch.stack(input_batch, dim=0), volatile=True) + print(input_batch_var.shape) + preprocessed_data[k] = input_batch_var + + return preprocessed_data + + # Post-process the inference result to obtain the expected output format. The result is the returned value. + def _postprocess(self, data): + results = [] + for k, v in data.items(): + result = torch.argmax(v[0]) + result = {k: self.label[result]} + results.append(result) + return results + + # Perform forward inference on the input data to obtain the inference result. + def _inference(self, data): + + result = {} + for k, v in data.items(): + result[k] = self.model(v) + + return result + +# Define a network. +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + + +def Mnist(model_path, **kwargs): + # Generate a network. + model = Net() + + # Load the model. + if torch.cuda.is_available(): + device = torch.device('cuda') + model.load_state_dict(torch.load(model_path, map_location="cuda:0")) + else: + device = torch.device('cpu') + model.load_state_dict(torch.load(model_path, map_location=device)) + + # CPU or GPU mapping + model.to(device) + + # Turn the model to inference mode. + model.eval() + + return model+
Create the inference configuration file config.json on the local host. The content is as follows:
{
    "model_algorithm": "image_classification",
    "model_type": "PyTorch",
    "runtime": "pytorch_1.8.0-cuda_10.2-py_3.7-ubuntu_18.04-x86_64"
}
Upload the data, training script, inference code file, and inference configuration file obtained in Step 2 to an OBS bucket. When the training job runs on ModelArts, it reads the data and code files from the OBS bucket.
{OBS bucket}                  # OBS bucket name, which is customizable, for example, test-modelarts-xx
    -{OBS folder}             # OBS folder name, which is customizable, for example, pytorch
        - mnist-data          # OBS folder for storing the training dataset. The folder name is customizable, for example, mnist-data.
        - mnist-code          # OBS folder for storing the training script train.py. The folder name is customizable, for example, mnist-code.
            - infer           # OBS folder for storing the inference script customize_service.py and the configuration file config.json.
        - mnist-output        # OBS folder for storing trained models. The folder name is customizable, for example, mnist-output.
The sample code runs on a single node with a single card. If you select a flavor with multiple GPUs, the training will fail.
+In this case, the training job will take more than 10 minutes.
+After the model training is complete, create an AI application and deploy it as a real-time service.
+Choose Training Job for Meta Model Source. Select the training job completed in Step 4 Create a Training Job from the drop-down list and select Dynamic loading. The values of AI Engine will be automatically configured.
+If you have used Training Jobs of an old version, you can see both Training Jobs and Training Jobs New below Training job. In this case, select Training Jobs New.
+After you submit the service deployment request, the system automatically switches to the Real-Time Services page. When the service status changes to Running, the service has been deployed.
+After the prediction is complete, the prediction result is displayed in the Test Result pane. According to the prediction result, the digit on the image is 2.
The MNIST dataset used in this case is a simple dataset for demonstration, and the algorithm is a simple neural network for teaching purposes. Models generated from such data and algorithms are applicable only to teaching, not to complex prediction scenarios. The prediction is accurate only if the input image is similar to the images in the training dataset (white characters on a black background).
If the training job keeps queuing, the selected resources in the resource pool are limited and the job must wait for resources. For details, see Why Is a Training Job Always Queuing.
+Ensure that the created bucket is in the same region as ModelArts. For details, see Incorrect OBS Path on ModelArts.
+ModelArts allows you to configure fine-grained permissions for refined management of resources and permissions. This is commonly used by large enterprises, but it is complex for individual users. It is recommended that individual users configure permissions for using ModelArts by referring to Assigning Permissions to Individual Users for Using ModelArts.
+If you meet any of the following conditions, read this document.
ModelArts uses Identity and Access Management (IAM) for most permissions management. Before reading further, learn the Basic Concepts to better understand this document.
+To implement fine-grained permissions management, ModelArts provides permission control, agency authorization, and workspace. The following describes the details.
+Exposed ModelArts functions are controlled through IAM permissions. For example, if you as an IAM user need to create a training job on ModelArts, you must have the modelarts:trainJob:create permission. For details about how to assign permissions to a user (you need to add the user to a user group and then assign permissions to the user group), see Permissions Management.
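For example, a minimal custom policy that grants only this action might look like the following sketch (the structure follows the policy format described later in this document; adapt the action list to your needs):

{
    "Version": "1.1",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "modelarts:trainJob:create"
            ]
        }
    ]
}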
+ModelArts must access other services for AI computing. For example, ModelArts must access OBS to read your data for training. For security purposes, ModelArts must be authorized to access other cloud services. This is agency authorization.
+The following summarizes permissions management:
+By default, new IAM users do not have any permissions assigned. You need to add a user to one or more groups, and assign permissions policies or roles to these groups. Users inherit permissions of the groups to which they are added. This process is called authorization. After authorization, users can perform operations on ModelArts based on permissions.
ModelArts is a project-level service deployed and accessed in specific physical regions. When you authorize an agency, you can set the scope of the selected permissions to all resources, enterprise projects, or region-specific projects. If you specify region-specific projects, the selected permissions apply only to resources in these projects.
+For details, see Creating a User Group and Assigning Permissions.
+When assigning permissions to a user group, IAM does not directly assign specific permissions to the user group. Instead, IAM needs to add the permissions to a policy and then assign the policy to the user group. To facilitate user permissions management, each cloud service provides some preset policies for you to directly use. If the preset policies cannot meet your requirements of fine-grained permissions management, you can customize policies.
| Policy | Description | Type |
|---|---|---|
| ModelArts FullAccess | Administrator permissions for ModelArts. Users granted these permissions can operate and use ModelArts. | System-defined policy |
| ModelArts CommonOperations | Common user permissions for ModelArts. Users granted these permissions can operate and use ModelArts, but cannot manage dedicated resource pools. | System-defined policy |
Generally, ModelArts FullAccess is assigned only to administrators. If fine-grained management is not required, assigning ModelArts CommonOperations to all users will meet the development requirements of most small teams. If you want to customize policies for fine-grained permissions management, see IAM.
+When you assign ModelArts permissions to a user, the system does not automatically assign the permissions of other services to the user. This ensures security and prevents unexpected unauthorized operations. In this case, however, you must separately assign permissions of different services to users so that they can perform some ModelArts operations.
+For example, if an IAM user needs to use OBS data for training and the ModelArts training permission has been configured for the IAM user, the IAM user still needs to be assigned with the OBS read, write, and list permissions. The OBS list permission allows you to select the training data path on ModelArts. The read permission is used to preview data and read data for training. The write permission is used to save training results and logs.
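As a sketch, the OBS part of such a custom policy could combine the list, read, and write actions used elsewhere in this document (an illustrative subset, not a complete policy):

{
    "Version": "1.1",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "obs:bucket:ListAllMyBuckets",
                "obs:bucket:ListBucket",
                "obs:object:GetObject",
                "obs:object:PutObject"
            ]
        }
    ]
}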
+ModelArts must be authorized by users to access other cloud services for AI computing. In the IAM permission system, such authorization is performed through agencies.
+For details about the basic concepts and operations of agencies, see Cloud Service Delegation.
+To simplify agency authorization, ModelArts supports automatic agency authorization configuration. You only need to configure an agency for yourself or specified users on the Global Configuration page of the ModelArts console.
On the Global Configuration page of the ModelArts console, after you click Add Authorization, you can configure an agency for a specific user or all users. By default, an agency named modelarts_agency_<Username>_Random ID is created. In the Permissions area, you can select the preset permission configuration or the required policies. If neither option meets your requirements, create an agency on the IAM management page (delegating ModelArts to access your resources), and then select that existing agency instead of adding a new one on the Add Authorization page.
+ModelArts associates multiple users with one agency. This means that if two users need to configure the same agency, you do not need to create an agency for each user. Instead, you only need to configure the same agency for the two users.
Each user can use ModelArts only after being associated with an agency. However, even if the permissions assigned to the agency are insufficient, no error is reported when an API is called; an error occurs only when the system actually uses the unauthorized function. For example, you enable message notification when creating a training job; message notification requires SMN authorization, but an error occurs only when messages need to be sent for the training job. The system ignores some errors, while others may cause job failures. When you minimize permissions, ensure that you still have sufficient permissions for the required operations on ModelArts.
+In strict authorization mode, explicit authorization by the account administrator is required for IAM users to access ModelArts. The administrator can add the required ModelArts permissions to common users through authorization policies.
+In non-strict authorization mode, IAM users can use ModelArts without explicit authorization. The administrator needs to configure the deny policy for IAM users to prevent them from using some ModelArts functions.
+The administrator can change the authorization mode on the Global Configuration page.
+The strict authorization mode is recommended. In this mode, IAM users must be authorized to use ModelArts functions. In this way, the permission scope of IAM users can be accurately controlled, minimizing permissions granted to IAM users.
+Workspace enables enterprise customers to split their resources into multiple spaces that are logically isolated and to manage access to different spaces. As an enterprise user, you can submit the request for enabling the workspace function to your technical support manager.
+After workspace is enabled, a default workspace is created. All resources you have created are in this workspace. A workspace is like a ModelArts twin. You can switch between workspaces in the upper left corner of the ModelArts console. Jobs in different workspaces do not affect each other.
+When creating a workspace, you must bind it to an enterprise project. Multiple workspaces can be bound to the same enterprise project, but one workspace cannot be bound to multiple enterprise projects. You can use workspaces for refined restrictions on resource access and permissions of different users. The restrictions are as follows:
+Key features of ModelArts permissions management:
+This section describes the IAM permission configurations for all ModelArts functions.
+If no fine-grained authorization policy is configured for a user created by the administrator, the user has all permissions of ModelArts by default. To control user permissions, the administrator needs to add the user to a user group on IAM and configure fine-grained authorization policies for the user group. In this way, the user obtains the permissions defined in the policies before performing operations on cloud service resources.
+You can grant users permissions by using roles and policies.
+ModelArts does not support role-based authorization. It supports only policy-based authorization.
+Policy Structure
+A policy consists of a version and one or more statements (indicating different actions).
+Policy Parameters
+The following describes policy parameters. You can create custom policies by specifying the parameters. For details, see Custom Policy Use Cases.
A Statement is the authorization statement of a policy and consists of Effect, Action, Condition, and Resource.

| Parameter | Description | Value |
|---|---|---|
| Version | Policy version | 1.1: indicates policy-based access control. |
| Statement: Effect | Whether to allow or deny the operations defined in the action | Allow; Deny |
| Statement: Action | Operation to be performed on the service | Format: "Service name:Resource type:Action". Wildcard characters (*) are supported, indicating all options. Example: modelarts:notebook:list indicates the permission to view a notebook instance list, where modelarts is the service name, notebook the resource type, and list the operation. View all actions of a service in its API Reference. |
| Statement: Condition | Condition for a policy to take effect, including condition keys and operators | Format: "Condition operator:{Condition key:[Value 1,Value 2]}". If you set multiple conditions, the policy takes effect only when all of them are met. Example: "StringEndWithIfExists":{"g:UserName":["specialCharacter"]} makes the statement valid for users whose names end with specialCharacter. |
| Statement: Resource | Resources on which a policy takes effect | Format: "Service name:Region:Account ID:Resource type:Resource path". Wildcard characters (*) are supported, indicating all resources. NOTE: ModelArts authorization does not allow you to specify a resource path. |
During policy-based authorization, the administrator can select the authorization scope based on ModelArts resource types. The following table lists the resource types supported by ModelArts:
| Resource Type | Description |
|---|---|
| notebook | Notebook instances in DevEnviron |
| exemlProject | ExeML projects |
| exemlProjectInf | ExeML-powered real-time inference service |
| exemlProjectTrain | ExeML-powered training jobs |
| exemlProjectVersion | ExeML project version |
| workflow | Workflow |
| pool | Dedicated resource pool |
| network | Networking of a dedicated resource pool |
| trainJob | Training job |
| trainJobLog | Runtime logs of a training job |
| trainJobInnerModel | Preset model |
| trainJobVersion | Version of a training job (supported by old-version training jobs that will be discontinued soon) |
| trainConfig | Configuration of a training job (supported by old-version training jobs that will be discontinued soon) |
| tensorboard | Visualization job of training results (supported by old-version training jobs that will be discontinued soon) |
| model | Models |
| service | Real-time service |
| nodeservice | Edge service |
| workspace | Workspace |
| dataset | Dataset |
| dataAnnotation | Dataset labels |
| aiAlgorithm | Algorithm for training jobs |
| image | Image |
For details, see "Permissions Policies and Supported Actions" in ModelArts API Reference.
Function Dependency Policies
+When using ModelArts to develop algorithms or manage training jobs, you are required to use other Cloud services. For example, before submitting a training job, select an OBS path for storing the dataset and logs, respectively. Therefore, when configuring fine-grained authorization policies for a user, the administrator must configure dependent permissions so that the user can use required functions.
+If you use ModelArts as the root user (default IAM user with the same name as the account), the root user has all permissions by default.
| Application Scenario | Dependent Service | Dependent Policy | Supported Function |
|---|---|---|---|
| Global configuration | IAM | iam:users:listUsers | Obtain a user list. This action is required by the administrator only. |
| Basic function | IAM | iam:tokens:assume | (Mandatory) Use an agency to obtain temporary authentication credentials. |
| Basic function | BSS | bss:balance:view | Show the balance of the current account on the page after resources are created on the ModelArts console. |
| Application Scenario | Dependent Service | Dependent Policy | Supported Function |
|---|---|---|---|
| Workspace | IAM | iam:users:listUsers | Authorize an IAM user to use a workspace. |
| Workspace | ModelArts | modelarts:*:*delete* | Clear resources in a workspace when deleting it. |
+| Application Scenario | Dependent Service | Dependent Policy | Supported Function |
+|---|---|---|---|
+| Lifecycle management of development environment instances | ModelArts | modelarts:notebook:create, modelarts:notebook:list, modelarts:notebook:get, modelarts:notebook:update, modelarts:notebook:delete, modelarts:notebook:start, modelarts:notebook:stop, modelarts:notebook:updateStopPolicy, modelarts:image:delete, modelarts:image:list, modelarts:image:create, modelarts:image:get, modelarts:pool:list, modelarts:tag:list, modelarts:network:get, aom:metric:get, aom:metric:list, aom:alarm:list | Start, stop, create, delete, and update an instance. |
+| Dynamically mounting storage | ModelArts | modelarts:notebook:listMountedStorages, modelarts:notebook:mountStorage, modelarts:notebook:getMountedStorage, modelarts:notebook:umountStorage | Dynamically mount storage. |
+| Dynamically mounting storage | OBS | obs:bucket:ListAllMyBuckets, obs:bucket:ListBucket | Dynamically mount storage. |
+| Image management | ModelArts | modelarts:image:register, modelarts:image:listGroup | Register and view an image on the Image Management page. |
+| Saving an image | SWR | SWR Admin | The SWR Admin policy contains the maximum scope of SWR permissions, which covers the operations required for saving an image. |
+| Using the SSH function | ECS | ecs:serverKeypairs:list, ecs:serverKeypairs:get, ecs:serverKeypairs:delete, ecs:serverKeypairs:create | Configure a login key for a notebook instance. |
+| Mounting an SFS Turbo file system | SFS Turbo | SFS Turbo FullAccess | Read and write an SFS directory as an IAM user. Mount an SFS file system that is not created by you to a notebook instance using a dedicated resource pool. |
+| Viewing all instances | ModelArts | modelarts:notebook:listAllNotebooks | View development environment instances of all users on the ModelArts management console. This action is required by the development environment instance administrator. |
+| Viewing all instances | IAM | iam:users:listUsers | View development environment instances of all users on the ModelArts management console. |
+| Local VS Code plug-in or PyCharm Toolkit | ModelArts | modelarts:notebook:listAllNotebooks, modelarts:trainJob:create, modelarts:trainJob:list, modelarts:trainJob:update, modelarts:trainJobVersion:delete, modelarts:trainJob:get, modelarts:trainJob:logExport, modelarts:workspace:getQuotas (required if the workspace function is enabled) | Access a notebook instance from local VS Code and submit training jobs. |
+| Local VS Code plug-in or PyCharm Toolkit | OBS | obs:bucket:ListAllMyBuckets, obs:bucket:HeadBucket, obs:bucket:ListBucket, obs:bucket:GetBucketLocation, obs:object:GetObject, obs:object:GetObjectVersion, obs:object:PutObject, obs:object:DeleteObject, obs:object:DeleteObjectVersion, obs:object:ListMultipartUploadParts, obs:object:AbortMultipartUpload, obs:object:GetObjectAcl, obs:object:GetObjectVersionAcl, obs:bucket:PutBucketAcl, obs:object:PutObjectAcl, obs:object:ModifyObjectMetaData | Access a notebook instance from local VS Code and submit training jobs. |
+| Local VS Code plug-in or PyCharm Toolkit | IAM | iam:projects:listProjects | Obtain an IAM project list through local PyCharm for access configurations. |
+
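+As an illustration, the instance lifecycle actions above can be combined into a custom policy such as the following sketch. Only the notebook lifecycle actions are included here; add the image, pool, network, and AOM actions from the table if those functions are needed.
+{
+    "Version": "1.1",
+    "Statement": [
+        {
+            "Effect": "Allow",
+            "Action": [
+                "modelarts:notebook:create",
+                "modelarts:notebook:list",
+                "modelarts:notebook:get",
+                "modelarts:notebook:update",
+                "modelarts:notebook:delete",
+                "modelarts:notebook:start",
+                "modelarts:notebook:stop"
+            ]
+        }
+    ]
+}
+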
+| Application Scenario | Dependent Service | Dependent Policy | Supported Function |
+|---|---|---|---|
+| Training management | ModelArts | modelarts:trainJob:*, modelarts:trainJobLog:*, modelarts:aiAlgorithm:*, modelarts:image:list | Create a training job and view training logs. |
+| Training management | ModelArts | modelarts:workspace:getQuotas | Obtain a workspace quota. This policy is required if the workspace function is enabled. |
+| Training management | ModelArts | modelarts:tag:list | Use Tag Management Service (TMS) in a training job. |
+| Training management | IAM | iam:credentials:listCredentials, iam:agencies:listAgencies | Use the configured agency authorization. |
+| Training management | SFS Turbo | sfsturbo:shares:getShare, sfsturbo:shares:getAllShares | Use SFS Turbo in a training job. |
+| Training management | SWR | swr:repository:listTags, swr:repository:getRepository, swr:repository:listRepositories | Use a custom image to create a training job. |
+| Training management | SMN | smn:topic:publish, smn:topic:list | Notify training job status changes through SMN. |
+| Training management | OBS | obs:bucket:ListAllMyBuckets, obs:bucket:HeadBucket, obs:bucket:ListBucket, obs:bucket:GetBucketLocation, obs:object:GetObject, obs:object:GetObjectVersion, obs:object:PutObject, obs:object:DeleteObject, obs:object:DeleteObjectVersion, obs:object:ListMultipartUploadParts, obs:object:AbortMultipartUpload, obs:object:GetObjectAcl, obs:object:GetObjectVersionAcl, obs:bucket:PutBucketAcl, obs:object:PutObjectAcl, obs:object:ModifyObjectMetaData | Run a training job using a dataset in an OBS bucket. |
+
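+For example, the core training-management actions above could be granted with a custom policy like this sketch; the IAM, SFS Turbo, SWR, SMN, and OBS dependencies from the table still need to be granted separately.
+{
+    "Version": "1.1",
+    "Statement": [
+        {
+            "Effect": "Allow",
+            "Action": [
+                "modelarts:trainJob:*",
+                "modelarts:trainJobLog:*",
+                "modelarts:aiAlgorithm:*",
+                "modelarts:image:list"
+            ]
+        }
+    ]
+}
+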
+| Application Scenario | Dependent Service | Dependent Policy | Supported Function |
+|---|---|---|---|
+| Using a dataset | ModelArts | modelarts:dataset:getDataset, modelarts:dataset:createDataset, modelarts:dataset:createDatasetVersion, modelarts:dataset:createImportTask, modelarts:dataset:updateDataset, modelarts:processTask:createProcessTask, modelarts:processTask:getProcessTask, modelarts:dataset:listDatasets | Use ModelArts datasets in a workflow. |
+| Managing AI applications | ModelArts | modelarts:model:list, modelarts:model:get, modelarts:model:create, modelarts:model:delete, modelarts:model:update | Manage ModelArts AI applications in a workflow. |
+| Deploying a service | ModelArts | modelarts:service:get, modelarts:service:create, modelarts:service:update, modelarts:service:delete, modelarts:service:getLogs | Manage ModelArts real-time services in a workflow. |
+| Training jobs | ModelArts | modelarts:trainJob:get, modelarts:trainJob:create, modelarts:trainJob:list, modelarts:trainJobVersion:list, modelarts:trainJobVersion:create, modelarts:trainJob:delete, modelarts:trainJobVersion:delete, modelarts:trainJobVersion:stop | Manage ModelArts training jobs in a workflow. |
+| Workspace | ModelArts | modelarts:workspace:get, modelarts:workspace:getQuotas | Use ModelArts workspaces in a workflow. |
+| Managing data | OBS | obs:bucket:ListAllMyBuckets (Obtaining a bucket list), obs:bucket:HeadBucket (Obtaining bucket metadata), obs:bucket:ListBucket (Listing objects in a bucket), obs:bucket:GetBucketLocation (Obtaining the bucket location), obs:object:GetObject (Obtaining object content and metadata), obs:object:GetObjectVersion (Obtaining object content and metadata), obs:object:PutObject (Uploading objects using PUT or POST, copying objects, appending to an object, initializing a multipart task, uploading parts, and merging parts), obs:object:DeleteObject (Deleting an object or batch deleting objects), obs:object:DeleteObjectVersion (Deleting an object or batch deleting objects), obs:object:ListMultipartUploadParts (Listing uploaded parts), obs:object:AbortMultipartUpload (Aborting multipart uploads), obs:object:GetObjectAcl (Obtaining an object ACL), obs:object:GetObjectVersionAcl (Obtaining an object ACL), obs:bucket:PutBucketAcl (Configuring a bucket ACL), obs:object:PutObjectAcl (Configuring an object ACL) | Use OBS data in a workflow. |
+| Executing a workflow | IAM | iam:users:listUsers (Obtaining users), iam:agencies:getAgency (Obtaining details about a specified agency), iam:tokens:assume (Obtaining an agency token) | Call other ModelArts services when the workflow is running. |
+| Integrating DLI | DLI | dli:jobs:get (Obtaining job details), dli:jobs:list_all (Viewing a job list), dli:jobs:create (Creating a job) | Integrate DLI into a workflow. |
+| Integrating MRS | MRS | mrs:job:get (Obtaining job details), mrs:job:submit (Creating and executing a job), mrs:job:list (Viewing a job list), mrs:job:stop (Stopping a job), mrs:job:batchDelete (Batch deleting jobs), mrs:file:list (Viewing a file list) | Integrate MRS into a workflow. |
+
+| Application Scenario | Dependent Service | Dependent Policy | Supported Function |
+|---|---|---|---|
+| Managing AI applications | SWR | swr:repository:deleteRepository, swr:repository:deleteTag, swr:repository:getRepository, swr:repository:listTags | Import a model from a custom image. Use a custom engine when importing a model from OBS. |
+| Managing AI applications | OBS | obs:bucket:ListAllMyBuckets (Obtaining a bucket list), obs:bucket:HeadBucket (Obtaining bucket metadata), obs:bucket:ListBucket (Listing objects in a bucket), obs:bucket:GetBucketLocation (Obtaining the bucket location), obs:object:GetObject (Obtaining object content and metadata), obs:object:GetObjectVersion (Obtaining object content and metadata), obs:object:PutObject (Uploading objects using PUT or POST, copying objects, appending to an object, initializing a multipart task, uploading parts, and merging parts), obs:object:DeleteObject (Deleting an object or batch deleting objects), obs:object:DeleteObjectVersion (Deleting an object or batch deleting objects), obs:object:ListMultipartUploadParts (Listing uploaded parts), obs:object:AbortMultipartUpload (Aborting multipart uploads), obs:object:GetObjectAcl (Obtaining an object ACL), obs:object:GetObjectVersionAcl (Obtaining an object ACL), obs:bucket:PutBucketAcl (Configuring a bucket ACL), obs:object:PutObjectAcl (Configuring an object ACL) | Import a model from a template. Specify an OBS path for model conversion. |
+
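+As a sketch, the SWR dependency above (needed for importing a model from a custom image) maps to a custom policy like the following:
+{
+    "Version": "1.1",
+    "Statement": [
+        {
+            "Effect": "Allow",
+            "Action": [
+                "swr:repository:deleteRepository",
+                "swr:repository:deleteTag",
+                "swr:repository:getRepository",
+                "swr:repository:listTags"
+            ]
+        }
+    ]
+}
+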
+| Application Scenario | Dependent Service | Dependent Policy | Supported Function |
+|---|---|---|---|
+| Deploying a service | LTS | lts:logs:list (Obtaining the log list) | Show LTS logs. |
+| Batch services | OBS | obs:object:GetObject (Obtaining object content and metadata), obs:object:PutObject (Uploading objects using PUT or POST, copying objects, appending to an object, initializing a multipart task, uploading parts, and merging parts), obs:bucket:CreateBucket (Creating a bucket), obs:bucket:ListBucket (Listing objects in a bucket), obs:bucket:ListAllMyBuckets (Obtaining a bucket list) | Create a batch service. |
+| Edge services | CES | ces:metricData:list (Obtaining metric data) | View monitoring metrics. |
+| Edge services | IEF | ief:deployment:delete (Deleting a deployment) | Manage edge services. |
+
+| Application Scenario | Dependent Service | Dependent Policy | Supported Function |
+|---|---|---|---|
+| Managing datasets and labels | OBS | obs:bucket:ListBucket (Listing objects in a bucket), obs:object:GetObject (Obtaining object content and metadata), obs:object:PutObject (Uploading objects using PUT or POST, copying objects, appending to an object, initializing a multipart task, uploading parts, and merging parts), obs:object:DeleteObject (Deleting an object or batch deleting objects), obs:bucket:HeadBucket (Obtaining bucket metadata), obs:bucket:GetBucketAcl (Obtaining a bucket ACL), obs:bucket:PutBucketAcl (Configuring a bucket ACL), obs:bucket:GetBucketPolicy (Obtaining a bucket policy), obs:bucket:PutBucketPolicy (Configuring a bucket policy), obs:bucket:DeleteBucketPolicy (Deleting a bucket policy), obs:bucket:PutBucketCORS (Configuring or deleting CORS rules of a bucket), obs:bucket:GetBucketCORS (Obtaining the CORS rules of a bucket), obs:object:PutObjectAcl (Configuring an object ACL) | Manage datasets in OBS. Label OBS data. Create a data management job. |
+| Managing table datasets | DLI | dli:database:displayAllDatabases, dli:database:displayAllTables, dli:table:describe_table | Manage DLI data in a dataset. |
+| Managing table datasets | DWS | dws:openAPICluster:list, dws:openAPICluster:getDetail | Manage DWS data in a dataset. |
+| Managing table datasets | MRS | mrs:job:submit, mrs:job:list, mrs:cluster:list, mrs:cluster:get | Manage MRS data in a dataset. |
+| Auto labeling | ModelArts | modelarts:service:list, modelarts:model:list, modelarts:model:get, modelarts:model:create, modelarts:trainJobInnerModel:list, modelarts:workspace:get, modelarts:workspace:list | Enable auto labeling. |
+| Team labeling | IAM | iam:projects:listProjects (Obtaining tenant projects), iam:users:listUsers (Obtaining users), iam:agencies:createAgency (Creating an agency), iam:quotas:listQuotasForProject (Obtaining the quotas of a project) | Manage labeling teams. |
+
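+For example, the auto labeling dependencies above could be expressed as a custom policy like this sketch:
+{
+    "Version": "1.1",
+    "Statement": [
+        {
+            "Effect": "Allow",
+            "Action": [
+                "modelarts:service:list",
+                "modelarts:model:list",
+                "modelarts:model:get",
+                "modelarts:model:create",
+                "modelarts:trainJobInnerModel:list",
+                "modelarts:workspace:get",
+                "modelarts:workspace:list"
+            ]
+        }
+    ]
+}
+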
+| Application Scenario | Dependent Service | Dependent Policy | Supported Function |
+|---|---|---|---|
+| Managing resource pools | BSS | bss:coupon:view, bss:order:view, bss:balance:view, bss:discount:view, bss:renewal:view, bss:bill:view, bss:contract:update, bss:order:pay, bss:unsubscribe:update, bss:renewal:update, bss:order:update | Create, renew, and unsubscribe from a resource pool. Dependent permissions must be configured in the IAM project view. |
+| Managing resource pools | ECS | ecs:availabilityZones:list | Show AZs. Dependent permissions must be configured in the IAM project view. |
+| Network management | VPC | vpc:routes:create, vpc:routes:list, vpc:routes:get, vpc:routes:delete, vpc:peerings:create, vpc:peerings:accept, vpc:peerings:get, vpc:peerings:delete, vpc:routeTables:update, vpc:routeTables:get, vpc:routeTables:list, vpc:vpcs:create, vpc:vpcs:list, vpc:vpcs:get, vpc:vpcs:delete, vpc:subnets:create, vpc:subnets:get, vpc:subnets:delete, vpcep:endpoints:list, vpcep:endpoints:create, vpcep:endpoints:delete, vpcep:endpoints:get, vpc:ports:create, vpc:ports:get, vpc:ports:update, vpc:ports:delete, vpc:networks:create, vpc:networks:get, vpc:networks:update, vpc:networks:delete | Create and delete ModelArts networks, and interconnect VPCs. Dependent permissions must be configured in the IAM project view. |
+| Network management | SFS Turbo | sfsturbo:shares:addShareNic, sfsturbo:shares:deleteShareNic, sfsturbo:shares:showShareNic, sfsturbo:shares:listShareNics | Interconnect your network with SFS Turbo. Dependent permissions must be configured in the IAM project view. |
+| Edge resource pool | IEF | ief:node:list, ief:group:get, ief:application:list, ief:application:get, ief:node:listNodeCert, ief:node:get, ief:IEFInstance:get, ief:deployment:list, ief:group:listGroupInstanceState, ief:IEFInstance:list, ief:deployment:get, ief:group:list | Add, delete, modify, and search for edge pools. |
+
+To simplify operations when you use ModelArts to run jobs, certain operations are automatically performed on the ModelArts backend, for example, downloading the datasets in an OBS bucket to a workspace before a training job starts, and dumping training job logs to an OBS bucket.
+ModelArts does not save your token authentication credentials. Before ModelArts can operate on your resources (such as OBS buckets) in a backend asynchronous job, you must explicitly authorize ModelArts through an IAM agency. ModelArts then uses the agency to obtain a temporary authentication credential for operating on your resources. For details, see Adding Authorization.
+As shown in Figure 1, after authorization is configured on ModelArts, ModelArts uses the temporary credential to access and operate your resources, relieving you from some complex and time-consuming operations. The agency credential is also synchronized to your jobs (including notebook instances and training jobs), so you can use it to access your resources from within those jobs.
+You can use either of the following methods to authorize ModelArts using an agency:
+One-click authorization
+ModelArts provides one-click automatic authorization. You can quickly configure agency authorization on the Global Configuration page of ModelArts. Then, ModelArts will automatically create an agency for you and configure it in ModelArts.
+In this mode, the authorization scope is specified based on the preset system policies of dependent services to ensure sufficient permissions for using services. The created agency has almost all permissions of dependent services. If you want to precisely control the scope of permissions granted to an agency, use the second method.
+Custom authorization
+The administrator creates different agency authorization policies for different users in IAM, and configures the created agency for ModelArts users. When creating an agency for an IAM user, the administrator specifies the minimum permissions for the agency based on the user's permissions to control the resources that the user can access when they use ModelArts. For details, see Assigning Basic Permissions for Using ModelArts.
+Risks of Unauthorized Operations
+Agency authorization is configured for each user independently. Theoretically, the authorization scope of a user's agency can exceed the scope of the authorization policies configured for the user's group, and any improper configuration will result in unauthorized operations.
+To prevent unauthorized operations, only a tenant administrator is allowed to configure agencies for users in the ModelArts global configuration, ensuring the security of agency authorization.
+Minimal Agency Authorization
+When configuring agency authorization, an administrator must strictly control the authorization scope.
+ModelArts asynchronously and automatically performs operations such as job preparation and clearing. The agency authorization required for these operations falls within the basic authorization scope. If you use only some ModelArts functions, the administrator can remove the unused basic permissions from the agency authorization configuration. Conversely, if a job needs resource permissions beyond the basic authorization scope, the administrator can add the new permissions to the agency authorization configuration. In short, the agency authorization scope must be minimized and customized based on service requirements.
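+For example, if your jobs only read data from OBS and write logs and models back to OBS, the agency could be narrowed to an OBS-only policy such as the following sketch, which matches the basic training-job scope listed below:
+{
+    "Version": "1.1",
+    "Statement": [
+        {
+            "Effect": "Allow",
+            "Action": [
+                "obs:bucket:ListBucket",
+                "obs:object:GetObject",
+                "obs:object:PutObject"
+            ]
+        }
+    ]
+}
+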
+Basic Agency Authorization Scope
+To customize the permissions for an agency, select permissions based on your service requirements.
+| Application Scenario | Dependent Service | Agency Authorization | Description | Configuration Suggestion |
+|---|---|---|---|---|
+| JupyterLab | OBS | obs:object:DeleteObject, obs:object:GetObject, obs:object:GetObjectVersion, obs:bucket:CreateBucket, obs:bucket:ListBucket, obs:bucket:ListAllMyBuckets, obs:object:PutObject, obs:bucket:GetBucketAcl, obs:bucket:PutBucketAcl, obs:bucket:PutBucketCORS | Use OBS to upload and download data in JupyterLab through ModelArts notebook. | Recommended |
+| Development environment monitoring | AOM | aom:alarm:put | Call the AOM API to obtain monitoring data and events of notebook instances and display them in ModelArts notebook. | Recommended |
+
+| Application Scenario | Dependent Service | Agency Authorization | Description |
+|---|---|---|---|
+| Training jobs | OBS | obs:bucket:ListBucket, obs:object:GetObject, obs:object:PutObject | Download data, models, and code before starting a training job. Upload logs and models when a training job is running. |
+
+| Application Scenario | Dependent Service | Agency Authorization | Description |
+|---|---|---|---|
+| Real-time services | LTS | lts:groups:create, lts:groups:list, lts:topics:create, lts:topics:delete, lts:topics:list | Configure LTS for reporting logs of real-time services. |
+| Batch services | OBS | obs:bucket:ListBucket, obs:object:GetObject, obs:object:PutObject | Run a batch service. |
+| Edge services | IEF | ief:deployment:list, ief:deployment:create, ief:deployment:update, ief:deployment:delete, ief:node:createNodeCert, ief:iefInstance:list, ief:node:list | Deploy an edge service using IEF. |
+
+| Application Scenario | Dependent Service | Agency Authorization | Description |
+|---|---|---|---|
+| Dataset and data labeling | OBS | obs:object:GetObject, obs:object:PutObject, obs:object:DeleteObject, obs:object:PutObjectAcl, obs:bucket:ListBucket, obs:bucket:HeadBucket, obs:bucket:GetBucketAcl, obs:bucket:PutBucketAcl, obs:bucket:GetBucketPolicy, obs:bucket:PutBucketPolicy, obs:bucket:DeleteBucketPolicy, obs:bucket:PutBucketCORS, obs:bucket:GetBucketCORS | Manage datasets in an OBS bucket. |
+| Labeling data | ModelArts inference | modelarts:service:get, modelarts:service:create, modelarts:service:update | Perform auto labeling based on ModelArts inference. |
+
+| Application Scenario | Dependent Service | Agency Authorization | Description |
+|---|---|---|---|
+| Network management (new version) | VPC | vpc:routes:create, vpc:routes:list, vpc:routes:get, vpc:routes:delete, vpc:peerings:create, vpc:peerings:accept, vpc:peerings:get, vpc:peerings:delete, vpc:routeTables:update, vpc:routeTables:get, vpc:routeTables:list, vpc:vpcs:create, vpc:vpcs:list, vpc:vpcs:get, vpc:vpcs:delete, vpc:subnets:create, vpc:subnets:get, vpc:subnets:delete, vpcep:endpoints:list, vpcep:endpoints:create, vpcep:endpoints:delete, vpcep:endpoints:get, vpc:ports:create, vpc:ports:get, vpc:ports:update, vpc:ports:delete, vpc:networks:create, vpc:networks:get, vpc:networks:update, vpc:networks:delete | Create and delete ModelArts networks, and interconnect VPCs. Dependent permissions must be configured in the IAM project view. |
+| Network management (new version) | SFS Turbo | sfsturbo:shares:addShareNic, sfsturbo:shares:deleteShareNic, sfsturbo:shares:showShareNic, sfsturbo:shares:listShareNics | Interconnect your network with SFS Turbo. Dependent permissions must be configured in the IAM project view. |
+| Managing resource pools | BSS | bss:coupon:view, bss:order:view, bss:balance:view, bss:discount:view, bss:renewal:view, bss:bill:view, bss:contract:update, bss:order:pay, bss:unsubscribe:update, bss:renewal:update, bss:order:update | Create, renew, and unsubscribe from a resource pool. Dependent permissions must be configured in the IAM project view. |
+| Managing resource pools | ECS | ecs:availabilityZones:list | Show AZs. Dependent permissions must be configured in the IAM project view. |
+
ModelArts allows you to create multiple workspaces to develop algorithms and manage and deploy models for different service objectives. In this way, the development outputs of different applications are allocated to different workspaces for simplified management.
+Workspace supports multiple types of access control.
+A default workspace is allocated to each IAM project of each account. The access control of the default workspace is PUBLIC.
+Workspace access control restricts workspace access to specified users.
+As an enterprise user, you can submit a request to your technical support to enable the workspace function.
+Certain ModelArts functions require access to Object Storage Service (OBS), Software Repository for Container (SWR), and Intelligent EdgeFabric (IEF). Before using ModelArts, your account must be authorized to access these services. Otherwise, these functions will be unavailable.
+| Parameter | Description |
+|---|---|
+| Authorized User | Options: IAM user, Federated user, Agency, and All users. |
+| Authorized To | This parameter is not displayed when Authorized User is set to All users. |
+| Agency | - |
+| Add agency > Agency Name | The system automatically generates an agency name, which you can change. |
+| Add agency > Permissions > Common User | Common User provides the permissions to use all basic ModelArts functions, for example, accessing data and creating and managing training jobs. Select this option in most cases. Click View permissions to view the common user permissions. |
+| Add agency > Permissions > Custom | If you need fine-grained permissions management, select Custom to flexibly assign permissions to the created agency. You can select permissions from the permission list as required. |
+
You can view the configured authorizations on the Global Configuration page. Click View Permissions in the Authorization Content column to view the permission details.
+Multiple IAM users can be created under a tenant user, and the permissions of the IAM users are managed by group. This section describes how to create a user group and IAM users and add the IAM users to the user group.
+After the user group is created, the system automatically switches to the user group list. Then, you can add existing IAM users to the user group through user group management. If there is no existing IAM user, create users and add them to the user group.
+The system will automatically add the two users to the target group one by one.
+An IAM user can use cloud services such as ModelArts and OBS only after the tenant user assigns permissions to them. This section describes how to assign the permissions for using cloud services to all IAM users in a user group.
+The DEW key management permission is configured in the following regions: CN North-Beijing1, CN North-Beijing4, CN East-Shanghai1, CN East-Shanghai2, CN South-Guangzhou, CN Southwest-Guiyang1, CN-Hong Kong, and AP-Singapore. In other regions, the KMS key management permission is configured instead. This example uses the CN-Hong Kong region, so the DEW key management permission is configured.
+Tenant Administrator has the permission to manage all cloud services, not only the ModelArts service. Exercise caution when assigning the Tenant Administrator permission.
+To view monitoring data only, select CES ReadOnlyAccess.
+To set alarm monitoring on Cloud Eye, you also need to add CES FullAccess and SMN permissions.
+After assigning IAM permissions, configure ModelArts access authorization for IAM users on the ModelArts page so that ModelArts can access dependent services such as OBS, SWR, and IEF.
+In agency-based ModelArts access authorization, only tenant accounts can configure authorization for their IAM users. Therefore, in this example, the administrator must configure access authorization for all the IAM users.
+It takes 15 to 30 minutes for the permissions configured in step 4 to take effect. Therefore, wait for 30 minutes after the configuration and then verify it.
+Change the password as prompted upon the first login.
+Alternatively, you can try other functions, such as Training Management > Training Jobs. If the operation is successful, you can use ModelArts properly.
+In small- and medium-sized teams, administrators need to globally control ModelArts resources, and developers only need to focus on their own instances. By default, a developer account does not have the te_admin permission. The tenant account must configure the required permissions. This section uses notebook as an example to describe how to assign different permissions to administrators and developers through custom policies.
+To develop a project using notebook, administrators need full control permissions for using ModelArts dedicated resource pools, and access and operation permissions on all notebook instances.
+To use development environments, developers only need operation permissions for using their own instances and dependent services. They do not need to perform operations on ModelArts dedicated resource pools or view notebook instances of other users.
+Assign full control permissions to administrators for using ModelArts dedicated resource pools and all notebook instances. The procedure is as follows:
+{ + "Version": "1.1", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "iam:users:listUsers", + "iam:projects:listProjects", + "obs:object:PutObject", + "obs:object:GetObject", + "obs:object:GetObjectVersion", + "obs:bucket:HeadBucket", + "obs:object:DeleteObject", + "obs:bucket:CreateBucket", + "obs:bucket:ListBucket" + ] + } + ] +}+
+{
+    "Version": "1.1",
+    "Statement": [
+        {
+            "Effect": "Allow",
+            "Action": [
+                "ecs:serverKeypairs:list",
+                "ecs:serverKeypairs:get",
+                "ecs:serverKeypairs:delete",
+                "ecs:serverKeypairs:create",
+                "swr:repository:getNamespace",
+                "swr:repository:listNamespaces",
+                "swr:repository:deleteTag",
+                "swr:repository:getRepository",
+                "swr:repository:listTags",
+                "swr:instance:createTempCredential",
+                "mrs:cluster:get",
+                "modelarts:*:*"
+            ]
+        }
+    ]
+}
+
Change the password as prompted upon the first login.
+Use IAM for fine-grained control of developer permissions. The procedure is as follows:
+In the navigation pane of the IAM console, choose Permissions > Policies/Roles. Click Create Custom Policy in the upper right corner. On the displayed page, enter Policy3_DenyOperation for Policy Name, select JSON for Policy View, configure the policy content, and click OK.
+{ + "Version": "1.1", + "Statement": [ + { + "Effect": "deny", + "Action": [ + "modelarts:pool:create", + "modelarts:pool:update", + "modelarts:pool:delete", + "modelarts:notebook:listAllNotebooks" + ] + + } + ] +}+
On the Add Authorization page, set Authorized User to IAM user, select a developer account for Authorized To, and select the existing agency ma_agency_develop_user created before.
+Change the password as prompted upon the first login.
+Any IAM user granted with the listAllNotebooks and listUsers permissions can click View all on the notebook page to view the instances of all users in the current IAM project.
+Users granted with these permissions can also access OBS and SWR of all users in the current IAM project.
+Policy 2: Create a policy that allows users to view all users of an IAM project.
+After the configuration, all users in the user group have the permission to view all notebook instances created by users in the user group.
+If no user group is available, create one, add users to it through user group management, and configure authorization for the user group. If the target user is not in a user group, add the user to a user group through user group management.
+If an IAM user wants to access another IAM user's notebook instance through remote SSH, they need to update the SSH key pair to their own. Otherwise, error ModelArts.6786 will be reported. For details about how to update a key pair, see Modifying the SSH Configuration for a Notebook Instance.
+ModelArts.6789: Failed to find SSH key pair KeyPair-xxx on the ECS key pair page. Update the key pair and try again later.
+You can use Cloud Shell, provided by the ModelArts console, to log in to a running training container.
+Cloud Shell login is available for training jobs running in a dedicated resource pool.
+After the configuration, all users in the user group have the permission to use Cloud Shell to log in to a running training container.
+If no user group is available, create one, add users to it through user group management, and configure authorization for the user group. If the target user is not in a user group, add the user to a user group through user group management.
+Verify that the login is successful, as shown in the following figure.
+This section describes how to control the ModelArts permissions of a user so that the user is not allowed to use a public resource pool to create training jobs, create notebook instances, or deploy inference services.
+Through permission control, ModelArts dedicated resource pool users can be prohibited from using a public resource pool to create training jobs, create notebook instances, or deploy inference services.
+To control the permissions, configure the following permission policy items:
+{ + "Version": "1.1", + "Statement": [ + { + "Effect": "Deny", + "Action": [ + "modelarts:trainJob:create", + "modelarts:notebook:create", + "modelarts:service:create" + ], + "Condition": { + "StringEquals": { + "modelarts:poolType": [ + "public" + ] + } + } + } + ] +}+
+After the configuration, users in this user group cannot use a public resource pool to create training jobs, create notebook instances, or deploy inference services.
+If no user group is available, create one, add users to it through user group management, and configure authorization for the user group. If the target user is not in a user group, add the user to a user group through user group management.
+In the navigation pane, choose Agencies. Locate the agency used by the user group on ModelArts and click Modify in the Operation column. On the Permissions tab page, click Authorize, select the created custom policy, and click Next. Select the scope for authorization and click OK.
+Log in to the ModelArts console as an IAM user, choose Training Management > Training Jobs, and click Create Training Job. On the page for creating a training job, only a dedicated resource pool can be selected for Resource Pool.
+Log in to the ModelArts console as an IAM user, choose DevEnviron > Notebook, and click Create. On the page for creating a notebook instance, only a dedicated resource pool can be selected for Resource Pool.
+Log in to the ModelArts console as an IAM user, choose Service Deployment > Real-Time Services, and click Deploy. On the page for service deployment, only a dedicated resource pool can be selected for Resource Pool.
+