forked from docs/doc-exports
Reviewed-by: Pruthi, Vineet <vineet.pruthi@t-systems.com> Co-authored-by: Hasko, Vladimir <vladimir.hasko@t-systems.com> Co-committed-by: Hasko, Vladimir <vladimir.hasko@t-systems.com>
293 lines
32 KiB
HTML
293 lines
32 KiB
HTML
<a name="dli_09_0205"></a><a name="dli_09_0205"></a>
|
|
|
|
<h1 class="topictitle1">Using Spark Jar Jobs to Read and Query OBS Data</h1>
|
|
<div id="body0000001207799294"><div class="section" id="dli_09_0205__section8832837123"><h4 class="sectiontitle">Scenario</h4><p id="dli_09_0205__p1969478201214">DLI is fully compatible with open-source <a href="https://spark.apache.org/" target="_blank" rel="noopener noreferrer">Apache Spark</a> and allows you to import, query, analyze, and process job data by programming. This section describes how to write a Spark program to read and query OBS data, compile and package the code, and submit it to a Spark Jar job.</p>
|
|
</div>
|
|
<div class="section" id="dli_09_0205__section199842111628"><h4 class="sectiontitle">Environment Preparations</h4><p id="dli_09_0205__p8202163717211">Before you start, set up the development environment.</p>
|
|
|
|
<div class="tablenoborder"><table cellpadding="4" cellspacing="0" summary="" id="dli_09_0205__table15851625229" frame="border" border="1" rules="all"><caption><b>Table 1 </b>Spark Jar job development environment</caption><thead align="left"><tr id="dli_09_0205__row11859253210"><th align="left" class="cellrowborder" valign="top" width="27.63%" id="mcps1.3.2.3.2.3.1.1"><p id="dli_09_0205__p9852251528">Item</p>
|
|
</th>
|
|
<th align="left" class="cellrowborder" valign="top" width="72.37%" id="mcps1.3.2.3.2.3.1.2"><p id="dli_09_0205__p8851725529">Description</p>
|
|
</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody><tr id="dli_09_0205__row78519251429"><td class="cellrowborder" valign="top" width="27.63%" headers="mcps1.3.2.3.2.3.1.1 "><p id="dli_09_0205__p108522517216">OS</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="72.37%" headers="mcps1.3.2.3.2.3.1.2 "><p id="dli_09_0205__p20851825626">Windows 7 or later</p>
|
|
</td>
|
|
</tr>
|
|
<tr id="dli_09_0205__row18851325325"><td class="cellrowborder" valign="top" width="27.63%" headers="mcps1.3.2.3.2.3.1.1 "><p id="dli_09_0205__p1885825624">JDK</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="72.37%" headers="mcps1.3.2.3.2.3.1.2 "><p id="dli_09_0205__p8859251424">JDK 1.8.</p>
|
|
</td>
|
|
</tr>
|
|
<tr id="dli_09_0205__row24601502619"><td class="cellrowborder" valign="top" width="27.63%" headers="mcps1.3.2.3.2.3.1.1 "><p id="dli_09_0205__p16497910469">IntelliJ IDEA</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="72.37%" headers="mcps1.3.2.3.2.3.1.2 "><p id="dli_09_0205__p84601601562">This tool is used for application development. Use version 2019.1 or another compatible version.</p>
|
|
</td>
|
|
</tr>
|
|
<tr id="dli_09_0205__row53111251665"><td class="cellrowborder" valign="top" width="27.63%" headers="mcps1.3.2.3.2.3.1.1 "><p id="dli_09_0205__p831117511968">Maven</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="72.37%" headers="mcps1.3.2.3.2.3.1.2 "><p id="dli_09_0205__p23118511064">Basic configurations of the development environment. Maven is used for project management throughout the lifecycle of software development.</p>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="dli_09_0205__section54791739112210"><h4 class="sectiontitle">Development Process</h4><div class="p" id="dli_09_0205__p892144112221">The following figure shows the process of developing a Spark Jar job.<div class="fignone" id="dli_09_0205__fig676731405019"><span class="figcap"><b>Figure 1 </b>Development process</span><br><span><img id="dli_09_0205__image13767161411506" src="en-us_image_0000001251908699.png"></span></div>
|
|
|
|
<div class="tablenoborder"><table cellpadding="4" cellspacing="0" summary="" id="dli_09_0205__table1421119391677" frame="border" border="1" rules="all"><caption><b>Table 2 </b>Process description</caption><thead align="left"><tr id="dli_09_0205__row11211153918715"><th align="left" class="cellrowborder" valign="top" width="6.830601092896176%" id="mcps1.3.3.2.2.2.5.1.1"><p id="dli_09_0205__p11573151398">No.</p>
|
|
</th>
|
|
<th align="left" class="cellrowborder" valign="top" width="23.34113973458236%" id="mcps1.3.3.2.2.2.5.1.2"><p id="dli_09_0205__p8211239475">Phase</p>
|
|
</th>
|
|
<th align="left" class="cellrowborder" valign="top" width="10.724043715846996%" id="mcps1.3.3.2.2.2.5.1.3"><p id="dli_09_0205__p167011419911">Software Portal</p>
|
|
</th>
|
|
<th align="left" class="cellrowborder" valign="top" width="59.10421545667448%" id="mcps1.3.3.2.2.2.5.1.4"><p id="dli_09_0205__p1921103911712">Description</p>
|
|
</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody><tr id="dli_09_0205__row1722811511589"><td class="cellrowborder" valign="top" width="6.830601092896176%" headers="mcps1.3.3.2.2.2.5.1.1 "><p id="dli_09_0205__p32288513813">1</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="23.34113973458236%" headers="mcps1.3.3.2.2.2.5.1.2 "><p id="dli_09_0205__p622813512814">Create a queue for general use.</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="10.724043715846996%" headers="mcps1.3.3.2.2.2.5.1.3 "><p id="dli_09_0205__p172281051989">DLI console</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="59.10421545667448%" headers="mcps1.3.3.2.2.2.5.1.4 "><p id="dli_09_0205__p32282511387">The DLI queue is created for running your job.</p>
|
|
</td>
|
|
</tr>
|
|
<tr id="dli_09_0205__row13783723184813"><td class="cellrowborder" valign="top" width="6.830601092896176%" headers="mcps1.3.3.2.2.2.5.1.1 "><p id="dli_09_0205__p47841723154817">2</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="23.34113973458236%" headers="mcps1.3.3.2.2.2.5.1.2 "><p id="dli_09_0205__p3784122334811">Upload data to an OBS bucket.</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="10.724043715846996%" headers="mcps1.3.3.2.2.2.5.1.3 "><p id="dli_09_0205__p137841823104818">OBS console</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="59.10421545667448%" headers="mcps1.3.3.2.2.2.5.1.4 "><p id="dli_09_0205__p291154724818">The test data needs to be uploaded to your OBS bucket.</p>
|
|
</td>
|
|
</tr>
|
|
<tr id="dli_09_0205__row102114391879"><td class="cellrowborder" valign="top" width="6.830601092896176%" headers="mcps1.3.3.2.2.2.5.1.1 "><p id="dli_09_0205__p65761516918">3</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="23.34113973458236%" headers="mcps1.3.3.2.2.2.5.1.2 "><p id="dli_09_0205__p4211133911710">Create a Maven project and configure the POM file.</p>
|
|
</td>
|
|
<td class="cellrowborder" rowspan="3" valign="top" width="10.724043715846996%" headers="mcps1.3.3.2.2.2.5.1.3 "><p id="dli_09_0205__p81691210101">IntelliJ IDEA</p>
|
|
</td>
|
|
<td class="cellrowborder" rowspan="3" valign="top" width="59.10421545667448%" headers="mcps1.3.3.2.2.2.5.1.4 "><p id="dli_09_0205__p321103914719"></p>
|
|
<p id="dli_09_0205__p152111391671">Write your code by referring to the sample code for reading data from OBS.</p>
|
|
<p id="dli_09_0205__p694692512124"></p>
|
|
</td>
|
|
</tr>
|
|
<tr id="dli_09_0205__row1211123914712"><td class="cellrowborder" valign="top" headers="mcps1.3.3.2.2.2.5.1.1 "><p id="dli_09_0205__p55731512916">4</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" headers="mcps1.3.3.2.2.2.5.1.2 "><p id="dli_09_0205__p16211739576">Write code.</p>
|
|
</td>
|
|
</tr>
|
|
<tr id="dli_09_0205__row79452250121"><td class="cellrowborder" valign="top" headers="mcps1.3.3.2.2.2.5.1.1 "><p id="dli_09_0205__p79461255124">5</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" headers="mcps1.3.3.2.2.2.5.1.2 "><p id="dli_09_0205__p10946172551215">Debug, compile, and pack the code into a Jar package.</p>
|
|
</td>
|
|
</tr>
|
|
<tr id="dli_09_0205__row86521956191210"><td class="cellrowborder" valign="top" width="6.830601092896176%" headers="mcps1.3.3.2.2.2.5.1.1 "><p id="dli_09_0205__p7652456101218">6</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="23.34113973458236%" headers="mcps1.3.3.2.2.2.5.1.2 "><p id="dli_09_0205__p10652185691214">Upload the Jar package to OBS and DLI.</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="10.724043715846996%" headers="mcps1.3.3.2.2.2.5.1.3 "><p id="dli_09_0205__p565211562128">OBS console</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="59.10421545667448%" headers="mcps1.3.3.2.2.2.5.1.4 "><p id="dli_09_0205__p1165216565129">Upload the generated Spark JAR package to an OBS directory and, for Spark versions earlier than 3.3, also to DLI package management.</p>
|
|
</td>
|
|
</tr>
|
|
<tr id="dli_09_0205__row18133049101414"><td class="cellrowborder" valign="top" width="6.830601092896176%" headers="mcps1.3.3.2.2.2.5.1.1 "><p id="dli_09_0205__p1513384931416">7</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="23.34113973458236%" headers="mcps1.3.3.2.2.2.5.1.2 "><p id="dli_09_0205__p17133194971413">Create a Spark Jar job.</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="10.724043715846996%" headers="mcps1.3.3.2.2.2.5.1.3 "><p id="dli_09_0205__p11133449181419">DLI console</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="59.10421545667448%" headers="mcps1.3.3.2.2.2.5.1.4 "><p id="dli_09_0205__p107651124156">The Spark Jar job is created and submitted on the DLI console.</p>
|
|
</td>
|
|
</tr>
|
|
<tr id="dli_09_0205__row9403719162"><td class="cellrowborder" valign="top" width="6.830601092896176%" headers="mcps1.3.3.2.2.2.5.1.1 "><p id="dli_09_0205__p134035191618">8</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="23.34113973458236%" headers="mcps1.3.3.2.2.2.5.1.2 "><p id="dli_09_0205__p114038181618">Check the execution result of the job.</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="10.724043715846996%" headers="mcps1.3.3.2.2.2.5.1.3 "><p id="dli_09_0205__p184101541614">DLI console</p>
|
|
</td>
|
|
<td class="cellrowborder" valign="top" width="59.10421545667448%" headers="mcps1.3.3.2.2.2.5.1.4 "><p id="dli_09_0205__p17403415169">You can view the job running status and run logs.</p>
|
|
</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="dli_09_0205__section3345113541312"><a name="dli_09_0205__section3345113541312"></a><a name="section3345113541312"></a><h4 class="sectiontitle">Step 1: Create a Queue for General Purpose</h4><div class="p" id="dli_09_0205__p628631819213">If you submit a Spark job for the first time, you need to create a queue first. For example, create a queue, name it <strong id="dli_09_0205__b1118111954613">sparktest</strong>, and set <strong id="dli_09_0205__b151891914618">Queue Usage</strong> to <strong id="dli_09_0205__b981911164473">For general purpose</strong>.<ol id="dli_09_0205__ol20286618192116"><li id="dli_09_0205__li1328719182211">In the navigation pane of the DLI management console, choose <span class="uicontrol" id="dli_09_0205__uicontrol1335111394482"><b>Queue Management</b></span>.</li><li id="dli_09_0205__li728716186219">In the upper right corner of the <span class="wintitle" id="dli_09_0205__wintitle315024624813"><b>Queue Management</b></span> page, click <b>Create Queue</b>.</li><li id="dli_09_0205__li16287181811214">Name the queue <strong id="dli_09_0205__b9798102695019">sparktest</strong> and set <b>Queue Usage</b> to <b>For general purpose</b>. For details about how to create a queue, see Creating a Queue.</li><li id="dli_09_0205__li14287318162119">Click <span class="uicontrol" id="dli_09_0205__uicontrol9930051125114"><b>Create Now</b></span> to create a queue.</li></ol>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="dli_09_0205__section66881652423"><h4 class="sectiontitle">Step 2: Upload Data to OBS</h4><ol id="dli_09_0205__ol2028912141832"><li id="dli_09_0205__li181196724910">Create the <strong id="dli_09_0205__b109591750185214">people.json</strong> file containing the following content:<pre class="screen" id="dli_09_0205__screen12653146115017">{"name":"Michael"}
|
|
{"name":"Andy", "age":30}
|
|
{"name":"Justin", "age":19}</pre>
|
|
</li><li id="dli_09_0205__li12184172017233">Log in to the OBS Console. In the Bucket page, click the name of the created OBS bucket. In this example, the bucket name is <strong id="dli_09_0205__b4183341175314">dli-test-obs01</strong>. The overview page is displayed.</li><li id="dli_09_0205__li3616145163117">In the navigation pane on the left, choose <strong id="dli_09_0205__b36558422553">Objects</strong>. Click <strong id="dli_09_0205__b10924235619">Upload Object</strong> to upload the file to the root directory of the OBS bucket.</li><li id="dli_09_0205__li204841438205413">In the root directory of the OBS bucket, click <strong id="dli_09_0205__b1942243216569">Create Folder</strong> to create a folder and name it <strong id="dli_09_0205__b18243125018566">result</strong>.</li><li id="dli_09_0205__li4289161418312">Click the <strong id="dli_09_0205__b14121303572">result</strong> folder, click <strong id="dli_09_0205__b1027334115718">Create Folder</strong> on the displayed page to create a folder and name it <strong id="dli_09_0205__b02593203584">parquet</strong>.</li></ol>
|
|
</div>
|
|
<div class="section" id="dli_09_0205__section155442205718"><h4 class="sectiontitle">Step 3: Create a Maven Project and Configure the pom Dependency</h4><div class="p" id="dli_09_0205__p1323313312581">This step uses IntelliJ IDEA 2020.2 as an example.<ol id="dli_09_0205__ol2397109104513"><li id="dli_09_0205__li1039714915454">Start IntelliJ IDEA and choose <strong id="dli_09_0205__b1697841012114">File</strong> > <strong id="dli_09_0205__b1797913101419">New</strong> > <strong id="dli_09_0205__b109792101412">Project</strong>.<div class="fignone" id="dli_09_0205__fig975857114919"><span class="figcap"><b>Figure 2 </b>Creating a project</span><br><span><img id="dli_09_0205__image9757576496" src="en-us_image_0000001252187705.png"></span></div>
|
|
</li><li id="dli_09_0205__li13857332152816">Choose <strong id="dli_09_0205__b97831331213">Maven</strong>, set <strong id="dli_09_0205__b771513417216">Project SDK</strong> to <strong id="dli_09_0205__b157312371226">1.8</strong>, and click <strong id="dli_09_0205__b138956388218">Next</strong>.<div class="fignone" id="dli_09_0205__fig19233645184918"><span class="figcap"><b>Figure 3 </b>Creating a project</span><br><span><img id="dli_09_0205__image8234144510494" src="en-us_image_0000001637557382.png"></span></div>
|
|
</li><li id="dli_09_0205__li1974116643214">Set the project name, configure the storage path, and click <strong id="dli_09_0205__b1243316211836">Finish</strong>.<div class="fignone" id="dli_09_0205__fig138297307504"><span class="figcap"><b>Figure 4 </b>Creating a project</span><br><span><img id="dli_09_0205__image582993017505" src="en-us_image_0000001637398494.png"></span></div>
|
|
<p id="dli_09_0205__p1740793545412">In this example, the Maven project name is <strong id="dli_09_0205__b1034319411030">SparkJarObs</strong>, and the project storage path is <strong id="dli_09_0205__b2010418451939">D:\DLITest\SparkJarObs</strong>.</p>
|
|
</li><li id="dli_09_0205__li56201025357">Add the following content to the <strong id="dli_09_0205__b16128113719719">pom.xml</strong> file.<pre class="screen" id="dli_09_0205__screen175551817363"><dependencies>
|
|
<dependency>
|
|
<groupId>org.apache.spark</groupId>
|
|
<artifactId>spark-sql_2.11</artifactId>
|
|
<version>2.3.2</version>
|
|
</dependency>
|
|
</dependencies></pre>
|
|
<div class="p" id="dli_09_0205__p1141017953718"><div class="fignone" id="dli_09_0205__fig139555551639"><span class="figcap"><b>Figure 5 </b>Modifying the <strong id="dli_09_0205__b157544614720">pom.xml</strong> file</span><br><span><img id="dli_09_0205__image89552551238" src="en-us_image_0000001252053711.png"></span></div>
|
|
</div>
|
|
</li><li id="dli_09_0205__li532734873814">Choose <strong id="dli_09_0205__b1474859695">src</strong> > <strong id="dli_09_0205__b174179116109">main</strong> and right-click the <strong id="dli_09_0205__b12902111418109">java</strong> folder. Choose <strong id="dli_09_0205__b124474351018">New</strong> > <strong id="dli_09_0205__b1185416448106">Package</strong> to create a package and a class file.<div class="fignone" id="dli_09_0205__fig17237937195216"><span class="figcap"><b>Figure 6 </b>Creating a package</span><br><span><img id="dli_09_0205__image142371437155211" src="en-us_image_0000001637399398.png"></span></div>
|
|
<p id="dli_09_0205__p242315145436">Set the package name as you need. Then, press <strong id="dli_09_0205__b1297721822814">Enter</strong>.</p>
|
|
<p id="dli_09_0205__p14790156134412">Create a Java Class file in the package path. In this example, the Java Class file is <strong id="dli_09_0205__b1120805131516">SparkDemoObs</strong>.</p>
|
|
</li></ol>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="dli_09_0205__section584152211144"><h4 class="sectiontitle">Step 4: Write Code</h4><p id="dli_09_0205__p19566440161818">Code the <strong id="dli_09_0205__b1721825815223">SparkDemoObs</strong> program to read the <strong id="dli_09_0205__b1312815218234">people.json</strong> file from the OBS bucket, create the temporary table <strong id="dli_09_0205__b83847415245">people</strong>, and query data.</p>
|
|
<p id="dli_09_0205__p858265318217">For the sample code, see <a href="#dli_09_0205__section536212344115">Sample Code</a>.</p>
|
|
<ol id="dli_09_0205__ol153077251228"><li id="dli_09_0205__li73076259225">Import dependencies.<pre class="screen" id="dli_09_0205__screen1063393611226">import org.apache.spark.sql.Dataset;
|
|
import org.apache.spark.sql.Row;
|
|
import org.apache.spark.sql.SaveMode;
|
|
import org.apache.spark.sql.SparkSession;
|
|
|
|
import static org.apache.spark.sql.functions.col;</pre>
|
|
</li><li id="dli_09_0205__li138215458228">Create Spark session <strong id="dli_09_0205__b198511337">spark</strong> using the AK and SK of the current account.<pre class="screen" id="dli_09_0205__screen2979429112917">SparkSession spark = SparkSession
|
|
.builder()
|
|
.config("spark.hadoop.fs.obs.access.key", "<em id="dli_09_0205__i10979152992913">xxx</em>")
|
|
.config("spark.hadoop.fs.obs.secret.key", "<em id="dli_09_0205__i8979112982914">yyy</em>")
|
|
.appName("java_spark_demo")
|
|
.getOrCreate();</pre>
|
|
<ul id="dli_09_0205__ul16979132982918"><li id="dli_09_0205__li11979172914298">Replace <em id="dli_09_0205__i2558146318">xxx</em> of "spark.hadoop.fs.obs.access.key" with the AK of the account.</li><li id="dli_09_0205__li1297932952917">Replace <em id="dli_09_0205__i193111472318">yyy</em> of "spark.hadoop.fs.obs.secret.key" with the SK of the account.</li></ul>
|
|
</li><li id="dli_09_0205__li166965435316">Read the <strong id="dli_09_0205__b111121433202817">people.json</strong> file from the OBS bucket.<div class="p" id="dli_09_0205__p752417212617"><strong id="dli_09_0205__b15126745142820">dli-test-obs01</strong> is the name of the sample OBS bucket. Replace it with the actual OBS bucket name.<pre class="screen" id="dli_09_0205__screen1084812201763">Dataset<Row> df = spark.read().json("obs://dli-test-obs01/people.json");
|
|
df.printSchema();</pre>
|
|
</div>
|
|
</li><li id="dli_09_0205__li1574253853513">Create temporary table <strong id="dli_09_0205__b1817710182911">people</strong> to read data.<pre class="screen" id="dli_09_0205__screen964994313378">df.createOrReplaceTempView("people");</pre>
|
|
</li><li id="dli_09_0205__li16364658123618">Query data in the <strong id="dli_09_0205__b5179046182919">people</strong> table.<pre class="screen" id="dli_09_0205__screen89381108393">Dataset<Row> sqlDF = spark.sql("SELECT * FROM people");
|
|
sqlDF.show();</pre>
|
|
</li><li id="dli_09_0205__li1093890183918">Export <strong id="dli_09_0205__b713795311303">people</strong> table data in Parquet format to the <strong id="dli_09_0205__b74868203116">result/parquet </strong>directory of the OBS bucket.<pre class="screen" id="dli_09_0205__screen951176194318">sqlDF.write().mode(SaveMode.Overwrite).parquet("obs://dli-test-obs01/result/parquet");
|
|
spark.read().parquet("obs://dli-test-obs01/result/parquet").show();</pre>
|
|
</li><li id="dli_09_0205__li15115664311">Stop the <strong id="dli_09_0205__b9115626103018">spark</strong> session.<pre class="screen" id="dli_09_0205__screen817220167439">spark.stop();</pre>
|
|
</li></ol>
|
|
</div>
|
|
<div class="section" id="dli_09_0205__section1618514424450"><h4 class="sectiontitle">Step 5: Debug, Compile, and Pack the Code into a JAR Package</h4><ol id="dli_09_0205__ol3387191918248"><li id="dli_09_0205__li469121413243">Double-click <strong id="dli_09_0205__b1229914915362">Maven</strong> in the tool bar on the right, and double-click <strong id="dli_09_0205__b14681933612">clean</strong> and <strong id="dli_09_0205__b399012183610">compile</strong> to compile the code.<p id="dli_09_0205__p1190194252510">After the compilation is successful, double-click <strong id="dli_09_0205__b252853043712">package</strong>.</p>
|
|
<p id="dli_09_0205__p7583182817281">The generated JAR package is stored in the <strong id="dli_09_0205__b162913190418">target</strong> directory. In this example, <strong id="dli_09_0205__b16559134544110">SparkJarObs-1.0-SNAPSHOT.jar</strong> is stored in <strong id="dli_09_0205__b144781951194110">D:\DLITest\SparkJarObs\target</strong>.</p>
|
|
</li></ol>
|
|
</div>
|
|
<div class="section" id="dli_09_0205__section633044910536"><a name="dli_09_0205__section633044910536"></a><a name="section633044910536"></a><h4 class="sectiontitle">Step 6: Upload the JAR Package to OBS and DLI</h4><ul id="dli_09_0205__ul111975114175"><li id="dli_09_0205__li161911514174"><strong id="dli_09_0205__b18314204261719">Spark 3.3 or later:</strong><p id="dli_09_0205__p1531718318105">You can only set the <strong id="dli_09_0205__b0693346165520">Application</strong> parameter when creating a Spark job and select the required JAR file from OBS.</p>
|
|
<ol id="dli_09_0205__ol1588813105188"><li id="dli_09_0205__li588891021816"><a name="dli_09_0205__li588891021816"></a><a name="li588891021816"></a>Log in to the OBS console and upload the <strong id="dli_09_0205__b129611665018">SparkJarObs-1.0-SNAPSHOT.jar</strong> file to the OBS path.</li><li id="dli_09_0205__li16347151261812">Log in to the DLI console. In the navigation pane, choose <strong id="dli_09_0205__b996791115012">Job Management</strong> > <strong id="dli_09_0205__b796831117506">Spark Jobs</strong>.</li><li id="dli_09_0205__li82002617194">Locate the row containing a desired job and click <strong id="dli_09_0205__b8957233185018">Edit</strong> in the <strong id="dli_09_0205__b695833315506">Operation</strong> column.</li><li id="dli_09_0205__li19220134211911">Set <strong id="dli_09_0205__b1558376155419">Application</strong> to the OBS path in <a href="#dli_09_0205__li588891021816">1</a>.</li></ol>
|
|
</li><li id="dli_09_0205__li1954555331713"><strong id="dli_09_0205__b7373134831716">Versions earlier than Spark 3.3:</strong><p id="dli_09_0205__p36911513131220">Upload the JAR file to OBS and DLI.</p>
|
|
<ol id="dli_09_0205__ol397711215413"><li id="dli_09_0205__li19776129541">Log in to the OBS console and upload the <strong id="dli_09_0205__b203671733164814">SparkJarObs-1.0-SNAPSHOT.jar</strong> file to the OBS path.</li><li id="dli_09_0205__li125643214189">Upload the file to DLI for package management.<ol type="a" id="dli_09_0205__ol756103218180"><li id="dli_09_0205__li1556183241810">Log in to the DLI management console and choose <strong id="dli_09_0205__b022511617490">Data Management</strong> > <strong id="dli_09_0205__b14225156124915">Package Management</strong>.</li><li id="dli_09_0205__li8565328181">On the <strong id="dli_09_0205__b98791344173614">Package Management</strong> page, click <strong id="dli_09_0205__b387964463616">Create</strong> in the upper right corner.</li><li id="dli_09_0205__li8561032181818">In the <strong id="dli_09_0205__b1056714783619">Create Package</strong> dialog, set the following parameters:<ol class="substepthirdol" id="dli_09_0205__ol7567320189"><li id="dli_09_0205__li55633241816"><strong id="dli_09_0205__b86342557361">Type</strong>: Select <strong id="dli_09_0205__b14635115517366">JAR</strong>.</li><li id="dli_09_0205__li185693281815"><strong id="dli_09_0205__b1416216582366">OBS Path</strong>: Specify the OBS path for storing the package.</li><li id="dli_09_0205__li135663221819">Set <strong id="dli_09_0205__b1477921217375">Group</strong> and <strong id="dli_09_0205__b127808123371">Group Name</strong> as required for package identification and management.</li></ol>
|
|
</li><li id="dli_09_0205__li756193221817">Click <strong id="dli_09_0205__b1580019181376">OK</strong>.</li></ol>
|
|
</li></ol>
|
|
</li></ul>
|
|
</div>
|
|
<div class="section" id="dli_09_0205__section1780916256569"><h4 class="sectiontitle">Step 7: Create a Spark Jar Job</h4><ol id="dli_09_0205__ol1811712355154"><li id="dli_09_0205__li1711733561514">Log in to the DLI console. In the navigation pane, choose <strong id="dli_09_0205__b185711126113719">Job Management</strong> > <strong id="dli_09_0205__b05721326143719">Spark Jobs</strong>.</li><li id="dli_09_0205__li138433811617">On the <strong id="dli_09_0205__b19996123295610">Spark Jobs</strong> page, click <strong id="dli_09_0205__b8396153515620">Create Job</strong>.</li><li id="dli_09_0205__li144811015170">On the displayed page, configure the following parameters:<ul id="dli_09_0205__ul1657584082316"><li id="dli_09_0205__li1157515409235"><strong id="dli_09_0205__b116945325816">Queue</strong>: Select the created queue. For example, select the queue <strong id="dli_09_0205__b1458219366582">sparktest</strong> created in <a href="#dli_09_0205__section3345113541312">Step 1: Create a Queue for General Purpose</a>.</li><li id="dli_09_0205__li127004201623">Select a supported Spark version from the drop-down list. The latest version is recommended.</li><li id="dli_09_0205__li171210566241"><strong id="dli_09_0205__b46231858175817">Job Name (--name)</strong>: Name of the Spark Jar job. For example, <strong id="dli_09_0205__b93347302593">SparkTestObs</strong>.</li><li id="dli_09_0205__li270281332618"><strong id="dli_09_0205__b1088015386593">Application</strong>: Select the package uploaded in <a href="#dli_09_0205__section633044910536">Step 6: Upload the JAR Package to OBS and DLI</a>. For example, select <strong id="dli_09_0205__b82131554155916">SparkJarObs-1.0-SNAPSHOT.jar</strong>.</li><li id="dli_09_0205__li10823122920272"><strong id="dli_09_0205__b6937101819015">Main Class (--class)</strong>: Fully qualified name of the main class, in the format <em>package name</em>.<em>class name</em>. For the sample code in this section, enter <strong>com.dli.demo.SparkDemoObs</strong>.</li></ul>
|
|
<p id="dli_09_0205__p144945218206">You do not need to set other parameters.</p>
|
|
</li><li id="dli_09_0205__li19508132763014">Click <strong id="dli_09_0205__b67597451924">Execute</strong> to submit the Spark Jar job. On the Job management page, view the running status.</li></ol>
|
|
</div>
|
|
<div class="section" id="dli_09_0205__section9559162614323"><h4 class="sectiontitle">Step 8: View Job Execution Result</h4><ol id="dli_09_0205__ol16745538334"><li id="dli_09_0205__li117420534337">On the Job management page, view the running status. The initial status is <strong id="dli_09_0205__b188439275418">Starting</strong>.</li><li id="dli_09_0205__li194787517369">If the job is successfully executed, the job status is <strong id="dli_09_0205__b1152513116516">Finished</strong>. Click <strong id="dli_09_0205__b1375151611513">More</strong> in the <strong id="dli_09_0205__b1178610364519">Operation</strong> column and select <strong id="dli_09_0205__b1331112918515">Driver Logs</strong> to view the running log.<div class="fignone" id="dli_09_0205__fig145811810174118"><span class="figcap"><b>Figure 7 </b>Driver logs</span><br><span><img id="dli_09_0205__image1558131012417" src="en-us_image_0000001251907299.png"></span></div>
|
|
</li><li id="dli_09_0205__li498916557597">If the job is successfully executed, go to the <strong id="dli_09_0205__b1187216616497">result/parquet</strong> directory in the OBS bucket to view the generated <strong id="dli_09_0205__b16717923104915">parquet</strong> file.</li><li id="dli_09_0205__li649953013416">If the job fails to be executed, choose <strong id="dli_09_0205__b5943123444911">More</strong> > <strong id="dli_09_0205__b58461637194914">Driver Logs</strong> in the <strong id="dli_09_0205__b8203164314491">Operation</strong> column to view the detailed error information.<div class="p" id="dli_09_0205__p151551031049">For example, the following figure shows the error reported when the package path is omitted from the main class name during Spark Jar job creation.<div class="fignone" id="dli_09_0205__fig16753111582217"><span class="figcap"><b>Figure 8 </b>Error information</span><br><span><img id="dli_09_0205__image1275421582211" src="en-us_image_0000001686339805.png"></span></div>
|
|
</div>
|
|
<p id="dli_09_0205__p1974800164515">In the <strong id="dli_09_0205__b1215553614568">Operation</strong> column, click <strong id="dli_09_0205__b194961111135715">Edit</strong>, change the value of <strong id="dli_09_0205__b736443195815">Main Class</strong> to the fully qualified class name (<strong id="dli_09_0205__b1180982915345">com.dli.demo.SparkDemoObs</strong> for the sample code in this section), and click <strong id="dli_09_0205__b147048388341">Execute</strong> to run the job again.</p>
|
|
</li></ol>
|
|
</div>
|
|
<div class="section" id="dli_09_0205__section536212344115"><a name="dli_09_0205__section536212344115"></a><a name="section536212344115"></a><h4 class="sectiontitle">Sample Code</h4><div class="note" id="dli_09_0205__note84662050195919"><img src="public_sys-resources/note_3.0-en-us.png"><span class="notetitle"> </span><div class="notebody"><p id="dli_09_0205__p1140569541">Hard-coded or plaintext <strong id="dli_09_0205__b10985721018">access.key</strong> and <strong id="dli_09_0205__b578516610018">secret.key</strong> pose significant security risks. To ensure security, encrypt your AK and SK, store them in configuration files or environment variables, and decrypt them when needed.</p>
|
|
</div></div>
|
|
<pre class="screen" id="dli_09_0205__screen169072296410">package com.dli.demo;
|
|
|
|
import org.apache.spark.sql.Dataset;
|
|
import org.apache.spark.sql.Row;
|
|
import org.apache.spark.sql.SaveMode;
|
|
import org.apache.spark.sql.SparkSession;
|
|
|
|
import static org.apache.spark.sql.functions.col;
|
|
|
|
public class SparkDemoObs {
|
|
public static void main(String[] args) {
|
|
SparkSession spark = SparkSession
|
|
.builder()
|
|
.config("spark.hadoop.fs.obs.access.key", "<em id="dli_09_0205__i206218197427">xxx</em>")
|
|
.config("spark.hadoop.fs.obs.secret.key", "<em id="dli_09_0205__i155843014427">yyy</em>")
|
|
.appName("java_spark_demo")
|
|
.getOrCreate();
|
|
// The AK and SK can also be set with --conf when submitting the application.
|
|
|
|
// test json data:
|
|
// {"name":"Michael"}
|
|
// {"name":"Andy", "age":30}
|
|
// {"name":"Justin", "age":19}
|
|
Dataset<Row> df = spark.read().json("obs://dli-test-obs01/people.json");
|
|
df.printSchema();
|
|
// root
|
|
// |-- age: long (nullable = true)
|
|
// |-- name: string (nullable = true)
|
|
|
|
// Displays the content of the DataFrame to stdout
|
|
df.show();
|
|
// +----+-------+
|
|
// | age| name|
|
|
// +----+-------+
|
|
// |null|Michael|
|
|
// | 30| Andy|
|
|
// | 19| Justin|
|
|
// +----+-------+
|
|
|
|
// Select only the "name" column
|
|
df.select("name").show();
|
|
// +-------+
|
|
// | name|
|
|
// +-------+
|
|
// |Michael|
|
|
// | Andy|
|
|
// | Justin|
|
|
// +-------+
|
|
|
|
// Select people older than 21
|
|
df.filter(col("age").gt(21)).show();
|
|
// +---+----+
|
|
// |age|name|
|
|
// +---+----+
|
|
// | 30|Andy|
|
|
// +---+----+
|
|
|
|
// Count people by age
|
|
df.groupBy("age").count().show();
|
|
// +----+-----+
|
|
// | age|count|
|
|
// +----+-----+
|
|
// | 19| 1|
|
|
// |null| 1|
|
|
// | 30| 1|
|
|
// +----+-----+
|
|
|
|
// Register the DataFrame as a SQL temporary view
|
|
df.createOrReplaceTempView("people");
|
|
|
|
Dataset<Row> sqlDF = spark.sql("SELECT * FROM people");
|
|
sqlDF.show();
|
|
// +----+-------+
|
|
// | age| name|
|
|
// +----+-------+
|
|
// |null|Michael|
|
|
// | 30| Andy|
|
|
// | 19| Justin|
|
|
// +----+-------+
|
|
|
|
sqlDF.write().mode(SaveMode.Overwrite).parquet("obs://dli-test-obs01/result/parquet");
|
|
spark.read().parquet("obs://dli-test-obs01/result/parquet").show();
|
|
|
|
spark.stop();
|
|
}
|
|
}</pre>
|
|
<p id="dli_09_0205__p14654122024112"></p>
|
|
</div>
|
|
</div>
|
|
<div>
|
|
<div class="familylinks">
|
|
<div class="parentlink"><strong>Parent topic:</strong> <a href="dli_09_0203.html">Spark Jar Jobs</a></div>
|
|
</div>
|
|
</div>
|
|
|