@@ -16,45 +16,38 @@ object StackOverFlowSurvey {
16
16
17
17
val dataFrameReader = session.read
18
18
19
- val responses = dataFrameReader.option(" header" , " true" ).csv(" in/2016-stack-overflow-survey-responses.csv" )
19
+ val responses = dataFrameReader
20
+ .option(" header" , " true" )
21
+ .option(" inferSchema" , value = true )
22
+ .csv(" in/2016-stack-overflow-survey-responses.csv" )
20
23
21
24
System .out.println(" === Print out schema ===" )
22
25
responses.printSchema()
23
26
24
- System .out.println(" === Print 20 records of responses table ===" )
25
- responses.show(20 )
27
+ val responseWithSelectedColumns = responses.select(" country" , " occupation" , AGE_MIDPOINT , SALARY_MIDPOINT )
26
28
27
- System .out.println(" === Print the so_region and self_identification columns of gender table ===" )
28
- responses.select( " so_region " , " self_identification " ) .show()
29
+ System .out.println(" === Print the selected columns of the table ===" )
30
+ responseWithSelectedColumns .show()
29
31
30
32
System .out.println(" === Print records where the response is from Afghanistan ===" )
31
- responses .filter(responses .col(" country" ).=== (" Afghanistan" )).show()
33
+ responseWithSelectedColumns .filter(responseWithSelectedColumns .col(" country" ).=== (" Afghanistan" )).show()
32
34
33
35
System .out.println(" === Print the count of occupations ===" )
34
- val groupedDataset = responses .groupBy(" occupation" )
36
+ val groupedDataset = responseWithSelectedColumns .groupBy(" occupation" )
35
37
groupedDataset.count().show()
36
38
37
- System .out.println(" === Cast the salary mid point and age mid point to integer ===" )
38
- val castedResponse = responses.withColumn(SALARY_MIDPOINT , responses.col(SALARY_MIDPOINT ).cast(" integer" ))
39
- .withColumn(AGE_MIDPOINT , responses.col(AGE_MIDPOINT ).cast(" integer" ))
40
-
41
- System .out.println(" === Print out casted schema ===" )
42
- castedResponse.printSchema()
43
-
44
- import session .implicits ._
45
39
System .out.println(" === Print records with average mid age less than 20 ===" )
46
- castedResponse .filter($ " age_midpoint " < 20 ).show()
40
+ responseWithSelectedColumns .filter(responseWithSelectedColumns.col( AGE_MIDPOINT ) < 20 ).show()
47
41
48
42
System .out.println(" === Print the result by salary middle point in descending order ===" )
49
- castedResponse .orderBy(castedResponse .col(SALARY_MIDPOINT ).desc).show()
43
+ responseWithSelectedColumns .orderBy(responseWithSelectedColumns .col(SALARY_MIDPOINT ).desc).show()
50
44
51
45
System .out.println(" === Group by country and aggregate by average salary middle point and max age middle point ===" )
52
- val datasetGroupByCountry = castedResponse .groupBy(" country" )
46
+ val datasetGroupByCountry = responseWithSelectedColumns .groupBy(" country" )
53
47
datasetGroupByCountry.avg(SALARY_MIDPOINT ).show()
54
48
55
-
56
- val responseWithSalaryBucket = castedResponse.withColumn(
57
- SALARY_MIDPOINT_BUCKET , castedResponse.col(SALARY_MIDPOINT ).divide(20000 ).cast(" integer" ).multiply(20000 ))
49
+ val responseWithSalaryBucket = responses.withColumn(SALARY_MIDPOINT_BUCKET ,
50
+ responses.col(SALARY_MIDPOINT ).divide(20000 ).cast(" integer" ).multiply(20000 ))
58
51
59
52
System .out.println(" === With salary bucket column ===" )
60
53
responseWithSalaryBucket.select(SALARY_MIDPOINT , SALARY_MIDPOINT_BUCKET ).show()
0 commit comments