Skip to content

Commit da0083f

Browse files
authored
Update StackOverFlowSurvey.scala
1 parent 3a6e110 commit da0083f

File tree

1 file changed

+14
-21
lines changed

1 file changed

+14
-21
lines changed

src/main/scala/com/sparkTutorial/sparkSql/StackOverFlowSurvey.scala

Lines changed: 14 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,45 +16,38 @@ object StackOverFlowSurvey {
1616

1717
val dataFrameReader = session.read
1818

19-
val responses = dataFrameReader.option("header", "true").csv("in/2016-stack-overflow-survey-responses.csv")
19+
val responses = dataFrameReader
20+
.option("header", "true")
21+
.option("inferSchema", value = true)
22+
.csv("in/2016-stack-overflow-survey-responses.csv")
2023

2124
System.out.println("=== Print out schema ===")
2225
responses.printSchema()
2326

24-
System.out.println("=== Print 20 records of responses table ===")
25-
responses.show(20)
27+
val responseWithSelectedColumns = responses.select("country", "occupation", AGE_MIDPOINT, SALARY_MIDPOINT)
2628

27-
System.out.println("=== Print the so_region and self_identification columns of gender table ===")
28-
responses.select("so_region", "self_identification").show()
29+
System.out.println("=== Print the selected columns of the table ===")
30+
responseWithSelectedColumns.show()
2931

3032
System.out.println("=== Print records where the response is from Afghanistan ===")
31-
responses.filter(responses.col("country").===("Afghanistan")).show()
33+
responseWithSelectedColumns.filter(responseWithSelectedColumns.col("country").===("Afghanistan")).show()
3234

3335
System.out.println("=== Print the count of occupations ===")
34-
val groupedDataset = responses.groupBy("occupation")
36+
val groupedDataset = responseWithSelectedColumns.groupBy("occupation")
3537
groupedDataset.count().show()
3638

37-
System.out.println("=== Cast the salary mid point and age mid point to integer ===")
38-
val castedResponse = responses.withColumn(SALARY_MIDPOINT, responses.col(SALARY_MIDPOINT).cast("integer"))
39-
.withColumn(AGE_MIDPOINT, responses.col(AGE_MIDPOINT).cast("integer"))
40-
41-
System.out.println("=== Print out casted schema ===")
42-
castedResponse.printSchema()
43-
44-
import session.implicits._
4539
System.out.println("=== Print records with average mid age less than 20 ===")
46-
castedResponse.filter($"age_midpoint" < 20).show()
40+
responseWithSelectedColumns.filter(responseWithSelectedColumns.col(AGE_MIDPOINT) < 20).show()
4741

4842
System.out.println("=== Print the result by salary middle point in descending order ===")
49-
castedResponse.orderBy(castedResponse.col(SALARY_MIDPOINT).desc).show()
43+
responseWithSelectedColumns.orderBy(responseWithSelectedColumns.col(SALARY_MIDPOINT).desc).show()
5044

5145
System.out.println("=== Group by country and aggregate by average salary middle point and max age middle point ===")
52-
val datasetGroupByCountry = castedResponse.groupBy("country")
46+
val datasetGroupByCountry = responseWithSelectedColumns.groupBy("country")
5347
datasetGroupByCountry.avg(SALARY_MIDPOINT).show()
5448

55-
56-
val responseWithSalaryBucket = castedResponse.withColumn(
57-
SALARY_MIDPOINT_BUCKET, castedResponse.col(SALARY_MIDPOINT).divide(20000).cast("integer").multiply(20000))
49+
val responseWithSalaryBucket = responses.withColumn(SALARY_MIDPOINT_BUCKET,
50+
responses.col(SALARY_MIDPOINT).divide(20000).cast("integer").multiply(20000))
5851

5952
System.out.println("=== With salary bucket column ===")
6053
responseWithSalaryBucket.select(SALARY_MIDPOINT, SALARY_MIDPOINT_BUCKET).show()

0 commit comments

Comments
 (0)