-
Notifications
You must be signed in to change notification settings - Fork 0
/
Sixth-Copy1.scala
80 lines (52 loc) · 3.67 KB
/
Sixth-Copy1.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, RegexTokenizer}
import org.apache.spark.sql.functions.udf
import scala.collection.mutable
import org.apache.spark.sql.functions._
// Load the CSV (its first line is treated as the header), keep only the four
// columns used by the analysis and discard every row with a null in any of them.
val df = {
  val rawCsv = spark.read
    .option("header", true)
    .csv("file:///home/ozzy/Desktop/bd/dtst.csv")
  val wanted = rawCsv.select("member_name", "sitting_date", "roles", "member_gender")
  wanted.na.drop()
}
// Prepare a typed date column so that the rows of a specific year can be selected further down
import org.apache.spark.sql.functions.{to_date, to_timestamp}
// Parse `sitting_date` with the dd/MM/yyyy pattern into the new column `date_y`,
// then discard the original `sitting_date` column.
val df_date = df
  .withColumn("date_y", to_date(col("sitting_date"), "dd/MM/yyyy"))
  .drop("sitting_date")
// Obtain a session named "sixth" on a local[*] master (getOrCreate hands back an
// already-running session when there is one) and log only ERROR-level messages.
val ss = SparkSession
  .builder()
  .master("local[*]")
  .appName("sixth")
  .getOrCreate()
ss.sparkContext.setLogLevel("ERROR")
/*for ( year <- 1990 to 2020 by 4){
var df_years = df_date.where(s"year(date_y) == ${year}")
var women = df_years.select("member_name", "member_gender").dropDuplicates().groupBy("member_gender").count
var count = df_years.select("member_name", "member_gender").dropDuplicates().count
println(year)
val women1 = women.withColumn("No of women in Vouli %", round(women("count")/count,3)*100)
var roles = df_years.select("member_name", "member_gender", "roles").dropDuplicates()
var temp = roles.withColumn("roles1", regexp_replace(roles.col("roles"), "[\\[\\]'']", "")).drop("roles")
var nonvoul = temp.where(s"roles1 != 'βουλευτης'").groupBy("member_gender").count
var sumSteps1 = nonvoul.agg(sum("count")).first.get(0)
var nonvoul1 = nonvoul.withColumn("Other than Vouleutis %", round(nonvoul("count")/sumSteps1,3)*100)
var upourgos = temp.where(s"roles1 like 'υπουργος%'").groupBy("member_gender").count
var sumSteps2 = upourgos.agg(sum("count")).first.get(0)
var upourgos1 =upourgos.withColumn("Ypourgos %", round(upourgos("count")/sumSteps2,3)*100)
var ufipourgos = temp.where(s"roles1 like 'υφυπουργος%'").groupBy("member_gender").count
var sumSteps3 = ufipourgos.agg(sum("count")).first.get(0)
var ufipourgos1 =ufipourgos.withColumn("Yfipourgos %", round(ufipourgos("count")/sumSteps3,3)*100)
nonvoul1.join(women, "member_gender").join(women1, "member_gender").join(upourgos1,"member_gender").join(ufipourgos1,"member_gender").drop("count").show
}
*/
// ---------------------------------------------------------------------------
// Gender statistics for a single year.
//
// Builds, per `member_gender`:
//   * "percentage %"            - share of distinct (member, gender) pairs,
//   * "Other than Vouleutis %"  - share among distinct (member, gender, role)
//                                 rows whose cleaned role text is not exactly
//                                 'βουλευτης',
//   * "Ypourgos %"              - share among rows whose role starts with 'υπουργος',
//   * "Yfipourgos %"            - share among rows whose role starts with 'υφυπουργος',
// and prints the combined table with `show`.
// ---------------------------------------------------------------------------
val targetYear = 2020
val df_years = df_date.where(s"year(date_y) == $targetYear")

// Distinct (member, gender) pairs: computed once and reused for both the
// per-gender head-count and the overall total.
val distinctMembers = df_years.select("member_name", "member_gender").dropDuplicates()
val women = distinctMembers.groupBy("member_gender").count
val count = distinctMembers.count
// Scale to a percentage *before* rounding: `round(x, 3) * 100` multiplies an
// already-rounded double and re-introduces floating-point noise in the output.
val women1 = women.withColumn("percentage %", round(women("count") * 100 / count, 1))

// One row per distinct (member, gender, role text). The characters [ ] ' are
// stripped from the role text so it can be compared and prefix-matched directly.
val roles = df_years.select("member_name", "member_gender", "roles").dropDuplicates()
val temp = roles
  .withColumn("roles1", regexp_replace(roles.col("roles"), "[\\[\\]'']", ""))
  .drop("roles")

// Every role other than plain 'βουλευτης'.
val nonvoul = temp.where("roles1 != 'βουλευτης'").groupBy("member_gender").count
// `get(0)` rather than `getLong(0)`: the aggregate is null when the filter
// matched no rows, and the typed getter would throw on a null value.
val sumSteps1 = nonvoul.agg(sum("count")).first.get(0)
val nonvoul1 = nonvoul.withColumn(
  "Other than Vouleutis %",
  round(nonvoul("count") * 100 / sumSteps1, 1)
)

// Roles starting with 'υπουργος'.
val upourgos = temp.where("roles1 like 'υπουργος%'").groupBy("member_gender").count
val sumSteps2 = upourgos.agg(sum("count")).first.get(0)
val upourgos1 = upourgos.withColumn(
  "Ypourgos %",
  round(upourgos("count") * 100 / sumSteps2, 1)
)

// Roles starting with 'υφυπουργος'.
val ufipourgos = temp.where("roles1 like 'υφυπουργος%'").groupBy("member_gender").count
val sumSteps3 = ufipourgos.agg(sum("count")).first.get(0)
val ufipourgos1 = ufipourgos.withColumn(
  "Yfipourgos %",
  round(ufipourgos("count") * 100 / sumSteps3, 1)
)

// Start from the full list of genders and LEFT-join each statistic, so a gender
// with no rows in one role category still appears (with null in that column)
// instead of vanishing from the whole report as it would with inner joins.
// "count" is dropped from every input up front, leaving only the percentages;
// the join order keeps the columns in the same order as before.
women.select("member_gender")
  .join(nonvoul1.drop("count"), Seq("member_gender"), "left")
  .join(women1.drop("count"), Seq("member_gender"), "left")
  .join(upourgos1.drop("count"), Seq("member_gender"), "left")
  .join(ufipourgos1.drop("count"), Seq("member_gender"), "left")
  .show