This repository has been archived by the owner on Apr 2, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Notebook.json
1 lines (1 loc) · 21.1 KB
/
Notebook.json
1
{"paragraphs":[{"text":"%spark.dep\nz.load(\"org.mongodb.spark:mongo-spark-connector_2.11:2.1.1\")","user":"anonymous","dateUpdated":"2018-12-19T18:29:01+0000","config":{"colWidth":12,"editorMode":"ace/mode/scala","results":{},"enabled":true,"editorSetting":{"language":"scala"}},"settings":{"params":{},"forms":{}},"results":{"code":"ERROR","msg":[{"type":"TEXT","data":"Must be used before SparkInterpreter (%spark) initialized\nHint: put this paragraph before any Spark code and restart Zeppelin/Interpreter"}]},"apps":[],"jobName":"paragraph_1544306097111_-2052240187","id":"20181208-203830_460150541","dateCreated":"2018-12-08T21:54:57+0000","dateStarted":"2018-12-19T18:29:01+0000","dateFinished":"2018-12-19T18:29:01+0000","status":"ERROR","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:12638"},{"text":"%pyspark\nfrom pyspark.sql import SparkSession\n\nmy_spark = SparkSession \\\n .builder \\\n .appName(\"myApp\") \\\n .config(\"spark.mongodb.input.uri\", \"mongodb://172.19.0.1:8888/articles.posts\") \\\n .config(\"spark.mongodb.output.uri\", \"mongodb://172.19.0.1:8888/articles.posts\") \\\n .getOrCreate()\n\ndf = spark.read.format(\"com.mongodb.spark.sql.DefaultSource\").load()","user":"anonymous","dateUpdated":"2018-12-19T18:29:06+0000","config":{"colWidth":12,"editorMode":"ace/mode/python","results":{},"enabled":true,"editorSetting":{"language":"python","editOnDblClick":false}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1544306097112_-2054163931","id":"20181208-203926_1416928688","dateCreated":"2018-12-08T21:54:57+0000","dateStarted":"2018-12-19T18:29:06+0000","dateFinished":"2018-12-19T18:29:06+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:12639"},{"text":"%pyspark\n\nfrom pyspark.ml.feature import Tokenizer, StopWordsRemover\nfrom pyspark.ml.feature import CountVectorizer, RegexTokenizer\nfrom nltk.stem import WordNetLemmatizer\nfrom nltk.stem.porter import *\nfrom pyspark.sql.types import *\nfrom pyspark.sql.functions import udf\nimport pyspark.sql.functions as f\n\ntokenizer = Tokenizer(inputCol=\"article\", outputCol=\"words\")\nwordsData = tokenizer.transform(df)\nremover = StopWordsRemover(inputCol=\"words\", outputCol=\"filtered\")\nfilteredData = remover.transform(wordsData)\nfilteredData.cache()\n\ndef stem(in_vec):\n out_vec = []\n for t in in_vec:\n t_stem = stemmer.lemmatize(t)\n if len(t_stem) > 5:\n out_vec.append(t_stem) \n return out_vec\n \nstemmer_udf = udf(lambda x: stem(x), ArrayType(StringType()))\n#lemmatize words\nstemmer = WordNetLemmatizer()\narticles = filteredData.select('article')\ntokenizer = RegexTokenizer(inputCol=\"article\", outputCol=\"words\", pattern=r'[^\\w+]')\nwordsData = tokenizer.transform(articles)\n#remove stopwords\nremover = StopWordsRemover(inputCol=\"words\", outputCol=\"filtered\")\nfilteredData1 = remover.transform(wordsData)\n \n# Create user defined function for stemming with return type Array<String>\nvector_stemmed_df = (\n filteredData1\n .withColumn(\"vector_stemmed\", stemmer_udf(\"filtered\"))\n .select(\"vector_stemmed\")\n )\n \n#obtain top 20 words \nwordCounts = vector_stemmed_df.withColumn('wordCount', f.explode(f.col('vector_stemmed'))).groupBy('wordCount').count().sort('count', ascending=False).select('wordCount','count').limit(20)\nwordCounts.show()","user":"anonymous","dateUpdated":"2018-12-19T18:29:08+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"python"},"editorMode":"ace/mode/python","tableHide":false},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"+----------+-----+\n| wordCount|count|\n+----------+-----+\n| police| 4523|\n| december| 3215|\n|government| 2638|\n| updated| 2185|\n| official| 1837|\n| people| 1733|\n| minister| 1721|\n| district| 1601|\n| around| 1354|\n|department| 1241|\n| student| 1178|\n| hospital| 1177|\n| station| 1162|\n| congress| 1087|\n| accused| 1069|\n| indian| 1055|\n| school| 1047|\n| however| 1027|\n| project| 1020|\n| officer| 994|\n+----------+-----+\n\n"}]},"apps":[],"jobName":"paragraph_1544907302594_1829741070","id":"20181215-205502_768180065","dateCreated":"2018-12-15T20:55:02+0000","dateStarted":"2018-12-19T18:29:08+0000","dateFinished":"2018-12-19T18:29:19+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:12641"},{"text":"%pyspark\n#group based on state\nfrom pyspark.sql.types import *\nimport re\n\ndistinctStates = filteredData.select('state').distinct()\nlists = []\n#for all states\nfor state in distinctStates.collect():\n articles = filteredData.select('article').where(filteredData.state == state[\"state\"])\n #get count of articles per state\n count = articles.count()\n stateId = state[\"state\"]\n tokenizer = RegexTokenizer(inputCol=\"article\", outputCol=\"words\", pattern=r'[^\\w+]')\n wordsData = tokenizer.transform(articles)\n #remove stopwords\n remover = StopWordsRemover(inputCol=\"words\", outputCol=\"filtered\")\n filteredData1 = remover.transform(wordsData)\n #lemmatize data\n vector_stemmed_df = (\n filteredData1\n .withColumn(\"vector_stemmed\", stemmer_udf(\"filtered\"))\n .select(\"vector_stemmed\")\n )\n #obtain top 20 words\n vector = vector_stemmed_df.withColumn('wordCount', f.explode(f.col('vector_stemmed'))).groupBy('wordCount').count().sort('count', ascending=False).limit(20)\n vector.show()\n vectorWords = vector.agg(f.concat_ws(\", \", f.collect_list('wordCount')).alias('words'))\n lists.append((stateId, vectorWords.first()[0], count))\n \nwordsState = spark.createDataFrame(lists, ['State', 'Keywords', 'ArticleCount'])\nwordsState=wordsState.where(\"state!=''\")\nwordsState.show()\n \n \n\n \n \n","user":"anonymous","dateUpdated":"2018-12-19T18:29:27+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"python"},"editorMode":"ace/mode/python","tableHide":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"+----------+-----+\n| wordCount|count|\n+----------+-----+\n| ranchi| 169|\n| police| 133|\n| jharkhand| 129|\n| station| 104|\n| official| 77|\n| minister| 69|\n| district| 64|\n| village| 62|\n| congress| 57|\n|government| 55|\n| officer| 53|\n| people| 48|\n| humanoid| 48|\n| president| 48|\n|srivastava| 47|\n|department| 46|\n| railway| 44|\n| church| 42|\n| student| 42|\n| bhavan| 42|\n+----------+-----+\n\n+-------------+-----+\n| wordCount|count|\n+-------------+-----+\n| december| 50|\n| mangaluru| 33|\n| farmer| 24|\n| special| 23|\n| updated| 22|\n| police| 20|\n| district| 18|\n| fishery| 17|\n|correspondent| 17|\n| paryaya| 17|\n| association| 16|\n| government| 15|\n| tirtha| 14|\n| junction| 13|\n| vehicle| 13|\n| slipway| 12|\n| traffic| 11|\n| commissioner| 11|\n| railway| 11|\n| temple| 10|\n+-------------+-----+\n\n+----------+-----+\n| wordCount|count|\n+----------+-----+\n| december| 241|\n|government| 203|\n| police| 193|\n| updated| 134|\n| kerala| 125|\n| service| 87|\n| minister| 87|\n| reporter| 82|\n|department| 75|\n|sabarimala| 75|\n| kozhikode| 74|\n| official| 73|\n| people| 70|\n| student| 68|\n| project| 67|\n| station| 63|\n| committee| 62|\n| protest| 61|\n| district| 61|\n| member| 61|\n+----------+-----+\n\n+----------+-----+\n| wordCount|count|\n+----------+-----+\n| police| 339|\n| december| 323|\n| updated| 148|\n| accused| 146|\n|government| 141|\n| school| 125|\n| mediation| 107|\n| reporter| 90|\n| admission| 73|\n| sunday| 73|\n| centre| 72|\n| family| 70|\n| official| 70|\n| incident| 68|\n| arrested| 68|\n| allegedly| 67|\n| around| 66|\n| officer| 66|\n| victim| 65|\n| complaint| 63|\n+----------+-----+\n\n+----------+-----+\n| wordCount|count|\n+----------+-----+\n|university| 84|\n| police| 68|\n|government| 68|\n| december| 63|\n| people| 54|\n| district| 50|\n| express| 44|\n| college| 43|\n| saturday| 43|\n| station| 43|\n| updated| 41|\n| plastic| 40|\n| minister| 39|\n| student| 38|\n| election| 37|\n| building| 36|\n| official| 34|\n| friday| 34|\n| railway| 33|\n| nitish| 32|\n+----------+-----+\n\n+----------+-----+\n| wordCount|count|\n+----------+-----+\n| police| 764|\n| accused| 287|\n| official| 222|\n| station| 194|\n| complaint| 183|\n| village| 179|\n|government| 172|\n| people| 159|\n| district| 152|\n| updated| 151|\n| gujarat| 150|\n| resident| 149|\n| vehicle| 136|\n|department| 127|\n| arrested| 122|\n| minister| 121|\n| december| 119|\n| officer| 118|\n| according| 112|\n| around| 110|\n+----------+-----+\n\n+-----------+-----+\n| wordCount|count|\n+-----------+-----+\n| district| 122|\n| odisha| 116|\n| police| 108|\n| minister| 107|\n| government| 87|\n|bhubaneswar| 81|\n| people| 81|\n| naveen| 76|\n| farmer| 67|\n| election| 61|\n| around| 55|\n| patnaik| 48|\n| congress| 47|\n| teacher| 46|\n| pradhan| 44|\n| centre| 40|\n| student| 36|\n| ganjam| 36|\n| station| 36|\n| monday| 35|\n+-----------+-----+\n\n+----------+-----+\n| wordCount|count|\n+----------+-----+\n| kolkata| 99|\n| police| 57|\n| student| 47|\n| school| 47|\n| official| 45|\n| around| 40|\n| hospital| 34|\n| passenger| 34|\n| station| 34|\n| december| 33|\n|government| 29|\n| airport| 29|\n| bengal| 29|\n| market| 28|\n| tuesday| 28|\n| however| 26|\n| system| 24|\n| minister| 24|\n| officer| 24|\n| traffic| 23|\n+----------+-----+\n\n+----------+-----+\n| wordCount|count|\n+----------+-----+\n| december| 223|\n| hyderabad| 146|\n| police| 106|\n| updated| 99|\n| telangana| 96|\n| district| 78|\n|government| 72|\n| official| 63|\n| project| 57|\n| president| 52|\n| election| 50|\n| centre| 48|\n| special| 48|\n| minister| 47|\n| andhra| 45|\n| people| 44|\n| cyclone| 44|\n| monday| 44|\n| leader| 43|\n| tuesday| 43|\n+----------+-----+\n\n+-----------+-----+\n| wordCount|count|\n+-----------+-----+\n| police| 427|\n| december| 426|\n| official| 245|\n| updated| 234|\n| government| 227|\n| chennai| 223|\n| district| 182|\n| people| 181|\n| department| 175|\n| student| 150|\n| elephant| 144|\n| officer| 140|\n| minister| 133|\n| temple| 130|\n|corporation| 118|\n| coimbatore| 117|\n| hospital| 114|\n| school| 108|\n| project| 99|\n| around| 98|\n+-----------+-----+\n\n+----------+-----+\n| wordCount|count|\n+----------+-----+\n| police| 572|\n| student| 178|\n| station| 159|\n| school| 158|\n| kanpur| 145|\n| district| 139|\n| family| 132|\n| sector| 131|\n| allahabad| 129|\n| updated| 126|\n| minister| 115|\n| people| 114|\n|government| 108|\n| incident| 106|\n| village| 103|\n|department| 96|\n| official| 92|\n| authority| 91|\n| december| 87|\n| around| 85|\n+----------+-----+\n\n+----------+-----+\n| wordCount|count|\n+----------+-----+\n| police| 676|\n| december| 309|\n| mumbai| 306|\n| hospital| 287|\n| official| 280|\n| around| 194|\n|government| 187|\n| people| 182|\n| updated| 182|\n| accused| 166|\n| officer| 135|\n| arrested| 134|\n| resident| 132|\n| family| 129|\n| incident| 129|\n| project| 129|\n| however| 123|\n| nagpur| 122|\n| station| 119|\n| friday| 119|\n+----------+-----+\n\n+----------+-----+\n| wordCount|count|\n+----------+-----+\n| temple| 511|\n| police| 466|\n|government| 370|\n| hospital| 296|\n| district| 269|\n| december| 236|\n| people| 210|\n| bengaluru| 194|\n| updated| 189|\n| karnataka| 177|\n| minister| 170|\n| project| 161|\n| authority| 160|\n| devotee| 160|\n| friday| 152|\n|department| 140|\n| official| 140|\n| around| 134|\n| mysuru| 131|\n| student| 120|\n+----------+-----+\n\n+----------+-----+\n| wordCount|count|\n+----------+-----+\n| police| 281|\n| report| 163|\n|department| 132|\n| punjab| 131|\n| resident| 117|\n| accused| 110|\n| sector| 97|\n|registered| 95|\n|chandigarh| 93|\n| around| 90|\n| minister| 88|\n|government| 86|\n| school| 84|\n| section| 78|\n| official| 77|\n| district| 75|\n| community| 75|\n| however| 73|\n| congress| 70|\n| member| 66|\n+----------+-----+\n\n+----------+-----+\n| wordCount|count|\n+----------+-----+\n|government| 167|\n| minister| 148|\n| parrikar| 100|\n| panaji| 66|\n| official| 57|\n| mining| 53|\n| source| 53|\n|department| 52|\n| updated| 52|\n| people| 51|\n| health| 51|\n| bridge| 50|\n| sardesai| 48|\n| project| 48|\n| tawadkar| 48|\n| manohar| 44|\n| safety| 42|\n| school| 40|\n| political| 39|\n| election| 38|\n+----------+-----+\n\n+----------+-----+\n| wordCount|count|\n+----------+-----+\n| congress| 278|\n| gehlot| 259|\n| minister| 168|\n| jaipur| 130|\n|government| 128|\n| leader| 121|\n| rajasthan| 118|\n| police| 115|\n| supporter| 105|\n| election| 86|\n| worker| 79|\n| deputy| 79|\n| sachin| 71|\n| gandhi| 71|\n| official| 69|\n| residence| 66|\n| ceremony| 59|\n| candidate| 57|\n| meeting| 55|\n| president| 55|\n+----------+-----+\n\n+-----------+-----+\n| wordCount|count|\n+-----------+-----+\n| panchayat| 174|\n| congress| 165|\n| district| 133|\n| member| 113|\n| election| 88|\n| minister| 75|\n| candidate| 71|\n| counting| 59|\n| president| 55|\n|independent| 55|\n| government| 51|\n| national| 51|\n| police| 49|\n| updated| 47|\n| declared| 47|\n| result| 46|\n| parishad| 46|\n| thursday| 44|\n| friday| 44|\n| alliance| 42|\n+-----------+-----+\n\n+----------+-----+\n| wordCount|count|\n+----------+-----+\n| december| 792|\n| indian| 652|\n| series| 567|\n| updated| 557|\n| zealand| 484|\n|government| 449|\n| wicket| 442|\n| second| 412|\n| iphone| 364|\n| country| 356|\n| company| 351|\n| player| 326|\n| people| 306|\n| australia| 295|\n| around| 273|\n| however| 251|\n| bowler| 243|\n| mishra| 242|\n| according| 236|\n| batting| 233|\n+----------+-----+\n\n+-------------+-----+\n| wordCount|count|\n+-------------+-----+\n| december| 129|\n|visakhapatnam| 78|\n| updated| 58|\n| cyclone| 55|\n| official| 54|\n| district| 44|\n| monday| 44|\n| police| 40|\n| minister| 40|\n| department| 35|\n| people| 34|\n| phethai| 32|\n| tourist| 31|\n| vijayawada| 28|\n| tuesday| 25|\n| government| 23|\n| according| 23|\n| tourism| 22|\n| medical| 20|\n| special| 20|\n+-------------+-----+\n\n+-----+--------------------+------------+\n|State| Keywords|ArticleCount|\n+-----+--------------------+------------+\n| JH|ranchi, police, j...| 80|\n| PD|december, mangalu...| 25|\n| KL|december, governm...| 161|\n| DL|police, december,...| 154|\n| BR|university, polic...| 80|\n| GJ|police, accused, ...| 400|\n| OD|district, odisha,...| 80|\n| WB|kolkata, police, ...| 63|\n| TS|december, hyderab...| 113|\n| TN|police, december,...| 306|\n| UP|police, student, ...| 300|\n| MH|police, december,...| 378|\n| KA|temple, police, g...| 362|\n| PJ|police, report, d...| 160|\n| GA|government, minis...| 80|\n| RJ|congress, gehlot,...| 80|\n| AS|panchayat, congre...| 80|\n| AP|december, visakha...| 49|\n+-----+--------------------+------------+\n\n"}]},"apps":[],"jobName":"paragraph_1544898240420_-1046238795","id":"20181215-182400_1684783351","dateCreated":"2018-12-15T18:24:00+0000","dateStarted":"2018-12-19T18:29:28+0000","dateFinished":"2018-12-19T18:30:09+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:12642"},{"text":"%pyspark\n#group based on news category\ndistinctCategories = filteredData.select('category').distinct()\nlists = []\n#for every category\nfor category in distinctCategories.collect():\n if len(category) < 2:\n articles = filteredData.select('article').where(filteredData.category == category[\"category\"])\n #get count of articles per category\n count = articles.count()\n categoryId = category[\"category\"]\n tokenizer = RegexTokenizer(inputCol=\"article\", outputCol=\"words\", pattern=r'[^\\w+]')\n wordsData = tokenizer.transform(articles)\n #remove stop words\n remover = StopWordsRemover(inputCol=\"words\", outputCol=\"filtered\")\n filteredData1 = remover.transform(wordsData)\n #lemmatize words\n vector_stemmed_df = (\n filteredData1\n .withColumn(\"vector_stemmed\", stemmer_udf(\"filtered\"))\n .select(\"vector_stemmed\")\n )\n #get word count\n vector = vector_stemmed_df.withColumn('wordCount', f.explode(f.col('vector_stemmed'))).groupBy('wordCount').count().sort('count', ascending=False).limit(20)\n vectorWords = vector.agg(f.concat_ws(\", \", f.collect_list('wordCount')).alias('words'))\n lists.append((categoryId, vectorWords.first()[0], count))\n \nwordsCategory = spark.createDataFrame(lists, ['Category', 'Keywords', 'ArticleCount'])\nwordsCategory = wordsCategory.where(\"Category!='\\n'\")\nwordsCategory.show()\n","user":"anonymous","dateUpdated":"2018-12-19T18:30:14+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"python"},"editorMode":"ace/mode/python","tableHide":false},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"+--------+--------------------+------------+\n|Category| Keywords|ArticleCount|\n+--------+--------------------+------------+\n| BU\n|government, compa...| 105|\n| ENV\n|pollution, climat...| 60|\n| SC\n|researcher, scien...| 71|\n| LS\n|weight, catriona,...| 6|\n| SP\n|december, player,...| 160|\n| ED\n|candidate, examin...| 59|\n| OP\n|government, elect...| 80|\n| HE\n|cancer, disease, ...| 60|\n| LS|december, christm...| 34|\n| EN\n|wedding, december...| 89|\n| CR\n|series, zealand, ...| 101|\n| TE\n|iphone, company, ...| 60|\n+--------+--------------------+------------+\n\n"}]},"apps":[],"jobName":"paragraph_1544910372149_-1282160584","id":"20181215-214612_758211859","dateCreated":"2018-12-15T21:46:12+0000","dateStarted":"2018-12-19T18:30:15+0000","dateFinished":"2018-12-19T18:30:33+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:12643"},{"text":"%pyspark \nwordsState.toPandas().to_csv('Statewise_count.csv')\nwordsCategory.toPandas().to_csv('Categorywise_count.csv')\nwordCounts.toPandas().to_csv('Overall_count.csv')","user":"anonymous","dateUpdated":"2018-12-19T18:30:35+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"python"},"editorMode":"ace/mode/python"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1544911182884_688675403","id":"20181215-215942_160077398","dateCreated":"2018-12-15T21:59:42+0000","dateStarted":"2018-12-19T18:30:35+0000","dateFinished":"2018-12-19T18:30:44+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:12644"},{"text":"%pyspark\n","user":"anonymous","dateUpdated":"2018-12-15T17:03:29+0000","config":{"colWidth":12,"enabled":true,"results":{},"editorSetting":{"language":"python"},"editorMode":"ace/mode/python"},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1544893409233_322545376","id":"20181215-170329_7521743","dateCreated":"2018-12-15T17:03:29+0000","status":"READY","progressUpdateIntervalMs":500,"$$hashKey":"object:12645"}],"name":"plotly_check","id":"2E11G8CA5","angularObjects":{"2DY8EFHWD:shared_process":[],"2DYG98XSV:shared_process":[],"2DZ6MX7EC:shared_process":[],"2DWZA64KS:shared_process":[],"2DZ2WQ9X4:shared_process":[],"2DWGA36YT:shared_process":[],"2DZGRJCCW:shared_process":[],"2DWCY6VWA:shared_process":[],"2DYU6NB6H:shared_process":[],"2DY623ERA:shared_process":[],"2DXWH58R1:shared_process":[],"2DWBZPD1R:shared_process":[],"2DYQM1MZ9:shared_process":[],"2DZ49486P:shared_process":[],"2DY67BTT7:shared_process":[],"2DXVA1N7R:shared_process":[],"2DZVD13DM:shared_process":[],"2DWFKC7QW:shared_process":[],"2DYV2RG2J:shared_process":[]},"config":{"looknfeel":"default","personalizedMode":"false"},"info":{}}