Skip to content
rxin edited this page May 22, 2012 · 46 revisions

Setup

Launch AMI

# Launch the "shark-sigmod" cluster with mesos-ec2 (one option per line).
~/Documents/shark/mesos/trunk/ec2/mesos-ec2 \
    -k rxin-us-east \
    -i ~/.ec2/rxin-us-east.pem \
    -s 100 \
    --ami ami-502d8a39 \
    -t m2.2xlarge \
    launch shark-sigmod

Update Shark/Spark (maybe optional)

Set up AWS credentials for s3n in ephemeral-hdfs/conf/core-site.xml

<!-- Access key ID for the s3n:// filesystem; "ID" is a placeholder to replace. -->
<property>
    <name>fs.s3n.awsAccessKeyId</name>
    <value>ID</value>
</property>

<!-- Secret access key for the s3n:// filesystem; "SECRET" is a placeholder to replace. -->
<property>
    <name>fs.s3n.awsSecretAccessKey</name>
    <value>SECRET</value>
</property>

Start MapReduce job tracker

/root/ephemeral-hdfs/bin/start-mapred.sh

Copy files over from s3

# Copy both wiki-traffic datasets from S3 into HDFS on this host (port 9000).
ephemeral-hdfs/bin/hadoop distcp \
    s3n://wiki-traffic/pagecounts \
    "hdfs://$(hostname):9000/wiki/pagecounts"
ephemeral-hdfs/bin/hadoop distcp \
    s3n://wiki-traffic/wiki-dump \
    "hdfs://$(hostname):9000/wiki/dump"

Run the renaming script and load all tables into Hive, then copy core-site.xml into the Spark conf directory

# Copy the ephemeral-hdfs core-site.xml into Spark's conf directory.
# The source must be the file itself: passing the conf/ directory makes cp
# fail ("omitting directory") because the destination is a single file.
cp /root/ephemeral-hdfs/conf/core-site.xml /root/spark/conf/core-site.xml

Set ulimit

ulimit -n 1000000

Queries

shark/bin/shark-shell
def s = sql2console _

Some configs:

set mapred.job.reuse.jvm.num.tasks=-1;
set hive.merge.mapfiles=false;
set mapred.reduce.tasks=400;
set hive.map.aggr=false;

Cache Jan and Feb data (140 secs - this is long because Shark needs to decompress the data)

create table wiki_filtered as
select *
from wikistats
where (dt like '200902%' or dt like '200901%')
  and project_code = 'en'
  and not page_name like 'Special:%';

create table wiki_cached as
select *
from wiki_filtered;

Create external table for text

create external table text_with_title (
    title string,
    body string
)
row format delimited
    fields terminated by '\t'
location '/wiki/dump/text_with_title/';

create table page_cached as
select *
from text_with_title;

Query 1: Top Pages on Valentine's Day (20 secs)

set mapred.reduce.tasks=200;

select
    page_name,
    sum(page_views) as s
from wiki_cached
where dt = '20090214'
group by page_name
order by s desc
limit 20;

create table top_pages_cached as
select
    page_name,
    sum(page_views) as s
from wiki_cached
where dt = '20090214'
group by page_name
order by s desc
limit 500;

Query 2: Hits on pages related to Valentine's Day (6 secs)

set mapred.reduce.tasks=8;

select
    page_name,
    sum(page_views) as s
from wiki_cached
where page_name like '%Valentine%'
group by page_name
order by s desc
limit 10;

Query 3: Histogram (6 secs)

set mapred.reduce.tasks=10;

select
    dt,
    sum(page_views) as s
from wiki_cached
where page_name like 'Valentine%'
group by dt
order by dt
limit 100;

Query 4: Join

set mapred.reduce.tasks=200;

create table top_pages as
select
    page_name,
    sum(page_views) as s
from wiki_cached
where dt = '20090214'
group by page_name
order by s desc
limit 200;

create table top_pages_with_titles as
select
    title,
    body
from text_with_title t
join top_pages p
    on t.title = p.page_name;

In shark-shell

// Pull every row of top_pages_with_titles into an RDD.
val inputPages = sql2rdd("select * from top_pages_with_titles")

// Build (column 0, cleaned column 1) pairs: literal "\n" sequences become
// spaces, the text is split on non-word characters, empty tokens are dropped,
// and the remaining words are re-joined with single spaces and lower-cased.
val docs = inputPages.mapRows { row =>
  val title = row.getString(0)
  val words = row.getString(1).replaceAll("""\\n""", " ").split("\\W").filter(_.nonEmpty)
  (title, words.mkString(" ").toLowerCase)
}

// NOTE(review): 300 and 5 are presumably the term-vector size and the number
// of clusters; confirm against the termVectors / kmeans helpers.
val docVecs = termVectors(docs, 300).cache()
val seqs = kmeans(docVecs, 5)

// Print the first 10 entries of each returned sequence, labelled by index.
seqs.zipWithIndex.foreach { case (seq, i) =>
  println("Cluster " + i + ": " + seq.take(10).mkString(", "))
}