forked from antirez/smaz
-
Notifications
You must be signed in to change notification settings - Fork 8
/
extract_tweets.c
111 lines (101 loc) · 2.54 KB
/
extract_tweets.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/*
(C) Paul Gardner-Stephen 2012-2013
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
/* Return the value (0-15) of the hexadecimal digit c, or -1 if c is not one. */
static int hexDigitValue(char c)
{
  if (c>='0'&&c<='9') return c-'0';
  if (c>='a'&&c<='f') return c-'a'+10;
  if (c>='A'&&c<='F') return c-'A'+10;
  return -1;
}

/* Parse exactly four hexadecimal digits starting at s into *value.
   Returns 1 on success, 0 if any of the four characters is not a hex digit.
   Scanning stops at the first non-digit, so a NUL terminator is never passed. */
static int parseHex4(const char *s,unsigned int *value)
{
  unsigned int v=0;
  int i;
  for(i=0;i<4;i++) {
    int d=hexDigitValue(s[i]);
    if (d<0) return 0;
    v=(v<<4)|(unsigned int)d;
  }
  *value=v;
  return 1;
}

/*
  Decode the body of a JSON string value and print it followed by a newline.

  s points at the first character after the opening quote.  Decoding stops at
  the first unescaped double quote or at the end of the string.

  Transformations applied:
  - the HTML entities for '&', '>' and '<' (amp, gt, lt) become that character;
    any other '&' is copied unchanged;
  - \uXXXX becomes the UTF-8 encoding of the code point; a high surrogate
    immediately followed by a low surrogate is combined into one code point;
  - \' \" and \/ lose their backslash;
  - every other escape (\n, \r, \\, ...) is kept escaped, backslash included;
  - a \u not followed by four hex digits is kept as the two characters \u;
  - a backslash that is the last character of the input is dropped.

  Output is limited to the size of the local buffer; longer text is truncated.

  If any \u escape produced a code point of U+0080 or above, the line is only
  printed when allowUnicode is non-zero.

  Always returns 0.
*/
int extractTweet(char *s,int allowUnicode)
{
  char out[8192];
  size_t len=0;
  int unicode=0;

  while(*s&&*s!='"') {
    /* The longest single append below is 4 bytes; keep room for it
       plus the terminating NUL. */
    if (len+5>sizeof out) break;

    if (*s=='&') {
      /* s[0] is already known to be '&', so only the rest of the entity
         name needs comparing. */
      if (!strncmp(s+1,"amp;",4)) { out[len++]='&'; s+=5; }
      else if (!strncmp(s+1,"gt;",3)) { out[len++]='>'; s+=4; }
      else if (!strncmp(s+1,"lt;",3)) { out[len++]='<'; s+=4; }
      else out[len++]=*s++;
      continue;
    }

    if (*s!='\\') {
      out[len++]=*s++;
      continue;
    }

    s++; /* step over the backslash */
    switch(*s) {
    case 0:
      /* Dangling backslash at end of input: nothing to decode.  s is left
         on the NUL so the loop condition ends the scan. */
      break;
    case 'u':
      {
        unsigned int codepoint,low;
        if (!parseHex4(s+1,&codepoint)) {
          /* Malformed escape: keep it escaped and carry on after the 'u'. */
          out[len++]='\\';
          out[len++]='u';
          s++;
          break;
        }
        s+=5; /* 'u' plus four hex digits */

        /* JSON writes code points above U+FFFF as a UTF-16 surrogate pair. */
        if (codepoint>=0xd800&&codepoint<=0xdbff
            &&s[0]=='\\'&&s[1]=='u'&&parseHex4(s+2,&low)
            &&low>=0xdc00&&low<=0xdfff) {
          codepoint=0x10000+((codepoint-0xd800)<<10)+(low-0xdc00);
          s+=6;
        }

        if (codepoint<0x80) {
          out[len++]=(char)codepoint;
        } else {
          unicode++;
          if (codepoint<0x0800) {
            out[len++]=(char)(0xc0|(codepoint>>6));
            out[len++]=(char)(0x80|(codepoint&0x3f));
          } else if (codepoint<0x10000) {
            out[len++]=(char)(0xe0|(codepoint>>12));
            out[len++]=(char)(0x80|((codepoint>>6)&0x3f));
            out[len++]=(char)(0x80|(codepoint&0x3f));
          } else {
            out[len++]=(char)(0xf0|(codepoint>>18));
            out[len++]=(char)(0x80|((codepoint>>12)&0x3f));
            out[len++]=(char)(0x80|((codepoint>>6)&0x3f));
            out[len++]=(char)(0x80|(codepoint&0x3f));
          }
        }
      }
      break;
    case '\'': case '"': case '/':
      /* remove escaping from these characters */
      out[len++]=*s++;
      break;
    default:
      /* Keep escaped (covers \n, \r, \\ and anything unrecognised) */
      out[len++]='\\';
      out[len++]=*s++;
      break;
    }
  }

  out[len]=0;
  if (!unicode||allowUnicode) printf("%s\n",out);
  return 0;
}
/*
  Read lines from standard input and, for each line that contains the JSON
  key marker "text":" (matched case-insensitively), pass the text following
  the marker to extractTweet().  Only the first marker on a line is used.

  argv[1], if present, is converted with atoi(); a non-zero value is passed
  on as allowUnicode.  With no argument allowUnicode is 0.

  Lines are read into a fixed 8192-byte buffer, so a longer input line is
  processed as several consecutive pieces.

  Always returns 0.
*/
int main(int argc,char **argv)
{
  static const char marker[]="\"text\":\"";
  const size_t markerLen=sizeof marker-1;
  char line[8192];
  int allowUnicode=(argc>1)?atoi(argv[1]):0;

  /* Use the fgets() result itself to detect end of input. */
  while(fgets(line,sizeof line,stdin)) {
    size_t lineLen=strlen(line);
    size_t i;
    /* Require at least 10 characters after position i, measured against the
       length of the current line only, so bytes left over from an earlier,
       longer line are never examined. */
    for(i=0;i+10<lineLen;i++)
      if (!strncasecmp(&line[i],marker,markerLen)) {
        // Line is for the creation of a tweet
        extractTweet(&line[i+markerLen],allowUnicode);
        break;
      }
  }
  return 0;
}