-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.m
91 lines (78 loc) · 3.52 KB
/
preprocess.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
function sentences = preprocess(raw, min_length)
%PREPROCESS Strip non-dialogue text from a play and return marked sentences.
%   SENTENCES = PREPROCESS(RAW) splits the character vector RAW into
%   sentences, drops the parts that are not dialogue, and wraps every
%   remaining sentence as '<s> ... </s>'.
%
%   SENTENCES = PREPROCESS(RAW, MIN_LENGTH) additionally drops sentences
%   with fewer than MIN_LENGTH whitespace-separated tokens (default 3).
%
%   The raw text contains more than the dialogue spoken by characters, so
%   it has to be cleaned up. The filters below rely on the text following
%   this layout, with paragraphs separated by a blank line (CRLF CRLF):
%
%       ACT
%
%       Scene
%
%       Name.
%       Dialogue
%
%       [Stage Direction]
%
%   Input:
%       raw        - character vector holding the whole text; paragraphs
%                    are recognised only when separated by '\r\n\r\n'.
%       min_length - minimum number of tokens a sentence must have to be
%                    kept (optional, default 3).
%
%   Output:
%       sentences  - N-by-1 cell array of character vectors, each of the
%                    form '<s> text </s>'. An empty 0-by-1 cell array is
%                    returned when nothing survives the filtering.

    if nargin < 2 || isempty(min_length)
        min_length = 3;
    end

    %% Initial processing
    % Split the text into larger sections - let's call them |paragraphs| -
    % at double line breaks, then split every paragraph into sentences at
    % whitespace that follows a punctuation mark.
    paragraphs = regexp(raw, '\r\n\r\n', 'split');
    sentences = regexp(paragraphs', '(?<=[!.?;:])\s', 'split');

    % Remove non-dialogue text, one paragraph at a time.
    for i = 1:numel(sentences)
        first = sentences{i}{1};
        if numel(sentences{i}) == 1                 % only 1 sentence
            if isMatch(first, '^(\r\n)*(ACT|Act).+\.$')
                sentences{i} = {};                  % act heading
            elseif isMatch(first, '^(\r\n)*\[.+\]\.?$')
                sentences{i} = {};                  % stage direction
            end
        else                                        % 2 or more sentences
            if isMatch(first, '^(\r\n)*Scene.+\.$')
                sentences{i} = {};                  % scene heading
            elseif isMatch(first, '^(\r\n)*\d?\s?\w+\s*\w+\.$')
                sentences{i}(1) = [];               % speaker name, keep rest
            elseif isMatch(first, '^(\r\n)*\[.+') && ...
                    isMatch(sentences{i}{end}, '.+\]\.?$')
                sentences{i} = {};                  % multi-sentence direction
            end
        end
    end
    sentences = flattenAndPrune(sentences);
    if isempty(sentences)
        return
    end

    %% Dealing with exceptions
    % Inline stage directions: the lazy quantifier removes each bracketed
    % span on its own, so dialogue between two directions is preserved.
    sentences = regexprep(sentences, '\[.+?\]', '');
    sentences = regexp(sentences, '--', 'split');   % split by double hyphens
    sentences = flattenAndPrune(sentences);
    if isempty(sentences)
        return
    end

    % Strip one leftover leading line break, colon, period and space, in
    % this order (LFCR, CRLF, LF, CR, ':', '.', whitespace).
    leadingJunk = {'^\n\r', '^\r\n', '^\n', '^\r', '^:', '^\.', '^\s'};
    for k = 1:numel(leadingJunk)
        sentences = regexprep(sentences, leadingJunk{k}, '');
    end

    % Trim before counting tokens: strsplit reports an empty token for
    % leading/trailing whitespace, which would inflate the count.
    sentences = strtrim(sentences);
    sentences(cellfun(@isempty, sentences)) = [];

    %% Remove short sentences
    % If a sentence is too short, then it doesn't help.
    tokenCount = cellfun(@(s) numel(strsplit(s)), sentences);
    sentences(tokenCount < min_length) = [];

    %% Add sentence markers
    % Mark the start and the end of every sentence with <s> and </s>.
    sentences = cellfun(@(s) ['<s> ' s ' </s>'], sentences, ...
        'UniformOutput', false);
end

function tf = isMatch(str, pattern)
%ISMATCH True when the regular expression PATTERN matches somewhere in STR.
    tf = ~isempty(regexp(str, pattern, 'once'));
end

function flat = flattenAndPrune(nested)
%FLATTENANDPRUNE Flatten a cell array of row cells into a column, minus empties.
    flat = [nested{:}]';
    if ~iscell(flat)            % [] is returned when NESTED has no elements
        flat = cell(0, 1);
    end
    flat(cellfun(@isempty, flat)) = [];
    flat = reshape(flat, [], 1);
end