From f7af5bbcd8bb8a650cd8957664a8f3759eaa1623 Mon Sep 17 00:00:00 2001 From: Zach Thompson Date: Sun, 20 Mar 2016 11:56:32 -0600 Subject: [PATCH 01/10] Handle control characters --- lib/DDG/Goodie/Bin2Unicode.pm | 82 +++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/lib/DDG/Goodie/Bin2Unicode.pm b/lib/DDG/Goodie/Bin2Unicode.pm index f742d6b8d58..f944c9bd2a6 100644 --- a/lib/DDG/Goodie/Bin2Unicode.pm +++ b/lib/DDG/Goodie/Bin2Unicode.pm @@ -14,6 +14,77 @@ triggers query => qr{^([01\s]{8,})(?:\s+(?:to\s+)?(?:unicode|text|ascii))?$}; my $MAX_CODE_PT = 1114111; +my %ctrl_chars = ( + 0 => 'Null character (NUL)', + 1 => 'Start of Heading (SOH)', + 2 => 'Start of Text (STX)', + 3 => 'End-of-text character (ETX)', + 4 => 'End-of-transmission character (EOT)', + 5 => 'Enquiry character (ENQ)', + 6 => 'Acknowledge character (ACK)', + 7 => 'Bell character (BEL)', + 8 => 'Backspace (BS)', + 9 => 'Horizontal tab (HT)', + 10 => 'Line feed (LF)', + 11 => 'Vertical tab (VT)', + 12 => 'Form feed (FF)', + 13 => 'Carriage return (CR)', + 14 => 'Shift Out (SO)', + 15 => 'Shift In (SI)', + 16 => 'Data Link Escape (DLE)', + 17 => 'Device Control 1 (DC1)', + 18 => 'Device Control 2 (DC2)', + 19 => 'Device Control 3 (DC3)', + 20 => 'Device Control 4 (DC4)', + 21 => 'Negative-acknowledge character (NAK)', + 22 => 'Synchronous Idle (SYN)', + 23 => 'End of Transmission Block (ETB)', + 24 => 'Cancel character (CAN)', + 25 => 'End of Medium (EM)', + 26 => 'Substitute character (SUB)', + 27 => 'Escape character (ESC)', + 28 => 'File Separator (FS)', + 29 => 'Group Separator (GS)', + 30 => 'Record Separator (RS)', + 31 => 'Unit Separator (US)', + 32 => 'Space (SP)', + 127 => 'Delete (DEL)', + 128 => 'Padding Character (PAD)', + 129 => 'High Octet Preset (HOP)', + 130 => 'Break Permitted Here (BPH)', + 131 => 'No Break Here (NBH)', + 132 => 'Index (IND)', + 133 => 'Next Line (NEL)', + 134 => 'Start of Selected Area (SSA)', + 135 => 'End of Selected Area (ESA)', + 
136 => 'Character Tabulation Set (HTS)', + 137 => 'Character Tabulation with Justification (HTJ)', + 138 => 'Line Tabulation Set (VTS)', + 139 => 'Partial Line Forward (PLD)', + 140 => 'Partial Line Backward (PLU)', + 141 => 'Reverse Line Feed (RI)', + 142 => 'Single-Shift Two (SS2)', + 143 => 'Single-Shift Three (SS3)', + 144 => 'Device Control String (DCS)', + 145 => 'Private Use 1 (PU1)', + 146 => 'Private Use 2 (PU2)', + 147 => 'Set Transmit State (STS)', + 148 => 'Cancel character (CCH)', + 149 => 'Message Waiting (MW)', + 150 => 'Start of Protected Area (SPA)', + 151 => 'End of Protected Area (EPA)', + 152 => 'Start of String (SOS)', + 153 => 'Single Graphic Character Introducer (SGCI)', + 154 => 'Single Character Intro Introducer (SCI)', + 155 => 'Control Sequence Introducer (CSI)', + 156 => 'String Terminator (ST)', + 157 => 'Operating System Command (OSC)', + 158 => 'Private Message (PM)', + 159 => 'Application Program Command (APC)' +); + +my $zaahirs_hideout = '0' x 48; + handle matches => sub { my $q = $_; # orginal query my $bin_string = shift @_; # captured binary string @@ -27,6 +98,17 @@ handle matches => sub { return if length($b) % 8; # Overflow/non-portable warnings expected my $i = oct("0b$b"); + if($bin_string eq $zaahirs_hideout){ + $str = q{Congratulations, you've discovered Zaahir's hideout!}; + last; + } + elsif(exists $ctrl_chars{$i}){ + if(@bins == 1){ + $str = "Control character: $ctrl_chars{$i}"; + last; + } + else{ return } # only allow a single binary to unicode char + } # Assume ascii if out of range or explicitly requested. # This will work for characters all in the same string # but will not print the right non-ascii characters *if* From 0a602ad68f71c213ef22571780bff8603e6d78bc Mon Sep 17 00:00:00 2001 From: Zach Thompson Date: Sun, 20 Mar 2016 12:15:00 -0600 Subject: [PATCH 02/10] Alter logic. Translate control characters if more than one and check for word characters in end string. 
This allows for spaces as we used to do --- lib/DDG/Goodie/Bin2Unicode.pm | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/DDG/Goodie/Bin2Unicode.pm b/lib/DDG/Goodie/Bin2Unicode.pm index f944c9bd2a6..e87f34dd19b 100644 --- a/lib/DDG/Goodie/Bin2Unicode.pm +++ b/lib/DDG/Goodie/Bin2Unicode.pm @@ -89,26 +89,23 @@ handle matches => sub { my $q = $_; # orginal query my $bin_string = shift @_; # captured binary string + my $str; + if($bin_string eq $zaahirs_hideout){ + $str = q{Congratulations, you've discovered Zaahir's hideout!}; + goto DONE; + } my $want_ascii = $q =~ /\bascii\b/; my @bins = $bin_string =~ /([01]+|\s+)/g; - my $str; for my $b (@bins){ if($b =~ /^[01]+$/){ return if length($b) % 8; # Overflow/non-portable warnings expected my $i = oct("0b$b"); - if($bin_string eq $zaahirs_hideout){ - $str = q{Congratulations, you've discovered Zaahir's hideout!}; + if((exists $ctrl_chars{$i}) && (@bins == 1)){ + $str = "Control character: $ctrl_chars{$i}"; last; } - elsif(exists $ctrl_chars{$i}){ - if(@bins == 1){ - $str = "Control character: $ctrl_chars{$i}"; - last; - } - else{ return } # only allow a single binary to unicode char - } # Assume ascii if out of range or explicitly requested. # This will work for characters all in the same string # but will not print the right non-ascii characters *if* @@ -122,6 +119,10 @@ handle matches => sub { } } + DONE: + + return unless $str =~ /\w+/; + return "Binary '$bin_string' converted to " . $want_ascii ? 'ascii' : 'unicode' . 
" is '$str'", structured_answer => { id => 'bin2unicode', From 45acabc018514302f11918ec3410bef7b7aa696d Mon Sep 17 00:00:00 2001 From: Zach Thompson Date: Sun, 20 Mar 2016 12:39:17 -0600 Subject: [PATCH 03/10] Check for all control/space characters --- lib/DDG/Goodie/Bin2Unicode.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/DDG/Goodie/Bin2Unicode.pm b/lib/DDG/Goodie/Bin2Unicode.pm index e87f34dd19b..01e9d2097ec 100644 --- a/lib/DDG/Goodie/Bin2Unicode.pm +++ b/lib/DDG/Goodie/Bin2Unicode.pm @@ -121,7 +121,7 @@ handle matches => sub { DONE: - return unless $str =~ /\w+/; + return if $str =~ /^[\x{0}-\x{20}\x{7F}-\x{9F}]+$/; # all control/space return "Binary '$bin_string' converted to " . $want_ascii ? 'ascii' : 'unicode' . " is '$str'", structured_answer => { From 07f2d3a08e6d7cecfd8d7e74ca77504cbab119c3 Mon Sep 17 00:00:00 2001 From: Zach Thompson Date: Sun, 20 Mar 2016 12:41:21 -0600 Subject: [PATCH 04/10] Add link --- lib/DDG/Goodie/Bin2Unicode.pm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/DDG/Goodie/Bin2Unicode.pm b/lib/DDG/Goodie/Bin2Unicode.pm index 01e9d2097ec..5adb41f71ae 100644 --- a/lib/DDG/Goodie/Bin2Unicode.pm +++ b/lib/DDG/Goodie/Bin2Unicode.pm @@ -121,7 +121,8 @@ handle matches => sub { DONE: - return if $str =~ /^[\x{0}-\x{20}\x{7F}-\x{9F}]+$/; # all control/space + # return if all control/space (https://en.wikipedia.org/wiki/List_of_Unicode_characters#Control_codes) + return if $str =~ /^[\x{0}-\x{20}\x{7F}-\x{9F}]+$/; return "Binary '$bin_string' converted to " . $want_ascii ? 'ascii' : 'unicode' . 
" is '$str'", structured_answer => { From 77f4032b5512810b0046ebc8dcb5af615f76a890 Mon Sep 17 00:00:00 2001 From: Zach Thompson Date: Sun, 20 Mar 2016 12:58:50 -0600 Subject: [PATCH 05/10] Update regex to be easier to understand --- lib/DDG/Goodie/Bin2Unicode.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/DDG/Goodie/Bin2Unicode.pm b/lib/DDG/Goodie/Bin2Unicode.pm index 5adb41f71ae..794fa42b389 100644 --- a/lib/DDG/Goodie/Bin2Unicode.pm +++ b/lib/DDG/Goodie/Bin2Unicode.pm @@ -122,7 +122,7 @@ handle matches => sub { DONE: # return if all control/space (https://en.wikipedia.org/wiki/List_of_Unicode_characters#Control_codes) - return if $str =~ /^[\x{0}-\x{20}\x{7F}-\x{9F}]+$/; + return if $str =~ /^[\p{Control}]\s]+$/; return "Binary '$bin_string' converted to " . $want_ascii ? 'ascii' : 'unicode' . " is '$str'", structured_answer => { From e479164c70370e9befd2948b4b7cf778c6973083 Mon Sep 17 00:00:00 2001 From: Zach Thompson Date: Sun, 20 Mar 2016 13:08:44 -0600 Subject: [PATCH 06/10] Only need space, not class of spaces --- lib/DDG/Goodie/Bin2Unicode.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/DDG/Goodie/Bin2Unicode.pm b/lib/DDG/Goodie/Bin2Unicode.pm index 794fa42b389..c8c51457dcd 100644 --- a/lib/DDG/Goodie/Bin2Unicode.pm +++ b/lib/DDG/Goodie/Bin2Unicode.pm @@ -122,7 +122,7 @@ handle matches => sub { DONE: # return if all control/space (https://en.wikipedia.org/wiki/List_of_Unicode_characters#Control_codes) - return if $str =~ /^[\p{Control}]\s]+$/; + return if $str =~ /^[\p{Control}] ]+$/; return "Binary '$bin_string' converted to " . $want_ascii ? 'ascii' : 'unicode' . 
" is '$str'", structured_answer => { From 6a73236464b01083f4f2d8b1afed951a6c4abaf1 Mon Sep 17 00:00:00 2001 From: Zach Thompson Date: Sun, 20 Mar 2016 13:20:50 -0600 Subject: [PATCH 07/10] space/delete not strictly control characters --- lib/DDG/Goodie/Bin2Unicode.pm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/DDG/Goodie/Bin2Unicode.pm b/lib/DDG/Goodie/Bin2Unicode.pm index c8c51457dcd..07c0ad5c784 100644 --- a/lib/DDG/Goodie/Bin2Unicode.pm +++ b/lib/DDG/Goodie/Bin2Unicode.pm @@ -103,7 +103,8 @@ handle matches => sub { # Overflow/non-portable warnings expected my $i = oct("0b$b"); if((exists $ctrl_chars{$i}) && (@bins == 1)){ - $str = "Control character: $ctrl_chars{$i}"; + $str = $ctrl_chars{$i}; + $str = "Control character: $str" unless ($i == 32) || ($i == 127); last; } # Assume ascii if out of range or explicitly requested. From e3fa7ad6e160ab40660635ab948a89c0ebbfa19c Mon Sep 17 00:00:00 2001 From: Zach Thompson Date: Sun, 20 Mar 2016 15:06:24 -0600 Subject: [PATCH 08/10] Remove extra bracket from regex Add test for control characters --- lib/DDG/Goodie/Bin2Unicode.pm | 2 +- t/Bin2Unicode.t | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/DDG/Goodie/Bin2Unicode.pm b/lib/DDG/Goodie/Bin2Unicode.pm index 07c0ad5c784..130193a724f 100644 --- a/lib/DDG/Goodie/Bin2Unicode.pm +++ b/lib/DDG/Goodie/Bin2Unicode.pm @@ -123,7 +123,7 @@ handle matches => sub { DONE: # return if all control/space (https://en.wikipedia.org/wiki/List_of_Unicode_characters#Control_codes) - return if $str =~ /^[\p{Control}] ]+$/; + return if $str =~ /^[\p{Control} ]+$/; return "Binary '$bin_string' converted to " . $want_ascii ? 'ascii' : 'unicode' . 
" is '$str'", structured_answer => { diff --git a/t/Bin2Unicode.t b/t/Bin2Unicode.t index c04f9f9a8d6..5975a801585 100644 --- a/t/Bin2Unicode.t +++ b/t/Bin2Unicode.t @@ -59,7 +59,9 @@ ddg_goodie_test( 'Muchas Gracias CompasMð', 1)), '010101' => undef, - '201 to text' => undef + '201 to text' => undef, + '00000000000000001' => undef, + '00000001 00000010' => undef ); done_testing; From 77e72484421750b2bdb0c7e297318a76bf7ced16 Mon Sep 17 00:00:00 2001 From: Zach Thompson Date: Sun, 20 Mar 2016 15:29:03 -0600 Subject: [PATCH 09/10] Test for all multiple control character sequences failing test for single control character conversions returning --- t/Bin2Unicode.t | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/t/Bin2Unicode.t b/t/Bin2Unicode.t index 5975a801585..76ba4815568 100644 --- a/t/Bin2Unicode.t +++ b/t/Bin2Unicode.t @@ -29,6 +29,12 @@ sub gen_struc_ans { }; } +my %ctrl_tests; +for (1..32, 127..159){ + my $b = sprintf '%08b', $_; + $ctrl_tests{join ' ', $b, $b} = undef; +} + ddg_goodie_test( [qw( DDG::Goodie::Bin2Unicode )], '0110100001100101011011000110110001101111 to text' => test_zci(gen_struc_ans( @@ -58,10 +64,19 @@ ddg_goodie_test( '0100110101110101011000110110100001100001011100110010000001000111011100100110000101100011011010010110000101110011001000000100001101101111011011010111000001100001011100110100110111110000', 'Muchas Gracias CompasMð', 1)), + '00000000' => test_zci(gen_struc_ans( + '00000000', + '00000000', + 'Control character: Null character (NUL)', + 0)), + '000000000000000000100000' => test_zci(gen_struc_ans( + '000000000000000000100000', + '000000000000000000100000', + 'Space (SP)', + 0)), '010101' => undef, '201 to text' => undef, - '00000000000000001' => undef, - '00000001 00000010' => undef + %ctrl_tests ); done_testing; From f21a1dd1f9e65dd834d605b94b46a83006a921d0 Mon Sep 17 00:00:00 2001 From: Zach Thompson Date: Sun, 20 Mar 2016 15:59:33 -0600 Subject: [PATCH 10/10] Put label below regex --- 
lib/DDG/Goodie/Bin2Unicode.pm | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/DDG/Goodie/Bin2Unicode.pm b/lib/DDG/Goodie/Bin2Unicode.pm index 130193a724f..914c827c7e5 100644 --- a/lib/DDG/Goodie/Bin2Unicode.pm +++ b/lib/DDG/Goodie/Bin2Unicode.pm @@ -120,11 +120,10 @@ handle matches => sub { } } - DONE: - # return if all control/space (https://en.wikipedia.org/wiki/List_of_Unicode_characters#Control_codes) return if $str =~ /^[\p{Control} ]+$/; + DONE: return "Binary '$bin_string' converted to " . $want_ascii ? 'ascii' : 'unicode' . " is '$str'", structured_answer => { id => 'bin2unicode',