From 5d3f206ae36a3699a7a923cea92d080a22b95711 Mon Sep 17 00:00:00 2001
From: Oleksiy Kovyrin <alexey@kovyrin.net>
Date: Wed, 5 Mar 2014 18:52:55 -0500
Subject: [PATCH 1/2] Ignore build dirs and resulting binary

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 .gitignore
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..df74212
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+/json2avro
+/avrolib
+/avro-c/build

From c18f4352c154ac16bb7cc60c39c8ca2716277370 Mon Sep 17 00:00:00 2001
From: Oleksiy Kovyrin <alexey@kovyrin.net>
Date: Wed, 5 Mar 2014 19:20:27 -0500
Subject: [PATCH 2/2] Add an option to read avro schema from an external file.

In addition to the new -S <file> option, this commit adds more argument error handling,
much mroe detailed help information, memory stats feature flag help and a check that makes
sure memory stats are not executed on non-linux systems.
---
 README.md   |  22 +++++++++--
 json2avro.c | 107 ++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 109 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 83d0696..6db7ae2 100644
--- a/README.md
+++ b/README.md
@@ -32,9 +32,23 @@ and starts parsing afresh. (This behavior can be turned off with the
 ## Usage
 
 ```sh
-./json2avro
-Usage: ./json2avro [-c null|snappy|deflate|lzma] [-b <block_size (dft: 16384)>] [-d] [-j] [-x (abort on error)] -s <schema> [<infile.json>] <outfile.avro|->
-If infile.json is not specified, stdin is assumed. outfile.avro of '-' is stdout.
+$ ./json2avro -h
+Usage: ./json2avro [options] [input_file.json] <output_file.avro>
+
+Where options are:
+ -s schema (required) Avro schema to use for conversion.
+ -S file   (required) JSON file to read the avro schema from.
+ -c algo   (optional) Set output compression algorithm: null, snappy, deflate, lzma
+                      Default: no compression
+ -b bytes  (optional) Set output block size in bytes. Default: 16384
+ -d        (optional) Turn on debug mode.
+ -j        (optional) Dump unexpected JSON objects as strings.
+ -x        (optional) Abort on JSON parsing errors. Default: skip invalid json.
+ -z bytes  (optional) Maximum JSON string size. Default: no limit.
+ -m        (optional) Linux only, enable periodic memory stats information output.
+ -h                   Show this help and exit.
+
+If infile.json is not specified, STDIN is assumed. outfile.avro of '-' means STDOUT.
 ```
 
 ## Example
@@ -49,7 +63,7 @@ If we have the following JSON called `input.json`:
 "a_fixed":"abcd", "an_int_array":[123, 456, -32, 0, 12],
 "a_float_map":{"foo":2.345, "bar":-3.456}}    {"a_null":null,
 "a_bool":false, "an_int":54321, "a_long":9876543212,
-"a_float":7.654321, "a_double":8.76543217654321E7, 
+"a_float":7.654321, "a_double":8.76543217654321E7,
 "a_string":"foo bar",
 "random_bytes":"\u0006K\u0007\nV@H#3\u001ad\u001a\u0006",
 "a_fixed":"dcba", "an_int_array":[321, 654, -23, 0, 21],
diff --git a/json2avro.c b/json2avro.c
index a9a8a9e..ba391cd 100644
--- a/json2avro.c
+++ b/json2avro.c
@@ -1,6 +1,6 @@
 /*
  * Copyright 2013 Gregory Trubetskoy
- * 
+ *
  * Licensed under the Apache License, Version 2.0 (the "License"); you
  * may not use this file except in compliance with the License.  You
  * may obtain a copy of the License at
@@ -28,6 +28,38 @@
 #include <jansson.h>
 #include <avro.h>
 
+#define MAX_SCHEMA_LEN ((off_t) 1024*1024)
+
+char *read_schema_file(char *file_name) {
+    FILE *schema_file = fopen(file_name, "rt");
+    if (errno != 0) {
+        fprintf(stderr, "Could not find or access file: %s\n", file_name);
+        return 0;
+    }
+
+    // Get file size
+    fseek(schema_file, 0, SEEK_END);
+    off_t file_size = ftell(schema_file);
+    fseek(schema_file, 0, SEEK_SET);
+
+    if (file_size == 0) {
+        fprintf(stderr, "Empty schema file: %s\n", file_name);
+        return 0;
+    }
+
+    if (file_size > MAX_SCHEMA_LEN) {
+        fprintf(stderr, "Schema file size is too big: %lld bytes > %lld maximum supported length\n", file_size, MAX_SCHEMA_LEN);
+        return 0;
+    }
+
+    // Allocate buffer for the schema and read the data
+    char *buf = (char*) malloc(file_size);
+    fread(buf, 1, file_size, schema_file);
+    fclose(schema_file);
+
+    return buf;
+}
+
 void memory_status() {
     /* This is obviously Linux-only */
     const char* statm_path = "/proc/self/statm";
@@ -38,7 +70,7 @@ void memory_status() {
     fclose(f);
 }
 
-int schema_traverse(const avro_schema_t schema, json_t *json, json_t *dft, 
+int schema_traverse(const avro_schema_t schema, json_t *json, json_t *dft,
                     avro_value_t *current_val, int quiet, int strjson, size_t max_str_sz) {
 
     json = json ? json : dft;
@@ -67,7 +99,7 @@ int schema_traverse(const avro_schema_t schema, json_t *json, json_t *dft,
 
             avro_value_t field;
             avro_value_get_by_index(current_val, i, &field, NULL);
-            
+
             if (schema_traverse(field_schema, json_val, dft, &field, quiet, strjson, max_str_sz))
                 return 1;
         }
@@ -170,7 +202,7 @@ int schema_traverse(const avro_schema_t schema, json_t *json, json_t *dft,
                 fprintf(stderr, "ERROR: Expecting JSON null for Avro null, got something else\n");
             return 1;
         }
-        avro_value_set_null(current_val); 
+        avro_value_set_null(current_val);
         break;
 
     case AVRO_ENUM:
@@ -250,7 +282,7 @@ int schema_traverse(const avro_schema_t schema, json_t *json, json_t *dft,
     return 0;
 }
 
-void process_file(FILE *input, avro_file_writer_t out, avro_schema_t schema, 
+void process_file(FILE *input, avro_file_writer_t out, avro_schema_t schema,
                   int verbose, int memstat, int errabort, int strjson, size_t max_str_sz) {
 
     json_error_t err;
@@ -286,7 +318,7 @@ void process_file(FILE *input, avro_file_writer_t out, avro_schema_t schema,
 
         } else
             fprintf(stderr, "Error processing record %d, skipping...\n", n);
-        
+
         avro_value_iface_decref(iface);
         avro_value_decref(&record);
 
@@ -297,12 +329,38 @@ void process_file(FILE *input, avro_file_writer_t out, avro_schema_t schema,
         json = json_loadf(input, JSON_DISABLE_EOF_CHECK, &err);
     }
 
+    if (memstat) memory_status();
 
     avro_schema_decref(schema);
 }
 
-int main(int argc, char *argv[]) {
+void print_help(char *program_name) {
+    fprintf(stderr, "Usage: %s [options] [input_file.json] <output_file.avro>\n", program_name);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "Where options are:\n");
+    fprintf(stderr, " -s schema (required) Avro schema to use for conversion.\n");
+    fprintf(stderr, " -S file   (required) JSON file to read the avro schema from.\n");
+    fprintf(stderr, " -c algo   (optional) Set output compression algorithm: null, snappy, deflate, lzma\n");
+    fprintf(stderr, "                      Default: no compression\n");
+    fprintf(stderr, " -b bytes  (optional) Set output block size in bytes. Default: 16384\n");
+    fprintf(stderr, " -d        (optional) Turn on debug mode.\n");
+    fprintf(stderr, " -j        (optional) Dump unexpected JSON objects as strings.\n");
+    fprintf(stderr, " -x        (optional) Abort on JSON parsing errors. Default: skip invalid json.\n");
+    fprintf(stderr, " -z bytes  (optional) Maximum JSON string size. Default: no limit.\n");
+    fprintf(stderr, " -m        (optional) Linux only, enable periodic memory stats information output.\n");
+    fprintf(stderr, " -h                   Show this help and exit.\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "If infile.json is not specified, STDIN is assumed. outfile.avro of '-' means STDOUT.\n");
+    fprintf(stderr, "\n");
+}
+
+void usage_error(char *program_name, char *message) {
+    if (message) fprintf(stderr, "ERROR: %s\n\n", message);
+    print_help(program_name);
+    exit(EXIT_FAILURE);
+}
 
+int main(int argc, char *argv[]) {
     FILE *input;
 
     avro_schema_t schema;
@@ -319,11 +377,14 @@ int main(int argc, char *argv[]) {
     extern char *optarg;
     extern int optind, optopt;
 
-    while ((opt = getopt(argc, argv, "c:s:b:z:dmxj")) != -1) {
+    while ((opt = getopt(argc, argv, "c:s:S:b:z:dmxjh")) != -1) {
         switch (opt) {
-        case 's': 
+        case 's':
             schema_arg = optarg;
             break;
+        case 'S':
+            schema_arg = read_schema_file(optarg);
+            break;
         case 'b':
             block_sz = strtol(optarg, &endptr, 0);
             if (*endptr) {
@@ -338,7 +399,7 @@ int main(int argc, char *argv[]) {
                 opterr++;
             }
             break;
-        case 'c': 
+        case 'c':
             codec = optarg;
             break;
         case 'd':
@@ -351,8 +412,15 @@ int main(int argc, char *argv[]) {
             strjson = 1;
             break;
         case 'm':
-            memstat = 1;
+            #if defined(__linux__)
+              memstat = 1;
+            #else
+              usage_error(argv[0], "Memory stats is a Linux-only feature!");
+            #endif
             break;
+        case 'h':
+            print_help(argv[0]);
+            exit(0);
         case ':':
             fprintf(stderr, "ERROR: Option -%c requires an operand\n", optopt);
             opterr++;
@@ -363,11 +431,17 @@ int main(int argc, char *argv[]) {
         }
     }
 
-    if ((argc - optind) < 1 || (argc - optind) > 2 || opterr || !schema_arg) {
-        fprintf(stderr, "Usage: %s [-c null|snappy|deflate|lzma] [-b <block_size (dft: 16384)>] [-d] [-j] [-x (abort on error)] [-z <max_str_sz>] -s <schema> [<infile.json>] <outfile.avro|->\n", argv[0]);
-        fprintf(stderr, "If infile.json is not specified, stdin is assumed. outfile.avro of '-' is stdout.\n");
-        exit(EXIT_FAILURE);
+    int file_args_cnt = (argc - optind);
+    if (file_args_cnt == 0) {
+        usage_error(argv[0], "Please provide at least one file name argument");
     }
+    if (file_args_cnt > 2) {
+        fprintf(stderr, "Too many file name arguments: %d!\n", file_args_cnt);
+        usage_error(argv[0], 0);
+    }
+
+    if (opterr) usage_error(argv[0], 0);
+    if (!schema_arg) usage_error(argv[0], "Please provide correct schema!");
 
     if (!codec) codec = "null";
     else if (strcmp(codec, "snappy") && strcmp(codec, "deflate") && strcmp(codec, "lzma") && strcmp(codec, "null")) {
@@ -382,7 +456,8 @@ int main(int argc, char *argv[]) {
         outpath = argv[optind+1];
         input = fopen(argv[optind], "rb");
         if ( errno != 0 ) {
-            perror("ERROR: Cannot open file");
+            fprintf(stderr, "ERROR: Cannot open input file: %s: ", argv[optind]);
+            perror(0);
             exit(EXIT_FAILURE);
         }
     }