Skip to content

Commit

Permalink
Merge pull request #5 from ekorkut/ekorkut1_hashsign
Browse files Browse the repository at this point in the history
Hash sign in urls is not encoded and more strict error check for order and number of metadata columns
  • Loading branch information
ekorkut authored Nov 22, 2017
2 parents 1c62ec0 + d68bbd4 commit 02cff71
Show file tree
Hide file tree
Showing 21 changed files with 378 additions and 22 deletions.
13 changes: 12 additions & 1 deletion pycsvw/csvw.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@

from . import nt_serializer
from .csvw_exceptions import NoDefaultOrValueUrlError, \
BothDefaultAndValueUrlError, BothLangAndDatatypeError, RiotWarning, RiotError
BothDefaultAndValueUrlError, BothLangAndDatatypeError, \
VirtualColumnPrecedesNonVirtualColumn, RiotWarning, RiotError

READ_PERMISSIONS = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH

Expand Down Expand Up @@ -92,6 +93,16 @@ def _read_metadata(handle):

col["aboutUrl"] = col.get("aboutUrl", None)
col["suppressOutput"] = col.get("suppressOutput", False)
# Non-virtual columns should precede virtual columns
virtual_seen_yet = False
for col in table_schema["columns"]:
if not virtual_seen_yet and col["virtual"]:
virtual_seen_yet = True
continue
if virtual_seen_yet and not col["virtual"]:
raise VirtualColumnPrecedesNonVirtualColumn(
"Non-virtual column {} comes after a virtual column. "
"All virtual columns should come after non-virtual columns.".format(col))

return out

Expand Down
16 changes: 16 additions & 0 deletions pycsvw/csvw_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,22 @@ class NoDefaultOrValueUrlError(Exception):
pass


class NumberOfNonVirtualColumnsMismatch(Exception):
    """
    Raised when the number of non-virtual columns declared in the metadata
    differs from the number of columns found in the CSV file.
    """


class VirtualColumnPrecedesNonVirtualColumn(Exception):
    """
    Raised when the specified metadata lists a virtual column before a
    non-virtual column.
    """


class BothDefaultAndValueUrlError(Exception):
"""
The exception thrown when a virtual column specifies both
Expand Down
10 changes: 9 additions & 1 deletion pycsvw/nt_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@

from .generator_utils import process_dates_times, DATATYPE_MAP, read_csv
from .csvw_exceptions import NullValueException, BothValueAndLiteralError, \
BothValueAndDatatypeError, NoValueOrLiteralError, InvalidItemError
BothValueAndDatatypeError, NoValueOrLiteralError, InvalidItemError, \
NumberOfNonVirtualColumnsMismatch
from .rdf_utils import is_null_value, get_column_map, get_subject_for_cell, \
get_predicate_for_cell, apply_all_subs

Expand Down Expand Up @@ -271,6 +272,7 @@ def serialize(tables, md_tables, custom_prefixes, output_obj):
'prefixes': custom_prefixes
}
}
num_nonvirtual_columns = sum([1 for x in metadata["tableSchema"]["columns"] if not x["virtual"]])
# Read the csv file fresh after rewinding the file
table_file_obj = tables[table_url]
table_file_obj.seek(0)
Expand All @@ -279,4 +281,10 @@ def serialize(tables, md_tables, custom_prefixes, output_obj):
next(table_csv_reader) # Ignore header

for row_num, row in enumerate(table_csv_reader):
if len(row) != num_nonvirtual_columns:
raise NumberOfNonVirtualColumnsMismatch(
"The number of non-virtual columns in metadata, {}, "
"do not match with the number of columns in row {}, {}, "
"of the csv file '{}'.".format(
num_nonvirtual_columns, row_num + 1, len(row), table_url))
write_row(output_obj, str(row_num + 1), row, table_info)
4 changes: 2 additions & 2 deletions pycsvw/rdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from .csvw_exceptions import NullValueException, MissingColumnError, FailedSubstitutionError


SUB_PATTERN = re.compile(r'{(\w+)}')
SUB_PATTERN = re.compile(r'{([A-Za-z0-9_\-# /:]+)}')


def is_null_value(val, null_values):
Expand Down Expand Up @@ -60,7 +60,7 @@ def apply_sub(url, row, column_name_to_sub, column_info, quote_sub=True):

rep_before = "{" + column_name_to_sub + "}"
if quote_sub:
return url.replace(rep_before, quote(rep_after.encode('utf-8'), safe=':/'))
return url.replace(rep_before, quote(rep_after.encode('utf-8'), safe=':/#'))
else:
return url.replace(rep_before, rep_after)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

setup(
name='pycsvw',
version="1.0.1",
version="1.0.2",
description='Generate JSON and RDF from csv files with metadata',
url='https://github.com/bloomberg/pycsvw',
author='Dev Ramudit, Erman Korkut',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"@context": "http://www.w3.org/ns/csvw",
"url": "http://example.org/simple.csv",
"tableSchema": {
"columns": [{
"titles": "t1"
},{
"titles": "t2"
}]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"@context": "http://www.w3.org/ns/csvw",
"url": "http://example.org/simple.csv",
"tableSchema": {
"columns": [{
"titles": "t1"
},{
"titles": "t2"
},{
"titles": "t3"
}, {
"titles": "t4"
}
]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"@context": "http://www.w3.org/ns/csvw",
"url": "http://example.org/simple.csv",
"tableSchema": {
"columns": [{
"titles": "t1"
},
{
"name": "v1",
"virtual": true,
"aboutUrl": "owl:sub-{_row}",
"propertyUrl": "owl:obj-{_row}",
"valueUrl": "owl:pred-{_row}"
}, {
"titles": "t2"
}]
}
}
3 changes: 3 additions & 0 deletions tests/negative.metadata_mismatch.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
expense,description,amount
taxi,from conference to hotel,20
fee,conference registration fee,50
109 changes: 109 additions & 0 deletions tests/rdf/test_url_special_chars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Copyright 2017 Bloomberg Finance L.P.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This test covers the cases where the csv file and metadata work with url safe characters like #, /, :
"""

from rdflib import ConjunctiveGraph, Literal, URIRef, Namespace

from pycsvw import CSVW

PRE_NS = Namespace('http://www.example.org/')


def test_url_safe_chars():
    """
    Convert the special-character CSV fixture to RDF and verify every row.

    Each case below describes one CSV row as a tuple of:
    the expected subject URI, the cell values in the order expected by
    ``verify_non_virtual_columns`` (note that the ``t_`` value comes before
    the ``t `` value), the raw value of the last column, and that same value
    as it is expected to appear once encoded inside a URL.
    """
    csvw = CSVW(csv_path="tests/url_special_chars.csv",
                metadata_path="tests/url_special_chars.csv-metadata.json")

    graph = ConjunctiveGraph()
    graph.parse(data=csvw.to_rdf(), format="turtle")

    row_cases = [
        ('http://www.example.org/c#1/chash2/chash3/chash4/chash5/chash6',
         ['c#1', 'chash2', 'chash3', 'chash4', 'chash6', 'chash5'],
         '#/:- _r1', '#/:-%20_r1'),
        ('http://www.example.org/c/1/c/2/c/3/c/4/c/5/c/6',
         ['c/1', 'c/2', 'c/3', 'c/4', 'c/6', 'c/5'],
         '/#:- _r2', '/#:-%20_r2'),
        ('http://www.example.org/c:1/c:2/c:3/c:4/c:5/c:6',
         ['c:1', 'c:2', 'c:3', 'c:4', 'c:6', 'c:5'],
         ':#/-_ r3', ':#/-_%20r3'),
        ('http://www.example.org/c-1/c-2/c-3/c-4/c-5/c-6',
         ['c-1', 'c-2', 'c-3', 'c-4', 'c-6', 'c-5'],
         '-/#_ :r4', '-/#_%20:r4'),
        ('http://www.example.org/c%201/c%202/c%203/c%204/c%205/c%206',
         ['c 1', 'c 2', 'c 3', 'c 4', 'c 6', 'c 5'],
         ' -/#:_r5', '%20-/#:_r5'),
        ('http://www.example.org/c_1/c_2/c_3/c_4/c_5/c_6',
         ['c_1', 'c_2', 'c_3', 'c_4', 'c_6', 'c_5'],
         '_ /:#r6', '_%20/:#r6'),
    ]

    for subject_uri, cell_values, raw_value, encoded_value in row_cases:
        subject = URIRef(subject_uri)
        expected_literals = [Literal(value) for value in cell_values]
        verify_non_virtual_columns(subject, graph, expected_literals)
        verify_virtual_columns(subject, graph, raw_value, encoded_value)


def verify_virtual_columns(sub, g, orig_value_str, encoded_value_str):
    """
    Assert that the virtual-column triples for one row are present in ``g``.

    :param sub: subject of the row's ``v1``/``v2`` triples.
    :param g: graph queried through its ``triples`` method.
    :param orig_value_str: the raw value expected as the ``v1`` object.
    :param encoded_value_str: the value as expected inside predicate,
        object and subject URLs.
    """
    # v1: exactly one triple, whose object is the raw value.
    v1_name = "v1p{}".format(encoded_value_str)
    v1_matches = list(g.triples((sub, PRE_NS[v1_name], None)))
    assert len(v1_matches) == 1
    v1_predicate, v1_object = v1_matches[0][1], v1_matches[0][2]
    assert v1_name in str(v1_predicate)
    assert orig_value_str == str(v1_object)

    # v2: exactly one triple, whose object contains the encoded value.
    v2_name = "v2p{}".format(encoded_value_str)
    v2_matches = list(g.triples((sub, PRE_NS[v2_name], None)))
    assert len(v2_matches) == 1
    v2_predicate, v2_object = v2_matches[0][1], v2_matches[0][2]
    assert v2_name in str(v2_predicate)
    assert 'v2v{}'.format(encoded_value_str) in str(v2_object)

    # Standalone virtual column: it has its own subject, so look it up
    # by that subject and expect a single triple.
    v3_subject = URIRef('http://www.example.org/v3s{}'.format(encoded_value_str))
    v3_matches = list(g.triples((v3_subject, None, None)))
    assert len(v3_matches) == 1
    v3_predicate, v3_object = v3_matches[0][1], v3_matches[0][2]
    assert "v3p{}".format(encoded_value_str) in str(v3_predicate)
    assert 'v3v{}'.format(encoded_value_str) in str(v3_object)


def verify_non_virtual_columns(sub, g, literals):
    """
    Assert that the triples for the non-virtual columns of one row exist.

    :param sub: subject shared by all triples of the row.
    :param g: graph queried through its ``triples`` method.
    :param literals: six expected objects; indices 0-4 belong to the
        predicates ``t#p``, ``t/p``, ``t:p``, ``t-p`` and ``t_p``, and
        index 5 belongs to the predicate containing an encoded space.
    """
    row_triples = list(g.triples((sub, None, None)))
    assert len(row_triples) == 9

    plain_predicates = ('t#p', 't/p', 't:p', 't-p', 't_p')
    for position, predicate_name in enumerate(plain_predicates):
        assert (sub, PRE_NS[predicate_name], literals[position]) in row_triples

    # Space value: the predicate must carry the space percent-encoded,
    # never as a literal space character.
    space_matches = list(g.triples((sub, PRE_NS['t%20p'], None)))
    assert len(space_matches) == 1
    space_predicate = str(space_matches[0][1])
    assert "%20" in space_predicate
    assert " " not in space_predicate
    assert literals[5] == space_matches[0][2]








40 changes: 39 additions & 1 deletion tests/test_negative.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
import pytest

from pycsvw import CSVW
from pycsvw.csvw_exceptions import BothLangAndDatatypeError
from pycsvw.csvw_exceptions import BothLangAndDatatypeError, \
NumberOfNonVirtualColumnsMismatch, \
VirtualColumnPrecedesNonVirtualColumn


def test_single_csv():
Expand Down Expand Up @@ -136,6 +138,42 @@ def test_json_generation():
assert "JSON generation" in str(exc.value)


def test_metadata_mismatch():
    """
    Check the errors raised for metadata that does not fit the CSV file.

    Two metadata files declare a number of non-virtual columns that differs
    from the three columns of the CSV fixture, and one lists a virtual
    column before a non-virtual one.
    """
    csv_path = "tests/negative.metadata_mismatch.csv"

    # Both objects are built up front; the mismatch is only expected to
    # surface when the RDF is generated.
    mismatch_cases = [
        (CSVW(csv_path=csv_path,
              metadata_path="tests/negative.NumberOfNonVirtualColumnsMismatch1.csv-metadata.json"),
         "metadata, 2"),
        (CSVW(csv_path=csv_path,
              metadata_path="tests/negative.NumberOfNonVirtualColumnsMismatch2.csv-metadata.json"),
         "metadata, 4"),
    ]

    for csvw, expected_count_text in mismatch_cases:
        with pytest.raises(NumberOfNonVirtualColumnsMismatch) as exc:
            print(csvw.to_rdf())
        # Message checks go after the ``with`` block: statements placed
        # inside it after the raising call would never run.
        message = str(exc.value)
        assert expected_count_text in message
        assert "row 1, 3" in message

    # The ordering problem is expected already while constructing the object.
    with pytest.raises(VirtualColumnPrecedesNonVirtualColumn) as exc:
        CSVW(csv_path=csv_path,
             metadata_path='tests/negative.VirtualColumnPrecedesNonVirtualColumn.csv-metadata.json')
    assert "t2" in str(exc.value)

















Expand Down
7 changes: 7 additions & 0 deletions tests/url_special_chars.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
t#,t/,t:,t-,t ,t_,tnormal
c#1,chash2,chash3,chash4,chash5,chash6,#/:- _r1
c/1,c/2,c/3,c/4,c/5,c/6,/#:- _r2
c:1,c:2,c:3,c:4,c:5,c:6,:#/-_ r3
c-1,c-2,c-3,c-4,c-5,c-6,-/#_ :r4
c 1,c 2,c 3,c 4,c 5,c 6, -/#:_r5
c_1,c_2,c_3,c_4,c_5,c_6,_ /:#r6
Loading

0 comments on commit 02cff71

Please sign in to comment.