Skip to content

Commit

Permalink
[SEDONA-497] Fix incorrect fieldNames property of SpatialRDD read fro…
Browse files Browse the repository at this point in the history
…m a directory containing multiple shapefiles (apache#1243)
  • Loading branch information
Kontinuation authored Feb 19, 2024
1 parent b8f9fd3 commit 12d3dab
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -178,10 +178,16 @@ public static List<String> readFieldNames(JavaSparkContext sc, String inputPath)
fieldDescriptors = fieldDescriptors.reduceByKey(new Function2<String, String, String>()
{
@Override
public String call(String descripter1, String descripter2)
public String call(String descriptor1, String descriptor2)
throws Exception
{
return descripter1 + " " + descripter2;
if (!descriptor1.equals(descriptor2)) {
String message = String.format("Detected different schema in the input shapefiles:\n %s\n %s\n" +
"Please make sure all shapefiles have the same schema.",
descriptor1, descriptor2);
throw new IOException(message);
}
return descriptor1;
}
});
// If the reduction produced a result, assign it to the variable fieldNames
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

package org.apache.sedona.core.formatMapper.shapefileParser.shapes;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.LocatedFileStatus;
Expand All @@ -35,6 +36,7 @@
import org.apache.sedona.core.spatialRDD.PointRDD;
import org.apache.sedona.core.spatialRDD.PolygonRDD;
import org.apache.sedona.core.spatialRDD.SpatialRDD;
import org.apache.spark.SparkException;
import org.geotools.data.DataStore;
import org.geotools.data.DataStoreFinder;
import org.geotools.data.FeatureSource;
Expand Down Expand Up @@ -62,6 +64,8 @@
import java.util.Map;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;

public class ShapefileReaderTest
extends TestBase
Expand Down Expand Up @@ -392,5 +396,27 @@ public void testReadMultipleShapeFilesByMultiPartitions()
// load shapes with our tool
SpatialRDD shapeRDD = ShapefileReader.readToGeometryRDD(sc, inputLocation);
assert (shapeRDD.rawSpatialRDD.getNumPartitions() == 2);
assertEquals("[STATEFP, COUNTYFP, COUNTYNS, AFFGEOID, GEOID, NAME, LSAD, ALAND, AWATER]", shapeRDD.fieldNames.toString());
}

/**
 * Verifies that reading a directory whose shapefiles do not share one schema fails.
 *
 * <p>Two shapefile datasets ("multipleshapefiles" and "gis_osm_pois_free_1") are copied
 * into a single scratch directory, which is then read through
 * {@link ShapefileReader#readToGeometryRDD}. The read is expected to throw a
 * {@link SparkException} whose message contains "different schema". The scratch
 * directory is deleted afterwards, whether or not the assertions pass.
 *
 * @throws IOException if copying the fixtures or deleting the scratch directory fails
 */
@Test
public void testReadMultipleShapeFilesWithInconsistentSchema() throws IOException {
    final String mergedLocation = getShapeFilePath("multipleshapefiles") + "-multischema";
    try {
        // Resolve both fixture directories before touching the file system.
        final File firstDataset = new File(getShapeFilePath("multipleshapefiles"));
        final File secondDataset = new File(getShapeFilePath("gis_osm_pois_free_1"));
        final File mergedDir = new File(mergedLocation);

        // Both datasets land in the same directory, so it holds shapefiles with different schemas.
        FileUtils.copyDirectory(firstDataset, mergedDir);
        FileUtils.copyDirectory(secondDataset, mergedDir);

        SparkException failure = assertThrows(
                SparkException.class,
                () -> ShapefileReader.readToGeometryRDD(sc, mergedLocation));
        String failureMessage = failure.getMessage();
        assertTrue(failureMessage.contains("different schema"));
    } finally {
        // Remove the scratch directory only if it was actually created.
        File scratchDir = new File(mergedLocation);
        if (scratchDir.exists()) {
            FileUtils.deleteDirectory(scratchDir);
        }
    }
}
}

0 comments on commit 12d3dab

Please sign in to comment.