Skip to content

Commit

Permalink
21169: Infer feature attributes: Added an example CSV parser, fixed a…
Browse files Browse the repository at this point in the history
… bug that caused iterating over return values to order alphabetically, updated the unknown type defaults (#21)

- Altered the unknown feature return type to `{ type: "continuous",
data_type: "number", bounds: { allow_null: true } }`
  • Loading branch information
lancegliser authored Oct 2, 2024
1 parent fde04d8 commit 86dfea4
Show file tree
Hide file tree
Showing 7 changed files with 276 additions and 4 deletions.
3 changes: 3 additions & 0 deletions jest.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
export default {
preset: "ts-jest",
testEnvironment: "jsdom",
moduleNameMapper: {
"^d3-dsv$": "<rootDir>/node_modules/d3-dsv/dist/d3-dsv.min.js",
},
testMatch: ["**/?(*.)+(spec|test).ts?(x)"],
transform: {
"^.+\\.[tj]s$": [
Expand Down
79 changes: 79 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,14 @@
"@eslint/js": "^9.10.0",
"@rollup/plugin-terser": "^0.4.4",
"@rollup/plugin-typescript": "^11.1.6",
"@types/d3-dsv": "^3.0.7",
"@types/emscripten": "^1.39.10",
"@types/eslint__js": "^8.42.3",
"@types/jest": "^29.5.13",
"@types/node": "^18.15.2",
"@types/uuid": "^9.0.1",
"@typescript-eslint/parser": "^8.5.0",
"d3-dsv": "^3.0.1",
"eslint": "^9.10.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-prettier": "^5.2.1",
Expand Down
12 changes: 8 additions & 4 deletions src/features/sources/Base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,13 @@ export abstract class InferFeatureAttributesBase {

/* Entrypoint */
public async infer(options: InferFeatureAttributesOptions = {}): Promise<FeatureAttributesIndex> {
const attributes: Record<string, FeatureAttributes> = options.defaults || {};
const { ordinalFeatureValues = {}, dependentFeatures = {} } = options;
// Loop the columns into attributes immediately to get order assigned. Probably should be a Map...
const columns = await this.getFeatureNames();
const attributes: FeatureAttributesIndex = columns.reduce((attributes, column) => {
attributes[column] = (options.defaults?.[column] || {}) as FeatureAttributes;
return attributes;
}, {} as FeatureAttributesIndex);
const { ordinalFeatureValues = {}, dependentFeatures = {} } = options;

const getFeatureAttributes = async (featureName: string): Promise<FeatureAttributes | undefined> => {
const originalFeatureType = await this.getOriginalFeatureType(featureName);
Expand Down Expand Up @@ -232,7 +236,7 @@ export abstract class InferFeatureAttributesBase {
/* eslint-disable-next-line @typescript-eslint/no-unused-vars*/
featureName: string,
): Promise<FeatureAttributes> {
return { type: "nominal" };
return { type: "continuous", data_type: "number", bounds: { allow_null: true } };
}

/* Feature properties */
Expand All @@ -256,7 +260,7 @@ export abstract class InferFeatureAttributesBase {
if (!this.statistics[featureName]) {
throw new Error(`this.statistics[${featureName}] is undefined`);
}
return this.statistics[featureName]?.samples.at(0);
// Return the first sample that actually holds a value. The test has to be `&&`: with `||` every
// sample passes (no value is both null and undefined), so a leading null was handed back as the sample.
return this.statistics[featureName]?.samples.find((sample) => sample !== null && sample !== undefined);
}

/* Descriptive operations */
Expand Down
83 changes: 83 additions & 0 deletions src/features/sources/examples/CSV.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import { readFileSync } from "fs";
import type { FeatureAttributes, FeatureAttributesIndex } from "../../../types";
import type { AbstractDataType } from "../../base";
import { expectFeatureAttributesIndex } from "../../infer.test";
import { InferFeatureAttributesFromCSV, type InferFeatureAttributesFromCSVOptions } from "./CSV";

// Exercises the example CSV source against the asteroid fixture in src/tests/assets.
describe("features/sources/CSV", () => {
  describe("InferFeatureAttributesFromCSV", () => {
    it("isAcceptedSourceFormat should accept csv only", () => {
      // CSV text is handed over as one string, so any string is accepted.
      expect(InferFeatureAttributesFromCSV.isAcceptedSourceFormat("" as unknown as AbstractDataType)).toBe(true);
      // @ts-expect-error Invalid data type on purpose
      expect(InferFeatureAttributesFromCSV.isAcceptedSourceFormat([{}, {}])).toBe(false);
    });

    describe("infer", () => {
      const data = readFileSync("src/tests/assets/asteroids-sample.csv", "utf8");

      describe("asteroids", () => {
        // Header order of the fixture; the inferred attributes must come back in this order,
        // not sorted alphabetically.
        const columns = [
          "full_name",
          "a",
          "e",
          "G",
          "i",
          "om",
          "w",
          "q",
          "ad",
          "per_y",
          "data_arc",
          "condition_code",
          "n_obs_used",
          "H",
          "diameter",
          "extent",
          "albedo",
          "rot_per",
          "GM",
          "BV",
          "UB",
          "IR",
          "spec_B",
          "spec_T",
          "neo",
          "pha",
          "moid",
        ];

        // IR is blank in every fixture row, so it exercises the defaults used when a type cannot be inferred.
        const expectIRFeature = (attributes: FeatureAttributes) => {
          expect(attributes.type).toBe("continuous");
          expect(attributes.data_type).toBe("number");
          expect(attributes.bounds?.allow_null).toBe(true);
        };

        // Shared expectations for this dataset: column order is preserved and IR gets the unknown-type defaults.
        const expectAsteroids = (features: FeatureAttributesIndex) => {
          expect(Object.keys(features)).toStrictEqual(columns);
          expectIRFeature(features.IR);
        };

        it("should infer feature attributes from data", async () => {
          const service = new InferFeatureAttributesFromCSV(data);
          const features = await service.infer();
          expectFeatureAttributesIndex(features);
          expectAsteroids(features);
        });

        it("should infer feature attributes from data using options", async () => {
          const serviceOptions: InferFeatureAttributesFromCSVOptions = {
            limit: 5,
            samplesLimit: 2,
          };
          const service = new InferFeatureAttributesFromCSV(data, serviceOptions);
          expect(service.samples?.length).toBe(serviceOptions.samplesLimit);

          const features = await service.infer({ includeSample: true });
          expectFeatureAttributesIndex(features);
          // Column order is asserted inside expectAsteroids, so it is not repeated here.
          expectAsteroids(features);
        });
      });
    });
  });
});
64 changes: 64 additions & 0 deletions src/features/sources/examples/CSV.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* This class is an example implementation of a CSV parser.
* It provides inspiration, and an easy means for testing our own CSV files.
* Your implementation will vary, especially in options.
*/

import { autoType, csvParse } from "d3-dsv";
import type { AbstractDataType, FeatureSourceFormat } from "../../base";
import { InferFeatureAttributesFromArray } from "../Array";

/** Options accepted by the InferFeatureAttributesFromCSV constructor. */
export type InferFeatureAttributesFromCSVOptions = {
/** Maximum number of parsed rows, counted from the top of the file, that are used for inference. All rows are used when omitted. */
limit?: number;
/** The number of samples to be returned. Default: 5 */
samplesLimit?: number;
};
/**
 * Example inference source that accepts raw CSV text.
 * The text is parsed with d3-dsv, optionally truncated to `limit` rows, and handed to the
 * array-based implementation as `{ columns, data }`.
 */
export class InferFeatureAttributesFromCSV extends InferFeatureAttributesFromArray {
  public static sourceFormat: FeatureSourceFormat = "parsed_array";
  /**
   * The first `samplesLimit` rows, with numeric strings converted to numbers and blank cells to `null`.
   * Left unset when `samplesLimit` is 0 or negative.
   */
  public readonly samples: ReturnType<typeof samplesAutoType>[] | undefined;

  /** Returns true when `data` is a string, the form in which a CSV document is supplied. */
  public static isAcceptedSourceFormat(data: AbstractDataType): boolean {
    return typeof data === "string";
  }

  /**
   * @param dataset CSV text whose first line is the header row.
   * @param options Row and sample limits. The object is only read, never modified.
   */
  constructor(dataset: string, options: InferFeatureAttributesFromCSVOptions = {}) {
    // Resolve defaults into locals rather than writing them back onto the caller's object.
    // The default applies only when samplesLimit is omitted, so an explicit 0 turns sampling off.
    const { limit, samplesLimit = 5 } = options;

    const raw = csvParse(dataset);
    // autoType rewrites the row it is given, so convert shallow copies and keep `raw` as plain strings for sampling.
    const limited = raw.slice(0, limit).map((row) => ({ ...row }));
    // @ts-expect-error I'll assign column immediately below
    const data: typeof raw = limited.map(autoType);
    data.columns = raw.columns;

    super({
      columns: data.columns,
      // Read each cell by header name so every row lines up with `columns`. Object.values() lists
      // integer-like keys (e.g. a "2020" header) first and collapses duplicate headers, which
      // would shift cells under the wrong column.
      data: data.map((row) => data.columns.map((column) => row[column])),
    });

    if (samplesLimit > 0) {
      this.samples = raw.slice(0, samplesLimit).map(samplesAutoType);
    }
  }
}

/**
 * Converts one parsed CSV row into display-friendly sample values.
 * Adapted from https://github.com/d3/d3-dsv/blob/main/src/autoType.js
 *
 * Per cell, after trimming surrounding whitespace:
 * - blank (or missing) becomes `null`
 * - the literal "NaN" becomes `NaN`
 * - anything that coerces to a number becomes that number
 * - everything else is returned exactly as given, untrimmed
 *
 * Unlike d3's autoType, "true"/"false" and date-like strings are deliberately left as strings:
 * we don't want booleans or dates to show in samples.
 *
 * @param object Row keyed by column name. It is not modified.
 * @returns A new row with the same keys in the same order.
 */
function samplesAutoType(object: Record<string, string>): Record<string, string | number | null> {
  const converted = Object.entries<string | undefined>(object).map(
    ([key, raw]): [string, string | number | null] => {
      // A ragged row can leave a cell undefined at runtime; treat it like a blank cell.
      const trimmed = (raw ?? "").trim();
      if (!trimmed) return [key, null];
      if (trimmed === "NaN") return [key, NaN];
      const parsed = Number(trimmed);
      // Non-numeric text keeps its original (untrimmed) form.
      return [key, Number.isNaN(parsed) ? (raw as string) : parsed];
    },
  );
  // fromEntries defines own properties, so a column named "__proto__" cannot alter the result's prototype.
  return Object.fromEntries(converted);
}
37 changes: 37 additions & 0 deletions src/tests/assets/asteroids-sample.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
full_name,a,e,G,i,om,w,q,ad,per_y,data_arc,condition_code,n_obs_used,H,diameter,extent,albedo,rot_per,GM,BV,UB,IR,spec_B,spec_T,neo,pha,moid
1 Ceres,2.769165155,0.076009029,0.12,10.59406704,80.30553157,73.59769412,2.5586836,2.979646709,4.608201802,8822,0,1002,3.34,939.4,964.4 x 964.2 x 891.8,0.09,9.07417,62.6284,0.713,0.426,,C,G,N,N,1.59478
2 Pallas,2.772465922,0.230336821,0.11,34.83623442,173.0800627,310.0488574,2.133864935,3.411066909,4.616443528,72318,0,8490,4.13,545,582x556x500,0.101,7.8132,14.3,0.635,0.284,,B,B,N,N,1.23324
3 Juno,2.669149517,0.25694232,0.32,12.98891913,169.8527598,248.1386262,1.983332047,3.354966987,4.360813923,72684,0,7104,5.33,246.596,,0.214,7.21,,0.824,0.433,,Sk,S,N,N,1.03454
4 Vesta,2.361417896,0.08872146,0.32,7.141770812,103.8108044,150.7285413,2.151909454,2.570926338,3.628837138,24288,0,9325,3.2,525.4,572.6 x 557.2 x 446.4,0.4228,5.34212766,17.8,0.782,0.492,,V,V,N,N,1.13948
5 Astraea,2.574248919,0.191094519,,5.366987944,141.5766042,358.6876078,2.08232406,3.066173778,4.130322954,63431,0,2861,6.85,106.699,,0.274,16.806,,0.826,0.411,,S,S,N,N,1.09589
6 Hebe,2.42515999,0.203007109,0.24,14.7379011,138.6402028,239.8074902,1.93283527,2.917484709,3.776754838,62329,0,6034,5.71,185.18,,0.2679,7.2745,,0.822,0.399,,S,S,N,N,0.973965
7 Iris,2.385333814,0.231205792,,5.523651387,259.5632307,145.2651058,1.833830821,2.936836807,3.684104574,62452,0,5206,5.51,199.83,,0.2766,7.139,,0.855,0.484,,S,S,N,N,0.8461
8 Flora,2.201764189,0.156499251,0.28,5.88695456,110.8893299,285.2874622,1.857189743,2.546338635,3.267114898,62655,0,2744,6.49,147.491,,0.226,12.865,,0.885,0.489,,,S,N,N,0.874176
9 Metis,2.385636536,0.123114272,0.17,5.576815511,68.9085767,6.417369231,2.09193063,2.679342441,3.684805919,61821,0,2649,6.28,190,,0.118,5.079,,0.858,0.496,,,S,N,N,1.10691
10 Hygiea,3.141539179,0.112460658,,3.831560034,283.2021669,312.3152062,2.788239617,3.494838741,5.56829099,62175,0,3409,5.43,407.12,,0.0717,27.63,7,0.696,0.351,,C,C,N,N,1.77839
11 Parthenope,2.453109376,0.100472272,,4.629885838,125.5465868,195.5503938,2.206639904,2.699578848,3.842231878,61699,0,5475,6.55,142.887,,0.191,13.7204,,0.837,0.417,,Sk,S,N,N,1.19322
12 Victoria,2.334315086,0.220171581,0.22,8.373074237,235.4101683,69.64182011,1.820365243,2.84826493,3.566542615,61581,0,3051,7.24,115.087,,0.163,8.6599,,0.874,0.515,,L,S,N,N,0.824953
13 Egeria,2.575981091,0.085121415,,16.53612295,43.22191706,80.54483096,2.356709935,2.795252248,4.1344925,61531,0,2359,6.74,222.792,,0.07,7.045,,0.745,0.452,,Ch,G,N,N,1.43633
14 Irene,2.585567305,0.16658231,,9.121643599,86.12266133,97.85899133,2.15485753,3.016277081,4.157593008,61450,0,2688,6.3,152,,0.159,15.028,,0.833,0.388,,S,S,N,N,1.17966
15 Eunomia,2.644100304,0.186084362,0.23,11.75242982,292.9343387,98.49868084,2.152074586,3.136126023,4.299570669,61247,0,2501,5.28,231.689,,0.248,6.083,,0.839,0.451,,S,S,N,N,1.19485
16 Psyche,2.92381368,0.133568415,0.2,3.096005209,150.0456664,228.8230714,2.533284522,3.314342839,4.999571031,12856,0,2364,5.9,226 ,279 x 232 x 189,0.1203,4.196,1.53,0.729,0.299,,X,M,N,N,1.5358
17 Thetis,2.470354085,0.133031598,,5.591204766,125.5529437,136.2082517,2.141718935,2.798989236,3.882817805,61117,0,3650,7.76,84.899,,0.193,12.27048,,0.829,0.438,,Sl,S,N,N,1.12981
18 Melpomene,2.29665351,0.217674362,0.25,10.1287313,150.3838618,227.9508469,1.796730922,2.796576098,3.480578404,60906,0,5082,6.51,139.594,,0.181,11.57,,0.854,0.425,,S,S,N,N,0.813258
19 Fortuna,2.442710697,0.158046851,0.1,1.573782207,211.1440435,182.0650176,2.056647963,2.828773431,3.817827076,60970,0,3316,7.13,200,,0.037,7.4432,,0.719,0.324,,Ch,G,N,N,1.06213
20 Massalia,2.409781792,0.142066705,0.25,0.708751203,206.1089109,256.7731964,2.067432032,2.752131551,3.740888639,59461,0,2481,6.5,135.68,,0.241,8.098,,0.854,0.463,,S,S,N,N,1.08461
21 Lutetia,2.434958098,0.163356103,0.11,3.064054861,80.86581221,249.9169213,2.037192832,2.832723363,3.799666154,55915,0,4752,7.35,95.76,,0.2212,8.1655,,0.686,0.189,,Xk,M,N,N,1.02791
22 Kalliope,2.914849074,0.097676889,0.21,13.71531205,66.05276971,356.0820817,2.630135684,3.199562465,4.976595142,60393,0,2923,6.45,167.536,,0.166,4.1483,0.491,0.715,0.234,,X,M,N,N,1.64321
23 Thalia,2.625683428,0.234983917,,10.11425811,66.84681403,60.63681441,2.008690051,3.242676805,4.254727466,59343,0,2213,6.95,107.53,,0.2536,12.312,,0.859,0.442,,S,S,N,N,1.04633
24 Themis,3.136171315,0.124742775,0.19,0.751587935,35.92589903,106.957169,2.744956601,3.527386028,5.554025502,60645,0,3662,7.08,198,,0.067,8.374,,0.684,0.336,,B,C,N,N,1.75813
25 Phocaea,2.400160565,0.254613898,,21.60484031,214.1306087,90.26321459,1.789046327,3.011274804,3.718507367,58150,0,3132,7.83,61.054,,0.35,9.9341,,0.932,0.513,,S,S,N,N,0.923495
26 Proserpina,2.654330893,0.090144874,,3.563412996,45.77798089,193.4497991,2.415056568,2.893605218,4.324548727,57907,0,2259,7.4,94.8,,0.1966,13.11,,0.891,0.525,,S,S,N,N,1.40287
27 Euterpe,2.346664413,0.173225845,,1.583713718,94.78795695,356.4498658,1.940161487,2.753167339,3.594882363,60345,0,2669,7,96,,0.215,10.4082,,0.878,0.502,,S,S,N,N,0.956809
28 Bellona,2.77576593,0.151862613,,9.429548932,144.2961697,344.1009091,2.354230863,3.197300998,4.624688265,59495,0,2738,7.09,120.9,,0.1763,15.706,,0.845,0.469,,S,S,N,N,1.37086
29 Amphitrite,2.554113563,0.072695536,0.2,6.082523344,356.3417672,63.36324048,2.368440909,2.739786216,4.081957767,59923,0,2318,5.85,189.559,,0.216,5.3921,,0.838,0.449,,S,S,N,N,1.3871
30 Urania,2.365572137,0.127581,,2.095743731,307.4686116,87.42620759,2.063770078,2.667374196,3.638417204,58087,0,3289,7.57,92.787,,0.192,13.686,,0.873,0.459,,Sl,S,N,N,1.0709
31 Euphrosyne,3.155402311,0.220857011,,26.30334546,31.11858135,61.47037758,2.458509589,3.852295033,5.605189646,58007,0,2377,6.74,267.08,,0.053,5.53,,0.687,0.317,,Cb,C,N,N,1.56992
32 Pomona,2.588507764,0.080901389,,5.523952791,220.4415627,339.5357809,2.379093891,2.797921637,4.164687411,56838,0,2133,7.56,80.76,,0.2564,9.448,,0.857,0.429,,S,S,N,N,1.37704
33 Polyhymnia,2.87345553,0.331346295,0.33,1.852843675,8.466365462,338.3718433,1.921346687,3.825564373,4.870964033,59639,0,2832,8.55,52.929,,0.24,18.608,,0.848,0.438,,Sq,S,N,N,0.913808
34 Circe,2.68738341,0.105948047,,5.496017155,184.3613092,330.6761387,2.402660386,2.972106434,4.405575516,56728,0,3115,8.51,132.992,,0.052,12.15,,0.707,0.357,,Ch,C,N,N,1.41435
35 Leukothea,2.994071835,0.225588162,,7.932713103,353.7389409,213.3967316,2.318644673,3.669498998,5.180856057,58229,0,3165,8.5,103.055,,0.066,31.9,,0.703,0.335,,C,C,N,N,1.31998
36 Atalante,2.74682401,0.304906016,,18.36926143,358.2100148,47.79654908,1.909300844,3.584347177,4.552547174,56069,0,1720,8.46,115.204,,0.036,9.93,,0.713,0.363,,,C,N,N,0.959267

0 comments on commit 86dfea4

Please sign in to comment.