From 846dd72cd1ddf42954b64839cbd4e4cb3ff7202e Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Fri, 18 Aug 2023 09:57:22 +0100 Subject: [PATCH 01/44] Initial commit of queued ingestion client Author: Krishan Mistry --- Cargo.toml | 2 +- azure-kusto-ingest/Cargo.toml | 23 ++ azure-kusto-ingest/src/data_format.rs | 108 ++++++ azure-kusto-ingest/src/descriptors.rs | 98 +++++ azure-kusto-ingest/src/errors.rs | 6 + azure-kusto-ingest/src/ingestion_blob_info.rs | 125 ++++++ .../src/ingestion_properties.rs | 108 ++++++ azure-kusto-ingest/src/lib.rs | 7 + azure-kusto-ingest/src/queued_ingest.rs | 126 ++++++ azure-kusto-ingest/src/resource_manager.rs | 364 ++++++++++++++++++ azure-kusto-ingest/src/result.rs | 40 ++ 11 files changed, 1006 insertions(+), 1 deletion(-) create mode 100644 azure-kusto-ingest/Cargo.toml create mode 100644 azure-kusto-ingest/src/data_format.rs create mode 100644 azure-kusto-ingest/src/descriptors.rs create mode 100644 azure-kusto-ingest/src/errors.rs create mode 100644 azure-kusto-ingest/src/ingestion_blob_info.rs create mode 100644 azure-kusto-ingest/src/ingestion_properties.rs create mode 100644 azure-kusto-ingest/src/lib.rs create mode 100644 azure-kusto-ingest/src/queued_ingest.rs create mode 100644 azure-kusto-ingest/src/resource_manager.rs create mode 100644 azure-kusto-ingest/src/result.rs diff --git a/Cargo.toml b/Cargo.toml index 739a289..d5a5705 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,2 +1,2 @@ [workspace] -members = ["azure-kusto-data"] +members = ["azure-kusto-data", "azure-kusto-ingest"] diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml new file mode 100644 index 0000000..c9b178a --- /dev/null +++ b/azure-kusto-ingest/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "azure-kusto-ingest" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +uuid = {version = "1", features = ["v4", "serde"]} +serde = { version = "1", features = ["serde_derive"] } +serde_json = "1" +serde_repr = "0.1" +url = "2" +azure-kusto-data = {path = "../azure-kusto-data"} +anyhow = "1.0.72" +tokio = { version = "1", features = ["full"] } + +azure_core = "0.13" +azure_storage = "0.13" +azure_storage_blobs = "0.13" +azure_storage_queues = "0.13" +time = { version = "0.3", features = ["serde"] } +azure_identity = "0.13.0" diff --git a/azure-kusto-ingest/src/data_format.rs b/azure-kusto-ingest/src/data_format.rs new file mode 100644 index 0000000..0a985e1 --- /dev/null +++ b/azure-kusto-ingest/src/data_format.rs @@ -0,0 +1,108 @@ +use serde::Serialize; + +#[derive(Serialize, Clone, Debug)] +pub enum IngestionMappingKind { + #[serde(rename = "Csv")] + CSV, + #[serde(rename = "Json")] + JSON, + Avro, + ApacheAvro, + Parquet, + SStream, + #[serde(rename = "Orc")] + ORC, + #[serde(rename = "W3CLogFile")] + W3CLOGFILE, + Unknown, +} + +/// All data formats supported by Kusto +#[derive(Serialize, Clone, Debug)] +pub enum DataFormat { + #[serde(rename = "apacheavro")] + ApacheAvro, + #[serde(rename = "avro")] + Avro, + #[serde(rename = "csv")] + CSV, + #[serde(rename = "json")] + JSON, + #[serde(rename = "multijson")] + MultiJSON, + #[serde(rename = "orc")] + ORC, + #[serde(rename = "parquet")] + Parquet, + #[serde(rename = "psv")] + PSV, + #[serde(rename = "raw")] + RAW, + #[serde(rename = "scsv")] + SCSV, + #[serde(rename = "sohsv")] + SOHsv, + #[serde(rename = "singlejson")] + SingleJSON, + #[serde(rename = "sstream")] + SStream, + #[serde(rename = "tsv")] + TSV, + #[serde(rename = "tsve")] + TSVe, + #[serde(rename = "txt")] + TXT, + #[serde(rename = "w3clogfile")] + W3CLOGFILE, +} + +impl DataFormat { + pub fn default() -> Self { + DataFormat::CSV + } + + pub fn ingestion_mapping_kind(self) -> IngestionMappingKind { + match self { + DataFormat::CSV => IngestionMappingKind::CSV, + DataFormat::TSV => IngestionMappingKind::CSV, + DataFormat::SCSV => IngestionMappingKind::CSV, + DataFormat::SOHsv => IngestionMappingKind::CSV, + DataFormat::PSV => IngestionMappingKind::CSV, + DataFormat::TXT => IngestionMappingKind::CSV, + DataFormat::TSVe => IngestionMappingKind::CSV, + DataFormat::JSON => IngestionMappingKind::JSON, + DataFormat::SingleJSON => IngestionMappingKind::JSON, + DataFormat::MultiJSON => IngestionMappingKind::JSON, + DataFormat::Avro => IngestionMappingKind::Avro, + DataFormat::ApacheAvro => IngestionMappingKind::ApacheAvro, + DataFormat::Parquet => IngestionMappingKind::Parquet, + DataFormat::SStream => IngestionMappingKind::SStream, + DataFormat::ORC => IngestionMappingKind::ORC, + DataFormat::RAW => IngestionMappingKind::CSV, + DataFormat::W3CLOGFILE => IngestionMappingKind::W3CLOGFILE, + } + } + + /// Binary formats should not be compressed + pub fn compressible(self) -> bool { + match self { + DataFormat::CSV => true, + DataFormat::TSV => true, + DataFormat::SCSV => true, + DataFormat::SOHsv => true, + DataFormat::PSV => true, + DataFormat::TXT => true, + DataFormat::TSVe => true, + DataFormat::JSON => true, + DataFormat::SingleJSON => true, + DataFormat::MultiJSON => true, + DataFormat::Avro => true, + DataFormat::ApacheAvro => true, + DataFormat::Parquet => false, + DataFormat::SStream => false, + DataFormat::ORC => false, + DataFormat::RAW => true, + DataFormat::W3CLOGFILE => true, + } + } +} diff --git a/azure-kusto-ingest/src/descriptors.rs b/azure-kusto-ingest/src/descriptors.rs new file mode 100644 index 0000000..bd4a7bf --- /dev/null +++ b/azure-kusto-ingest/src/descriptors.rs @@ -0,0 +1,98 @@ +use std::{io::Read, path::PathBuf, fmt::format}; + +use azure_storage::StorageCredentials; +use url::Url; +use uuid::Uuid; + +#[derive(Clone, Debug)] +pub enum BlobAuth { + SASToken(), + // adds `;managed_identity=` to the blob path + UserAssignedManagedIdentity(String), + // adds `;managed_identity=system` to the blob path + SystemAssignedManagedIdentity +} + +#[derive(Clone, Debug)] +pub struct BlobDescriptor { + uri: Url, + pub(crate) size: Option, + pub(crate) source_id: Uuid, + blob_auth: Option +} + +impl BlobDescriptor { + pub fn new(uri: Url, size: Option, source_id: Option) -> Self { + let source_id = match source_id { + Some(source_id) => source_id, + None => Uuid::new_v4(), + }; + + Self { + uri, + size, + source_id, + blob_auth: None, + } + } + + pub fn with_blob_auth(mut self, blob_auth: BlobAuth) -> Self { + self.blob_auth = Some(blob_auth); + self + } + + pub fn uri(&self) -> String { + match &self.blob_auth { + Some(BlobAuth::SASToken()) => { + let mut uri = self.uri.clone(); + uri.set_query(Some("sas_token")); + uri.to_string() + }, + Some(BlobAuth::UserAssignedManagedIdentity(identity)) => { + format!("{};managed_identity={}", self.uri, identity) + }, + Some(BlobAuth::SystemAssignedManagedIdentity) => { + format!("{};managed_identity=system", self.uri) + }, + None => self.uri.to_string(), + } + } +} + +#[derive(Clone, Debug)] +pub struct FileDescriptor { + pub path: PathBuf, + pub size: Option, + pub source_id: Uuid, +} + +impl FileDescriptor { + pub fn new(path: PathBuf, size: Option, source_id: Option) -> Self { + unimplemented!() + } +} + +// #[derive(Clone, Debug)] +pub struct StreamDescriptor { + stream: Box, + size: Option, + source_id: Uuid, + compressed: bool, + stream_name: String, +} + +impl StreamDescriptor { + pub fn new( + stream: Box, + size: Option, + source_id: Option, + compressed: bool, + stream_name: String, + ) -> Self { + unimplemented!() + } + + pub fn from_file_descriptor(file_descriptor: FileDescriptor) -> Self { + unimplemented!() + } +} diff --git a/azure-kusto-ingest/src/errors.rs b/azure-kusto-ingest/src/errors.rs new file mode 100644 index 0000000..f833f07 --- /dev/null +++ b/azure-kusto-ingest/src/errors.rs @@ -0,0 +1,6 @@ +pub enum KustoClientError { + KustoMappingError, + KustoDuplicateMappingError, + KustoMissingMappingError, + KustoInvalidEndpointError, +} \ No newline at end of file diff --git a/azure-kusto-ingest/src/ingestion_blob_info.rs b/azure-kusto-ingest/src/ingestion_blob_info.rs new file mode 100644 index 0000000..806e662 --- /dev/null +++ b/azure-kusto-ingest/src/ingestion_blob_info.rs @@ -0,0 +1,125 @@ +use std::collections::HashMap; + +use serde::Serialize; + +use crate::{ + data_format::DataFormat, + descriptors::BlobDescriptor, + ingestion_properties::{IngestionProperties, ReportLevel, ReportMethod, ValidationPolicy}, + resource_manager::KustoIdentityToken, +}; + +// Basing the ingestion message on +// https://learn.microsoft.com/en-us/azure/data-explorer/kusto/api/netfx/kusto-ingest-client-rest#ingestion-message-internal-structure +#[derive(Serialize, Debug)] +#[serde(rename_all = "PascalCase")] +pub struct QueuedIngestionMessage { + id: uuid::Uuid, + blob_path: String, + database_name: String, + table_name: String, + #[serde(skip_serializing_if = "Option::is_none")] + raw_data_size: Option, + #[serde(skip_serializing_if = "Option::is_none")] + retain_blob_on_success: Option, + #[serde(skip_serializing_if = "Option::is_none")] + flush_immediately: Option, + #[serde(skip_serializing_if = "Option::is_none")] + ignore_size_limit: Option, + // according to Go impl, the report level and method could be Option + report_level: ReportLevel, + report_method: ReportMethod, + // TODO: implement this + // #[serde(skip_serializing_if = "Option::is_none")]s + // #[serde(skip_serializing_if = "Option::is_none")] + // #[serde(with= "time::serde::iso8601")] + source_message_creation_time: String, + // The additional properties struct is modelled on: + // https://learn.microsoft.com/en-us/azure/data-explorer/ingestion-properties + additional_properties: AdditionalProperties, +} + +impl QueuedIngestionMessage { + pub fn new( + blob_descriptor: BlobDescriptor, + ingestion_properties: &IngestionProperties, + auth_context: &KustoIdentityToken, + ) -> Self { + let additional_properties = AdditionalProperties { + ingestion_mapping: None, + ingestion_mapping_reference: None, + creation_time: None, + extend_schema: None, + folder: None, + data_format: ingestion_properties.data_format.clone(), + ingest_if_not_exists: None, + ignore_first_record: None, + policy_ingestiontime: None, + recreate_schema: None, + tags: vec![], + validation_policy: None, + zip_pattern: None, + authorization_context: auth_context.clone(), + extra_additional_properties: HashMap::new(), + }; + + Self { + id: blob_descriptor.source_id, + blob_path: blob_descriptor.uri().to_string(), + raw_data_size: blob_descriptor.size, + database_name: ingestion_properties.database_name.clone(), + table_name: ingestion_properties.table_name.clone(), + retain_blob_on_success: ingestion_properties.retain_blob_on_success, + flush_immediately: ingestion_properties.flush_immediately, + report_level: ingestion_properties.report_level.clone(), + report_method: ingestion_properties.report_method.clone(), + ignore_size_limit: Some(false), + // TODO: configurability of creation time + source_message_creation_time: String::from("2023-08-16T13:30:04.639714"), + additional_properties: additional_properties, + } + } +} + +// The additional properties struct is modelled on: https://learn.microsoft.com/en-us/azure/data-explorer/ingestion-properties +#[derive(Serialize, Clone, Debug)] +pub struct AdditionalProperties { + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(rename = "ingestionMapping")] + pub ingestion_mapping: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(rename = "ingestionMappingReference")] + pub ingestion_mapping_reference: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(rename = "creationTime")] + pub creation_time: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub extend_schema: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub folder: Option, + #[serde(rename = "format")] + pub data_format: DataFormat, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(rename = "ingestIfNotExists")] + pub ingest_if_not_exists: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(rename = "ignoreFirstRecord")] + pub ignore_first_record: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub policy_ingestiontime: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub recreate_schema: Option, + #[serde(skip_serializing_if = "Vec::is_empty")] + pub tags: Vec, + #[serde(rename = "validationPolicy")] + #[serde(skip_serializing_if = "Option::is_none")] + pub validation_policy: Option, + #[serde(rename = "zipPattern")] + #[serde(skip_serializing_if = "Option::is_none")] + pub zip_pattern: Option, + // TODO: the user shouldn't be able to set this, we should expose certain properties via IngestionProperties rather than just the AdditionalProperties struct + #[serde(rename = "authorizationContext")] + pub authorization_context: KustoIdentityToken, + #[serde(flatten)] + pub extra_additional_properties: HashMap, +} diff --git a/azure-kusto-ingest/src/ingestion_properties.rs b/azure-kusto-ingest/src/ingestion_properties.rs new file mode 100644 index 0000000..ee573a8 --- /dev/null +++ b/azure-kusto-ingest/src/ingestion_properties.rs @@ -0,0 +1,108 @@ +use crate::data_format::{DataFormat, IngestionMappingKind}; +use serde::Serialize; +use serde_repr::Serialize_repr; + +#[derive(Clone, Debug)] +pub struct IngestionProperties { + pub database_name: String, + pub table_name: String, + pub retain_blob_on_success: Option, + pub data_format: DataFormat, + // I think we could make this neater by using some enum wizardry to enforce certain checks that are being done currently + // I'm thinking of something like we give an ingestion mapping enum, with + pub ingestion_mapping: Option>, + pub ingestion_mapping_type: Option, + pub ingestion_mapping_reference: Option>, + pub additional_tags: Vec, + pub ingest_if_not_exists: Vec, + pub ingest_by_tags: Vec, + pub drop_by_tags: Vec, + pub flush_immediately: Option, + pub ignore_first_record: bool, + pub report_level: ReportLevel, + pub report_method: ReportMethod, + pub validation_policy: Option, + // TODO: don't expose AdditionalProperties to user... + // pub additional_properties: AdditionalProperties, + // pub additional_properties: AdditionalProperties, +} + +#[derive(Serialize, Clone, Debug)] +pub struct ValidationPolicy { + #[serde(rename = "ValidationOptions")] + validation_options: ValidationOptions, + #[serde(rename = "ValidationImplications")] + validation_implications: ValidationImplications, +} + +#[derive(Serialize_repr, Clone, Debug)] +#[repr(u8)] +pub enum ValidationOptions { + DoNotValidate = 0, + ValidateCsvInputConstantColumns = 1, + ValidateCsvInputColumnLevelOnly = 2, +} + +#[derive(Serialize_repr, Clone, Debug)] +#[repr(u8)] +pub enum ValidationImplications { + Fail = 0, + BestEffort = 1, +} + +#[derive(Serialize_repr, Clone, Debug)] +#[repr(u8)] +pub enum ReportLevel { + Failures = 0, + None = 1, + All = 2, +} + +#[derive(Serialize_repr, Clone, Debug)] +#[repr(u8)] +pub enum ReportMethod { + Queue = 0, + Table = 1, +} + +#[derive(Serialize, Clone, Debug)] +pub enum TransformationMethod { + PropertyBagArrayToDictionary, + SourceLocation, + SourceLineNumber, + DateTimeFromUnixSeconds, + DateTimeFromUnixMilliseconds, + DateTimeFromUnixMicroseconds, + DateTimeFromUnixNanoseconds, + DropMappedFields, + BytesAsBase64, +} + +/// Use this class to create mappings for IngestionProperties.ingestionMappings and utilize mappings that were not +/// pre-created (it is recommended to create the mappings in advance and use ingestionMappingReference). +/// To read more about mappings look here: https://docs.microsoft.com/en-us/azure/kusto/management/mappings +#[derive(Serialize, Clone, Debug)] +pub struct ColumnMapping { + #[serde(rename = "Column")] + column: String, + // TODO: can this be an enum? + #[serde(rename = "DataType")] + datatype: String, + #[serde(rename = "Properties")] + properties: ColumnMappingProperties, +} + +#[derive(Serialize, Clone, Debug)] +pub struct ColumnMappingProperties { + #[serde(rename = "Path")] + path: Option, + #[serde(rename = "Transform")] + transform: Option, + #[serde(rename = "Ordinal")] + // TODO: This should get serialized to a string + ordinal: Option, + #[serde(rename = "ConstValue")] + const_value: Option, + #[serde(rename = "Field")] + field: Option, +} diff --git a/azure-kusto-ingest/src/lib.rs b/azure-kusto-ingest/src/lib.rs new file mode 100644 index 0000000..1a250ae --- /dev/null +++ b/azure-kusto-ingest/src/lib.rs @@ -0,0 +1,7 @@ +pub mod descriptors; +pub mod ingestion_properties; +pub mod queued_ingest; +pub(crate) mod result; +pub(crate) mod resource_manager; +pub mod data_format; +pub(crate) mod ingestion_blob_info; \ No newline at end of file diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs new file mode 100644 index 0000000..48e1ba9 --- /dev/null +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -0,0 +1,126 @@ +use std::time::Duration; + +use anyhow::Result; +use azure_core::base64; +use azure_kusto_data::prelude::KustoClient; + +use crate::descriptors::{BlobDescriptor, FileDescriptor, StreamDescriptor}; +use crate::ingestion_blob_info::QueuedIngestionMessage; +use crate::ingestion_properties::IngestionProperties; +use crate::resource_manager::ResourceManager; +use crate::result::{IngestionResult, IngestionStatus}; + +pub struct QueuedIngestClient { + // The KustoClient is used to get the ingestion resources, it should be a client against the ingestion cluster endpoint + // kusto_client: KustoClient, + resource_manager: ResourceManager, +} + +impl QueuedIngestClient { + pub fn new(kusto_client: KustoClient, refresh_period: Duration) -> Self { + let resource_manager = ResourceManager::new(kusto_client, refresh_period); + + Self { resource_manager } + } + + pub async fn ingest_from_blob( + mut self, + blob_descriptor: BlobDescriptor, + ingestion_properties: &IngestionProperties, + ) -> Result { + // the queues returned here should ideally be the storage queue client from azure-storage-queue + // as such, it may be better for ResourceManager to return a struct that contains the storage queue client + let ingestion_queues = self + .resource_manager + .secured_ready_for_aggregation_queues() + .await?; + + let auth_context = self.resource_manager.authorization_context().await?; + + println!("queues: {:#?}", ingestion_queues); + + let message = QueuedIngestionMessage::new( + blob_descriptor.clone(), + ingestion_properties, + auth_context, + ); + + println!("message as struct: {:#?}\n", message); + + // TODO: pick a random queue from the queue clients returned by the resource manager + let queue_client = ingestion_queues.first().unwrap().clone(); + println!("queue_client: {:#?}\n", queue_client); + + let message = serde_json::to_string(&message).unwrap(); + println!("message as string: {}\n", message); + // Base64 encode the ingestion message + let message = base64::encode(&message); + println!("message as base64 encoded string: {}\n", message); + + let resp = queue_client.put_message(message).await?; + + println!("resp: {:#?}\n", resp); + + Ok(IngestionResult::new( + IngestionStatus::Queued, + &ingestion_properties.database_name, + &ingestion_properties.table_name, + blob_descriptor.source_id, + Some(blob_descriptor.uri()), + )) + } + + pub async fn ingest_from_file( + self, + file_descriptor: FileDescriptor, + ingestion_properties: IngestionProperties, + ) -> Result { + unimplemented!() + // This function needs to upload the blob from the file, and then call on ingest_from_blob + + // self.ingest_from_blob(blob_descriptor, &ingestion_properties) + // .await + } + + pub async fn ingest_from_stream( + self, + stream_descriptor: StreamDescriptor, + ingestion_properties: IngestionProperties, + ) -> Result { + unimplemented!() + // This function needs to upload the blob from the stream, and then call on ingest_from_blob + + // self.ingest_from_blob(blob_descriptor, &ingestion_properties) + // .await + } + + async fn upload_from_different_descriptor( + self, + descriptor: FileDescriptor, + ingestion_properties: &IngestionProperties, + ) -> Result { + unimplemented!() + // WIP + // let blob_name = format!( + // "{database_name}_{table_name}_{source_id}_{stream_name}", + // database_name = ingestion_properties.database_name, + // table_name = ingestion_properties.table_name, + // source_id = descriptor.source_id, + // stream_name = descriptor.stream_name.to_str().unwrap().to_string() + // ); + + // let container_clients = self.resource_manager.temp_storage().await?; + // // TODO: pick a random container client from the container clients returned by the resource manager + // let container_client = container_clients.first().unwrap().clone(); + // let blob_client = container_client.blob_client(blob_name); + + // blob_client.put_block_blob(body) + + // blob_url = ""; + + // Ok(BlobDescriptor::new( + // blob_url, + // ingestion_properties.source_id, + // )) + } +} diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs new file mode 100644 index 0000000..cbaef70 --- /dev/null +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -0,0 +1,364 @@ +use std::time::{Duration, Instant}; + +use anyhow::{Ok, Result}; +use azure_kusto_data::{models::TableV1, prelude::KustoClient}; +use azure_storage::StorageCredentials; +use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient}; +use url::Url; + +use azure_storage_queues::{QueueClient, QueueServiceClientBuilder}; + +#[derive(Debug, Clone)] +pub struct ResourceUri { + uri: String, + // parsed_uri: Url, + service_uri: String, + object_name: String, + sas_token: StorageCredentials, +} + +impl ResourceUri { + pub fn new(uri: String) -> Self { + println!("uri: {:#?}", uri); + let parsed_uri = Url::parse(&uri).unwrap(); + println!("parsed_uri: {:#?}", parsed_uri); + + let service_uri = parsed_uri.scheme().to_string() + + "://" + + parsed_uri.host_str().expect("We should get result here"); + let object_name = parsed_uri + .path() + .trim_start() + .trim_start_matches("/") + .to_string(); + let sas_token = parsed_uri + .query() + .expect("Returned URI should contain SAS token as query") + .to_string(); + let sas_token = StorageCredentials::sas_token(sas_token).unwrap(); + + Self { + uri, + // parsed_uri, + service_uri, + object_name, + sas_token, + } + } + + pub fn uri(&self) -> &str { + self.uri.as_str() + } + + pub fn service_uri(&self) -> &str { + self.service_uri.as_str() + } + + pub fn object_name(&self) -> &str { + self.object_name.as_str() + } + + pub fn sas_token(&self) -> &StorageCredentials { + &self.sas_token + } +} + +impl From<&ResourceUri> for QueueClient { + fn from(resource_uri: &ResourceUri) -> Self { + let queue_service = + QueueServiceClientBuilder::with_location(azure_storage::CloudLocation::Custom { + uri: resource_uri.service_uri().to_string(), + credentials: resource_uri.sas_token().clone(), + }) + .build(); + + queue_service.queue_client(resource_uri.object_name()) + } +} + +impl From<&ResourceUri> for ContainerClient { + fn from(resource_uri: &ResourceUri) -> Self { + ClientBuilder::with_location(azure_storage::CloudLocation::Custom { + uri: resource_uri.service_uri().to_string(), + credentials: resource_uri.sas_token().clone(), + }) + .container_client(resource_uri.object_name()) + } +} + +fn get_resource_by_name(table: &TableV1, resource_name: String) -> Vec { + let storage_root_index = table + .columns + .iter() + .position(|c| c.column_name == "StorageRoot") + .unwrap(); + let resource_type_name_index = table + .columns + .iter() + .position(|c| c.column_name == "ResourceTypeName") + .unwrap(); + + println!("table: {:#?}", table); + let resource_uris: Vec = table + .rows + .iter() + .filter(|r| r[resource_type_name_index] == resource_name) + .map(|r| { + ResourceUri::new( + r[storage_root_index] + .as_str() + .expect("We should get result here") + .to_string(), + ) + }) + .collect(); + + resource_uris +} + +pub struct IngestClientResources { + client: KustoClient, + secured_ready_for_aggregation_queues: Vec, + failed_ingestions_queues: Vec, + successful_ingestions_queues: Vec, + temp_storage: Vec, + ingestions_status_tables: Vec, + last_update: Option, + refresh_period: Duration, +} + +impl IngestClientResources { + pub fn new(client: KustoClient, refresh_period: Duration) -> Self { + Self { + client, + secured_ready_for_aggregation_queues: Vec::new(), + failed_ingestions_queues: Vec::new(), + successful_ingestions_queues: Vec::new(), + temp_storage: Vec::new(), + ingestions_status_tables: Vec::new(), + last_update: None, + refresh_period, + } + } + + fn is_not_applicable(&self) -> bool { + self.secured_ready_for_aggregation_queues.is_empty() + || self.failed_ingestions_queues.is_empty() + || self.successful_ingestions_queues.is_empty() + || self.temp_storage.is_empty() + || self.ingestions_status_tables.is_empty() + } + + // TODO: figure out refresh logic + // async fn refresh(&mut self) { + // self.get_ingest_client_resources().await + // // let interval = tokio::time::interval(self.refresh_period); + // // loop { + // // match self.get_ingest_client_resources(self.client.clone()).await { + // // Ok(_) => todo!(), + // // Err(e) => println!("Error: {}", e), + // // }; + + // // interval.tick().await; + // // } + + // // if self.last_update.is_none() + // // || self.last_update.unwrap().elapsed() > self.refresh_period + // // || self.is_not_applicable() + // // { + // // self.get_ingest_client_resources(client).await?; + // // self.last_update = Some(Instant::now()); + // // } + // // Ok(()) + // } + + // async fn refresh(&mut self, client: KustoClient) -> Result<()> { + // if self.last_update.is_none() + // || self.last_update.unwrap().elapsed() > self.refresh_period + // || self.is_not_applicable() + // { + // self.get_ingest_client_resources(client).await?; + // self.last_update = Some(Instant::now()); + // } + // Ok(()) + // } + + async fn get_ingest_client_resources(&mut self) -> Result<()> { + let results = self + .client + .execute_command("NetDefaultDB", ".get ingestion resources", None) + .await?; + let table = results.tables.first().unwrap(); + + self.secured_ready_for_aggregation_queues = + get_resource_by_name(table, "SecuredReadyForAggregationQueue".to_string()); + self.failed_ingestions_queues = + get_resource_by_name(table, "FailedIngestionsQueue".to_string()); + self.successful_ingestions_queues = + get_resource_by_name(table, "SuccessfulIngestionsQueue".to_string()); + self.temp_storage = get_resource_by_name(table, "TempStorage".to_string()); + self.ingestions_status_tables = + get_resource_by_name(table, "IngestionsStatusTable".to_string()); + + Ok(()) + } +} + +pub type KustoIdentityToken = String; +#[derive(Debug, Clone)] +pub struct AuthorizationContext { + client: KustoClient, + pub kusto_identity_token: KustoIdentityToken, + last_update: Option, + refresh_period: Duration, +} + +impl AuthorizationContext { + pub fn new(client: KustoClient, refresh_period: Duration) -> Self { + Self { + client, + kusto_identity_token: String::new(), + last_update: None, + refresh_period, + } + } + + // TODO: figure out refresh logic + // Make this spawn a tokio task to refresh the token based on elapsed time + async fn refresh(&mut self, client: KustoClient) -> Result<()> { + if self.last_update.is_none() + || self.kusto_identity_token.chars().all(char::is_whitespace) + || self.last_update.unwrap().elapsed() > self.refresh_period + { + self.get_authorization_context(client).await?; + self.last_update = Some(Instant::now()); + } + Ok(()) + } + + async fn get_authorization_context(&mut self, client: KustoClient) -> Result<()> { + let results = client + .execute_command("NetDefaultDB", ".get kusto identity token", None) + .await?; + let table = results.tables.first().unwrap(); + + println!("table: {:#?}", table); + + self.kusto_identity_token = table + .rows + .first() + .unwrap() + .first() + .unwrap() + .as_str() + .unwrap() + .to_string(); + + Ok(()) + } + + pub async fn kusto_identity_token( + &mut self, + client: KustoClient, + ) -> Result { + self.refresh(client).await?; + Ok(self.kusto_identity_token.clone()) + } +} + +pub struct ResourceManager { + // client: KustoClient, + pub ingest_client_resources: IngestClientResources, + pub authorization_context: AuthorizationContext, +} + +impl ResourceManager { + pub fn new(client: KustoClient, refresh_period: Duration) -> Self { + Self { + ingest_client_resources: IngestClientResources::new(client.clone(), refresh_period), + authorization_context: AuthorizationContext::new(client, refresh_period), + } + } + + // pub async fn secured_ready_for_aggregation_queues(&mut self) -> Result> { + pub async fn secured_ready_for_aggregation_queues(&mut self) -> Result> { + // TODO: proper refresh and caching logic so we don't need to generate new clients every time + self.ingest_client_resources + .get_ingest_client_resources() + .await?; + + // We should return Azure SDK QueueClient's here. + // Although it's recommended to share the same transport, we can't as the storage credentials (SAS tokens) differ per queue. + // So the best we can do is store the individual QueueClient's so multiple requests + + let queue_uris = self + .ingest_client_resources + .secured_ready_for_aggregation_queues + .clone(); + + Ok(queue_uris.iter().map(|q| QueueClient::from(q)).collect()) + } + + pub async fn failed_ingestions_queues(&mut self) -> Result> { + // TODO: proper refresh and caching logic so we don't need to generate new clients every time + self.ingest_client_resources + .get_ingest_client_resources() + .await?; + + let queue_uris = self + .ingest_client_resources + .failed_ingestions_queues + .clone(); + + Ok(queue_uris.iter().map(|q| QueueClient::from(q)).collect()) + } + + pub async fn successful_ingestions_queues(&mut self) -> Result> { + // TODO: proper refresh and caching logic so we don't need to generate new clients every time + self.ingest_client_resources + .get_ingest_client_resources() + .await?; + + let queue_uris = self + .ingest_client_resources + .successful_ingestions_queues + .clone(); + + Ok(queue_uris.iter().map(|q| QueueClient::from(q)).collect()) + } + + pub async fn temp_storage(&mut self) -> Result> { + // TODO: proper refresh and caching logic so we don't need to generate new clients every time + self.ingest_client_resources + .get_ingest_client_resources() + .await?; + + let container_uris = self.ingest_client_resources.temp_storage.clone(); + + Ok(container_uris + .iter() + .map(|c| ContainerClient::from(c)) + .collect()) + } + + // pub async fn ingestions_status_tables( + // &mut self, + // client: KustoClient, + // ) -> Result> { + // self.refresh(client).await?; + // Ok(self.ingestions_status_tables.clone()) + // } + + // pub fn retrieve_service_type(self) -> ServiceType { + // unimplemented!() + // } + + pub async fn authorization_context(&mut self) -> Result<&KustoIdentityToken> { + // TODO: proper refresh and caching logic so we don't need to query Kusto for the token every time + self.authorization_context + .get_authorization_context(self.ingest_client_resources.client.clone()) + .await?; + + Ok(&self.authorization_context.kusto_identity_token) + } +} diff --git a/azure-kusto-ingest/src/result.rs b/azure-kusto-ingest/src/result.rs new file mode 100644 index 0000000..64fb47c --- /dev/null +++ b/azure-kusto-ingest/src/result.rs @@ -0,0 +1,40 @@ +use uuid::Uuid; + +pub enum IngestionStatus { + // The ingestion was queued. + Queued, + // The ingestion was successfully streamed + Success +} + +// The result of an ingestion. +pub struct IngestionResult { + // Will be `Queued` if the ingestion is queued, or `Success` if the ingestion is streaming and successful. + status: IngestionStatus, + // The name of the database where the ingestion was performed. + database: String, + // The name of the table where the ingestion was performed. + table: String, + // The source id of the ingestion. + source_id: Uuid, + // The blob uri of the ingestion, if exists. + blob_uri: Option +} + +impl IngestionResult { + pub fn new( + status: IngestionStatus, + database: &String, + table: &String, + source_id: Uuid, + blob_uri: Option, + ) -> Self { + Self { + status, + database: database.clone(), + table: table.clone(), + source_id, + blob_uri, + } + } +} From b67607d9e9931652c0d5e8af66999fc473cb4d5f Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Fri, 18 Aug 2023 15:04:06 +0100 Subject: [PATCH 02/44] Initial logic for caching resources --- azure-kusto-ingest/src/ingestion_blob_info.rs | 4 +- azure-kusto-ingest/src/queued_ingest.rs | 18 +- azure-kusto-ingest/src/resource_manager.rs | 441 ++++++------------ .../resource_manager/authorization_context.rs | 78 ++++ .../src/resource_manager/cache.rs | 47 ++ .../ingest_client_resources.rs | 68 +++ .../src/resource_manager/resource_uri.rs | 86 ++++ 7 files changed, 431 insertions(+), 311 deletions(-) create mode 100644 azure-kusto-ingest/src/resource_manager/authorization_context.rs create mode 100644 azure-kusto-ingest/src/resource_manager/cache.rs create mode 100644 azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs create mode 100644 azure-kusto-ingest/src/resource_manager/resource_uri.rs diff --git a/azure-kusto-ingest/src/ingestion_blob_info.rs b/azure-kusto-ingest/src/ingestion_blob_info.rs index 806e662..a53d798 100644 --- a/azure-kusto-ingest/src/ingestion_blob_info.rs +++ b/azure-kusto-ingest/src/ingestion_blob_info.rs @@ -43,7 +43,7 @@ impl QueuedIngestionMessage { pub fn new( blob_descriptor: BlobDescriptor, ingestion_properties: &IngestionProperties, - auth_context: &KustoIdentityToken, + auth_context: KustoIdentityToken, ) -> Self { let additional_properties = AdditionalProperties { ingestion_mapping: None, @@ -76,7 +76,7 @@ impl QueuedIngestionMessage { ignore_size_limit: Some(false), // TODO: configurability of creation time source_message_creation_time: String::from("2023-08-16T13:30:04.639714"), - additional_properties: additional_properties, + additional_properties, } } } diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index 48e1ba9..ea48ea8 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -1,4 +1,4 @@ -use std::time::Duration; +use std::sync::Arc; use anyhow::Result; use azure_core::base64; @@ -13,31 +13,31 @@ use crate::result::{IngestionResult, IngestionStatus}; pub struct QueuedIngestClient { // The KustoClient is used to get the ingestion resources, it should be a client against the ingestion cluster endpoint // kusto_client: KustoClient, - resource_manager: ResourceManager, + resource_manager: Arc, } impl QueuedIngestClient { - pub fn new(kusto_client: KustoClient, refresh_period: Duration) -> Self { - let resource_manager = ResourceManager::new(kusto_client, refresh_period); + pub fn new(kusto_client: KustoClient) -> Self { + let resource_manager = Arc::new(ResourceManager::new(kusto_client)); Self { resource_manager } } pub async fn ingest_from_blob( - mut self, + self, blob_descriptor: BlobDescriptor, ingestion_properties: &IngestionProperties, ) -> Result { - // the queues returned here should ideally be the storage queue client from azure-storage-queue - // as such, it may be better for ResourceManager to return a struct that contains the storage queue client + // The queues returned here should ideally be the storage queue client from azure-storage-queue + // As such, it may be better for ResourceManager to return a struct that contains the storage queue client let ingestion_queues = self .resource_manager .secured_ready_for_aggregation_queues() .await?; + println!("queues: {:#?}", ingestion_queues); let auth_context = self.resource_manager.authorization_context().await?; - - println!("queues: {:#?}", ingestion_queues); + println!("auth_context: {:#?}\n", auth_context); let message = QueuedIngestionMessage::new( blob_descriptor.clone(), diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index cbaef70..8f1fc9f 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -1,364 +1,205 @@ -use std::time::{Duration, Instant}; +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; -use anyhow::{Ok, Result}; -use azure_kusto_data::{models::TableV1, prelude::KustoClient}; -use azure_storage::StorageCredentials; -use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient}; -use url::Url; - -use azure_storage_queues::{QueueClient, QueueServiceClientBuilder}; - -#[derive(Debug, Clone)] -pub struct ResourceUri { - uri: String, - // parsed_uri: Url, - service_uri: String, - object_name: String, - sas_token: StorageCredentials, -} - -impl ResourceUri { - pub fn new(uri: String) -> Self { - println!("uri: {:#?}", uri); - let parsed_uri = Url::parse(&uri).unwrap(); - println!("parsed_uri: {:#?}", parsed_uri); - - let service_uri = parsed_uri.scheme().to_string() - + "://" - + parsed_uri.host_str().expect("We should get result here"); - let object_name = parsed_uri - .path() - .trim_start() - .trim_start_matches("/") - .to_string(); - let sas_token = parsed_uri - .query() - .expect("Returned URI should contain SAS token as query") - .to_string(); - let sas_token = StorageCredentials::sas_token(sas_token).unwrap(); - - Self { - uri, - // parsed_uri, - service_uri, - object_name, - sas_token, - } - } - - pub fn uri(&self) -> &str { - self.uri.as_str() - } +pub mod authorization_context; +pub mod cache; +pub mod ingest_client_resources; +pub mod resource_uri; - pub fn service_uri(&self) -> &str { - self.service_uri.as_str() - } +use anyhow::{Ok, Result}; +use azure_kusto_data::prelude::KustoClient; +use tokio::sync::RwLock; - pub fn object_name(&self) -> &str { - self.object_name.as_str() - } +use azure_storage_queues::QueueClient; - pub fn sas_token(&self) -> &StorageCredentials { - &self.sas_token - } -} +use self::{ + authorization_context::AuthorizationContext, + cache::{Cached, Refreshing}, +}; -impl From<&ResourceUri> for QueueClient { - fn from(resource_uri: &ResourceUri) -> Self { - let queue_service = - QueueServiceClientBuilder::with_location(azure_storage::CloudLocation::Custom { - uri: resource_uri.service_uri().to_string(), - credentials: resource_uri.sas_token().clone(), - }) - .build(); +use self::ingest_client_resources::RawIngestClientResources; - queue_service.queue_client(resource_uri.object_name()) - } -} - -impl From<&ResourceUri> for ContainerClient { - fn from(resource_uri: &ResourceUri) -> Self { - ClientBuilder::with_location(azure_storage::CloudLocation::Custom { - uri: resource_uri.service_uri().to_string(), - credentials: resource_uri.sas_token().clone(), - }) - .container_client(resource_uri.object_name()) - } -} - -fn get_resource_by_name(table: &TableV1, resource_name: String) -> Vec { - let storage_root_index = table - .columns - .iter() - .position(|c| c.column_name == "StorageRoot") - .unwrap(); - let resource_type_name_index = table - .columns - .iter() - .position(|c| c.column_name == "ResourceTypeName") - .unwrap(); - - println!("table: {:#?}", table); - let resource_uris: Vec = table - .rows - .iter() - .filter(|r| r[resource_type_name_index] == resource_name) - .map(|r| { - ResourceUri::new( - r[storage_root_index] - .as_str() - .expect("We should get result here") - .to_string(), - ) - }) - .collect(); - - resource_uris -} +pub(crate) const RESOURCE_REFRESH_PERIOD: Duration = Duration::from_secs(60 * 60); pub struct IngestClientResources { client: KustoClient, - secured_ready_for_aggregation_queues: Vec, - failed_ingestions_queues: Vec, - successful_ingestions_queues: Vec, - temp_storage: Vec, - ingestions_status_tables: Vec, - last_update: Option, - refresh_period: Duration, + kusto_response: Refreshing>, + secured_ready_for_aggregation_queues: Refreshing>, + // secured_ready_for_aggregation_queues: Vec, + // failed_ingestions_queues: Vec, + // successful_ingestions_queues: Vec, + // temp_storage: Vec, + // ingestions_status_tables: Vec, } impl IngestClientResources { - pub fn new(client: KustoClient, refresh_period: Duration) -> Self { + pub fn new(client: KustoClient) -> Self { Self { client, - secured_ready_for_aggregation_queues: Vec::new(), - failed_ingestions_queues: Vec::new(), - successful_ingestions_queues: Vec::new(), - temp_storage: Vec::new(), - ingestions_status_tables: Vec::new(), - last_update: None, - refresh_period, + kusto_response: Arc::new(RwLock::new(Cached::new(None, RESOURCE_REFRESH_PERIOD))), + secured_ready_for_aggregation_queues: Arc::new(RwLock::new(Cached::new( + Vec::new(), + RESOURCE_REFRESH_PERIOD, + ))), + // secured_ready_for_aggregation_queues: Vec::new(), + // failed_ingestions_queues: Vec::new(), + // successful_ingestions_queues: Vec::new(), + // temp_storage: Vec::new(), + // ingestions_status_tables: Vec::new(), + // last_update: None, } } - fn is_not_applicable(&self) -> bool { - self.secured_ready_for_aggregation_queues.is_empty() - || self.failed_ingestions_queues.is_empty() - || self.successful_ingestions_queues.is_empty() - || self.temp_storage.is_empty() - || self.ingestions_status_tables.is_empty() - } - - // TODO: figure out refresh logic - // async fn refresh(&mut self) { - // self.get_ingest_client_resources().await - // // let interval = tokio::time::interval(self.refresh_period); - // // loop { - // // match self.get_ingest_client_resources(self.client.clone()).await { - // // Ok(_) => todo!(), - // // Err(e) => println!("Error: {}", e), - // // }; - - // // interval.tick().await; - // // } - - // // if self.last_update.is_none() - // // || self.last_update.unwrap().elapsed() > self.refresh_period - // // || self.is_not_applicable() - // // { - // // self.get_ingest_client_resources(client).await?; - // // self.last_update = Some(Instant::now()); - // // } - // // Ok(()) - // } - - // async fn refresh(&mut self, client: KustoClient) -> Result<()> { - // if self.last_update.is_none() - // || self.last_update.unwrap().elapsed() > self.refresh_period - // || self.is_not_applicable() - // { - // self.get_ingest_client_resources(client).await?; - // self.last_update = Some(Instant::now()); - // } - // Ok(()) - // } - - async fn get_ingest_client_resources(&mut self) -> Result<()> { - let results = self - .client + // TODO: Logic to get the Kusto identity token from Kusto management endpoint - handle validation here + async fn execute_kql_mgmt_query(client: KustoClient) -> Result { + let results = client .execute_command("NetDefaultDB", ".get ingestion resources", None) .await?; let table = results.tables.first().unwrap(); - self.secured_ready_for_aggregation_queues = - get_resource_by_name(table, "SecuredReadyForAggregationQueue".to_string()); - self.failed_ingestions_queues = - get_resource_by_name(table, "FailedIngestionsQueue".to_string()); - self.successful_ingestions_queues = - get_resource_by_name(table, "SuccessfulIngestionsQueue".to_string()); - self.temp_storage = get_resource_by_name(table, "TempStorage".to_string()); - self.ingestions_status_tables = - get_resource_by_name(table, "IngestionsStatusTable".to_string()); - - Ok(()) - } -} + println!("table: {:#?}", table); + RawIngestClientResources::try_from(table) + } + + async fn get(&self) -> Result<(RawIngestClientResources, Instant)> { + let kusto_response = self.kusto_response.read().await; + if !kusto_response.is_expired() { + if let Some(inner_value) = kusto_response.get() { + return Ok(( + inner_value.clone(), + kusto_response.get_last_updated().clone(), + )); + } + } + // otherwise, drop the read lock and get a write lock to refresh the token + drop(kusto_response); + let mut kusto_response = self.kusto_response.write().await; + + // check again in case another thread refreshed the token while we were + // waiting on the write lock + if let Some(inner_value) = kusto_response.get() { + return Ok(( + inner_value.clone(), + kusto_response.get_last_updated().clone(), + )); + } -pub type KustoIdentityToken = String; -#[derive(Debug, Clone)] -pub struct AuthorizationContext { - client: KustoClient, - pub kusto_identity_token: KustoIdentityToken, - last_update: Option, - refresh_period: Duration, -} + let raw_ingest_client_resources = Self::execute_kql_mgmt_query(self.client.clone()).await?; + let last_updated = Instant::now(); + kusto_response.update_with_time( + Some(raw_ingest_client_resources.clone()), + last_updated.clone(), + ); -impl AuthorizationContext { - pub fn new(client: KustoClient, refresh_period: Duration) -> Self { - Self { - client, - kusto_identity_token: String::new(), - last_update: None, - refresh_period, - } + Ok((raw_ingest_client_resources, last_updated)) } - // TODO: figure out refresh logic - // Make this spawn a tokio task to refresh the token based on elapsed time - async fn refresh(&mut self, client: KustoClient) -> Result<()> { - if self.last_update.is_none() - || self.kusto_identity_token.chars().all(char::is_whitespace) - || self.last_update.unwrap().elapsed() > self.refresh_period - { - self.get_authorization_context(client).await?; - self.last_update = Some(Instant::now()); + pub async fn get_ingestion_queues(&self) -> Result> { + let secured_ready_for_aggregation_queues = + self.secured_ready_for_aggregation_queues.read().await; + + if !secured_ready_for_aggregation_queues.is_expired() { + let vecs = secured_ready_for_aggregation_queues.get(); + if !vecs.is_empty() { + return Ok(vecs.clone()); + } } - Ok(()) - } - async fn get_authorization_context(&mut self, client: KustoClient) -> Result<()> { - let results = client - .execute_command("NetDefaultDB", ".get kusto identity token", None) - .await?; - let table = results.tables.first().unwrap(); + drop(secured_ready_for_aggregation_queues); + let mut secured_ready_for_aggregation_queues = + self.secured_ready_for_aggregation_queues.write().await; - println!("table: {:#?}", table); + let vecs = secured_ready_for_aggregation_queues.get(); + if !vecs.is_empty() { + return Ok(vecs.clone()); + } - self.kusto_identity_token = table - .rows - .first() - .unwrap() - .first() - .unwrap() - .as_str() - .unwrap() - .to_string(); - - Ok(()) - } + let (raw_ingest_client_resources, last_updated) = self.get().await?; + let queue_uris = raw_ingest_client_resources.secured_ready_for_aggregation_queues; + let queue_clients: Vec = + queue_uris.iter().map(|q| QueueClient::from(q)).collect(); + + secured_ready_for_aggregation_queues.update_with_time(queue_clients.clone(), last_updated); - pub async fn kusto_identity_token( - &mut self, - client: KustoClient, - ) -> Result { - self.refresh(client).await?; - Ok(self.kusto_identity_token.clone()) + Ok(queue_clients) } } +pub type KustoIdentityToken = String; + pub struct ResourceManager { - // client: KustoClient, - pub ingest_client_resources: IngestClientResources, - pub authorization_context: AuthorizationContext, + ingest_client_resources: Arc, + authorization_context: Arc, } impl ResourceManager { - pub fn new(client: KustoClient, refresh_period: Duration) -> Self { + pub fn new(client: KustoClient) -> Self { Self { - ingest_client_resources: IngestClientResources::new(client.clone(), refresh_period), - authorization_context: AuthorizationContext::new(client, refresh_period), + ingest_client_resources: Arc::new(IngestClientResources::new(client.clone())), + authorization_context: Arc::new(AuthorizationContext::new(client)), } } - // pub async fn secured_ready_for_aggregation_queues(&mut self) -> Result> { - pub async fn secured_ready_for_aggregation_queues(&mut self) -> Result> { - // TODO: proper refresh and caching logic so we don't need to generate new clients every time - self.ingest_client_resources - .get_ingest_client_resources() - .await?; - - // We should return Azure SDK QueueClient's here. - // Although it's recommended to share the same transport, we can't as the storage credentials (SAS tokens) differ per queue. - // So the best we can do is store the individual QueueClient's so multiple requests - - let queue_uris = self - .ingest_client_resources - .secured_ready_for_aggregation_queues - .clone(); - - Ok(queue_uris.iter().map(|q| QueueClient::from(q)).collect()) + pub async fn secured_ready_for_aggregation_queues(&self) -> Result> { + self.ingest_client_resources.get_ingestion_queues().await } - pub async fn failed_ingestions_queues(&mut self) -> Result> { - // TODO: proper refresh and caching logic so we don't need to generate new clients every time - self.ingest_client_resources - .get_ingest_client_resources() - .await?; + // pub async fn failed_ingestions_queues(&mut self) -> Result> { + // // TODO: proper refresh and caching logic so we don't need to generate new clients every time + // self.ingest_client_resources + // .get_ingest_client_resources() + // .await?; - let queue_uris = self - .ingest_client_resources - .failed_ingestions_queues - .clone(); + // let queue_uris = self + // .ingest_client_resources + // .failed_ingestions_queues + // .clone(); - Ok(queue_uris.iter().map(|q| QueueClient::from(q)).collect()) - } + // Ok(queue_uris.iter().map(|q| QueueClient::from(q)).collect()) + // } - pub async fn successful_ingestions_queues(&mut self) -> Result> { - // TODO: proper refresh and caching logic so we don't need to generate new clients every time - self.ingest_client_resources - .get_ingest_client_resources() - .await?; + // pub async fn successful_ingestions_queues(&mut self) -> Result> { + // // TODO: proper refresh and caching logic so we don't need to generate new clients every time + // self.ingest_client_resources + // .get_ingest_client_resources() + // .await?; - let queue_uris = self - .ingest_client_resources - .successful_ingestions_queues - .clone(); + // let queue_uris = self + // .ingest_client_resources + // .successful_ingestions_queues + // .clone(); - Ok(queue_uris.iter().map(|q| QueueClient::from(q)).collect()) - } + // Ok(queue_uris.iter().map(|q| QueueClient::from(q)).collect()) + // } - pub async fn temp_storage(&mut self) -> Result> { - // TODO: proper refresh and caching logic so we don't need to generate new clients every time - self.ingest_client_resources - .get_ingest_client_resources() - .await?; + // pub async fn temp_storage(&mut self) -> Result> { + // // TODO: proper refresh and caching logic so we don't need to generate new clients every time + // self.ingest_client_resources + // .get_ingest_client_resources() + // .await?; - let container_uris = self.ingest_client_resources.temp_storage.clone(); + // let container_uris = self.ingest_client_resources.temp_storage.clone(); - Ok(container_uris - .iter() - .map(|c| ContainerClient::from(c)) - .collect()) - } + // Ok(container_uris + // .iter() + // .map(|c| ContainerClient::from(c)) + // .collect()) + // } // pub async fn ingestions_status_tables( // &mut self, // client: KustoClient, // ) -> Result> { - // self.refresh(client).await?; - // Ok(self.ingestions_status_tables.clone()) + // unimplemented!() // } // pub fn retrieve_service_type(self) -> ServiceType { // unimplemented!() // } - pub async fn authorization_context(&mut self) -> Result<&KustoIdentityToken> { - // TODO: proper refresh and caching logic so we don't need to query Kusto for the token every time - self.authorization_context - .get_authorization_context(self.ingest_client_resources.client.clone()) - .await?; - - Ok(&self.authorization_context.kusto_identity_token) + pub async fn authorization_context(&self) -> Result { + self.authorization_context.get().await } } diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs new file mode 100644 index 0000000..f65182c --- /dev/null +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -0,0 +1,78 @@ +use std::sync::Arc; + +use anyhow::Result; +use azure_kusto_data::prelude::KustoClient; +use tokio::sync::RwLock; + +use super::cache::{Cached, Refreshing}; +use super::RESOURCE_REFRESH_PERIOD; + +pub type KustoIdentityToken = String; +#[derive(Debug, Clone)] +pub struct AuthorizationContext { + client: KustoClient, + auth_context_cache: Refreshing>, +} + +impl AuthorizationContext { + pub fn new(client: KustoClient) -> Self { + Self { + client, + auth_context_cache: Arc::new(RwLock::new(Cached::new(None, RESOURCE_REFRESH_PERIOD))), + } + } + + // Logic to get the Kusto identity token from Kusto management endpoint - handle validation here + async fn execute_kql_mgmt_query(client: KustoClient) -> Result { + let results = client + .execute_command("NetDefaultDB", ".get kusto identity token", None) + .await?; + // TODO: any other checks, plus error handling + let table = results.tables.first().unwrap(); + + println!("table: {:#?}", table); + + // TODO: any other checks, plus error handling + let kusto_identity_token = table + .rows + .first() + .unwrap() + .first() + .unwrap() + .as_str() + .unwrap() + .to_string(); + + if kusto_identity_token.chars().all(char::is_whitespace) { + return Err(anyhow::anyhow!("Kusto identity token is empty")); + } + + println!("kusto_identity_token: {:#?}", kusto_identity_token); + + Ok(kusto_identity_token) + } + + // handle caching here + pub async fn get(&self) -> Result { + let auth_context_cache = self.auth_context_cache.read().await; + if !auth_context_cache.is_expired() { + if let Some(inner_value) = auth_context_cache.get() { + return Ok(inner_value.clone()); + } + } + // otherwise, drop the read lock and get a write lock to refresh the token + drop(auth_context_cache); + let mut auth_context_cache = self.auth_context_cache.write().await; + + // check again in case another thread refreshed the token while we were + // waiting on the write lock + if let Some(inner_value) = auth_context_cache.get() { + return Ok(inner_value.clone()); + } + + let token = Self::execute_kql_mgmt_query(self.client.clone()).await?; + auth_context_cache.update(Some(token.clone())); + + Ok(token) + } +} diff --git a/azure-kusto-ingest/src/resource_manager/cache.rs b/azure-kusto-ingest/src/resource_manager/cache.rs new file mode 100644 index 0000000..a0334c0 --- /dev/null +++ b/azure-kusto-ingest/src/resource_manager/cache.rs @@ -0,0 +1,47 @@ +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; + +use tokio::sync::RwLock; + +#[derive(Debug, Clone)] +pub struct Cached { + inner: T, + last_updated: Instant, + refresh_period: Duration, +} + +impl Cached { + pub fn new(inner: T, refresh_period: Duration) -> Self { + Self { + inner, + last_updated: Instant::now(), + refresh_period, + } + } + + pub fn get(&self) -> &T { + &self.inner + } + + pub fn get_last_updated(&self) -> &Instant { + &self.last_updated + } + + pub fn is_expired(&self) -> bool { + self.last_updated.elapsed() > self.refresh_period + } + + pub fn update(&mut self, inner: T) { + self.inner = inner; + self.last_updated = Instant::now(); + } + + pub fn update_with_time(&mut self, inner: T, last_updated: Instant) { + self.inner = inner; + self.last_updated = last_updated; + } +} + +pub type Refreshing = Arc>>; diff --git a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs new file mode 100644 index 0000000..91315ef --- /dev/null +++ b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs @@ -0,0 +1,68 @@ +use super::resource_uri::ResourceUri; +use anyhow::Result; +use azure_kusto_data::models::TableV1; + +#[derive(Debug, Clone)] +pub struct RawIngestClientResources { + pub secured_ready_for_aggregation_queues: Vec, + pub failed_ingestions_queues: Vec, + pub successful_ingestions_queues: Vec, + pub temp_storage: Vec, + pub ingestions_status_tables: Vec, +} + +impl RawIngestClientResources { + fn get_resource_by_name(table: &TableV1, resource_name: String) -> Result> { + let storage_root_index = table + .columns + .iter() + .position(|c| c.column_name == "StorageRoot") + .unwrap(); + let resource_type_name_index = table + .columns + .iter() + .position(|c| c.column_name == "ResourceTypeName") + .unwrap(); + + println!("table: {:#?}", table); + let resource_uris: Result> = table + .rows + .iter() + .filter(|r| r[resource_type_name_index] == resource_name) + .map(|r| { + ResourceUri::try_from( + r[storage_root_index] + .as_str() + .expect("We should get result here") + .to_string(), + ) + }) + .collect(); + + resource_uris + } +} + +impl TryFrom<&TableV1> for RawIngestClientResources { + type Error = anyhow::Error; + + fn try_from(table: &TableV1) -> std::result::Result { + let secured_ready_for_aggregation_queues = + Self::get_resource_by_name(table, "SecuredReadyForAggregationQueue".to_string())?; + let failed_ingestions_queues = + Self::get_resource_by_name(table, "FailedIngestionsQueue".to_string())?; + let successful_ingestions_queues = + Self::get_resource_by_name(table, "SuccessfulIngestionsQueue".to_string())?; + let temp_storage = Self::get_resource_by_name(table, "TempStorage".to_string())?; + let ingestions_status_tables = + Self::get_resource_by_name(table, "IngestionsStatusTable".to_string())?; + + Ok(Self { + secured_ready_for_aggregation_queues, + failed_ingestions_queues, + successful_ingestions_queues, + temp_storage, + ingestions_status_tables, + }) + } +} diff --git a/azure-kusto-ingest/src/resource_manager/resource_uri.rs b/azure-kusto-ingest/src/resource_manager/resource_uri.rs new file mode 100644 index 0000000..7d766e8 --- /dev/null +++ b/azure-kusto-ingest/src/resource_manager/resource_uri.rs @@ -0,0 +1,86 @@ +use azure_storage::StorageCredentials; +use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient}; +use azure_storage_queues::{QueueClient, QueueServiceClientBuilder}; +use url::Url; + +use anyhow::Result; + +#[derive(Debug, Clone)] +pub struct ResourceUri { + uri: String, + service_uri: String, + object_name: String, + sas_token: StorageCredentials, +} + +impl TryFrom for ResourceUri { + type Error = anyhow::Error; + + fn try_from(uri: String) -> Result { + println!("uri: {:#?}", uri); + let parsed_uri = Url::parse(&uri)?; + println!("parsed_uri: {:#?}", parsed_uri); + + let service_uri = parsed_uri.scheme().to_string() + + "://" + + parsed_uri.host_str().expect("We should get result here"); + let object_name = parsed_uri + .path() + .trim_start() + .trim_start_matches("/") + .to_string(); + let sas_token = parsed_uri + .query() + .expect("Returned URI should contain SAS token as query") + .to_string(); + let sas_token = StorageCredentials::sas_token(sas_token)?; + + Ok(Self { + uri, + service_uri, + object_name, + sas_token, + }) + } +} + +impl ResourceUri { + pub fn uri(&self) -> &str { + self.uri.as_str() + } + + pub fn service_uri(&self) -> &str { + self.service_uri.as_str() + } + + pub fn object_name(&self) -> &str { + self.object_name.as_str() + } + + pub fn sas_token(&self) -> &StorageCredentials { + &self.sas_token + } +} + +impl From<&ResourceUri> for QueueClient { + fn from(resource_uri: &ResourceUri) -> Self { + let queue_service = + QueueServiceClientBuilder::with_location(azure_storage::CloudLocation::Custom { + uri: resource_uri.service_uri().to_string(), + credentials: resource_uri.sas_token().clone(), + }) + .build(); + + queue_service.queue_client(resource_uri.object_name()) + } +} + +impl From<&ResourceUri> for ContainerClient { + fn from(resource_uri: &ResourceUri) -> Self { + ClientBuilder::with_location(azure_storage::CloudLocation::Custom { + uri: resource_uri.service_uri().to_string(), + credentials: resource_uri.sas_token().clone(), + }) + .container_client(resource_uri.object_name()) + } +} From d7eae304d0e026ea88d7edab9f2cacf4f290b542 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Mon, 21 Aug 2023 16:31:24 +0100 Subject: [PATCH 03/44] More on caching logic --- azure-kusto-ingest/Cargo.toml | 3 +- azure-kusto-ingest/src/descriptors.rs | 4 +- azure-kusto-ingest/src/queued_ingest.rs | 15 +- azure-kusto-ingest/src/resource_manager.rs | 292 +++++++++++------- .../resource_manager/authorization_context.rs | 22 +- .../src/resource_manager/cache.rs | 11 +- .../ingest_client_resources.rs | 1 - .../src/resource_manager/resource_uri.rs | 59 ++-- 8 files changed, 247 insertions(+), 160 deletions(-) diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index c9b178a..e443f19 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -19,5 +19,6 @@ azure_core = "0.13" azure_storage = "0.13" azure_storage_blobs = "0.13" azure_storage_queues = "0.13" +azure_data_tables = "0.13" + time = { version = "0.3", features = ["serde"] } -azure_identity = "0.13.0" diff --git a/azure-kusto-ingest/src/descriptors.rs b/azure-kusto-ingest/src/descriptors.rs index bd4a7bf..9d37658 100644 --- a/azure-kusto-ingest/src/descriptors.rs +++ b/azure-kusto-ingest/src/descriptors.rs @@ -48,8 +48,8 @@ impl BlobDescriptor { uri.set_query(Some("sas_token")); uri.to_string() }, - Some(BlobAuth::UserAssignedManagedIdentity(identity)) => { - format!("{};managed_identity={}", self.uri, identity) + Some(BlobAuth::UserAssignedManagedIdentity(object_id)) => { + format!("{};managed_identity={}", self.uri, object_id) }, Some(BlobAuth::SystemAssignedManagedIdentity) => { format!("{};managed_identity=system", self.uri) diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index ea48ea8..c9220fa 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -10,6 +10,7 @@ use crate::ingestion_properties::IngestionProperties; use crate::resource_manager::ResourceManager; use crate::result::{IngestionResult, IngestionStatus}; +#[derive(Clone)] pub struct QueuedIngestClient { // The KustoClient is used to get the ingestion resources, it should be a client against the ingestion cluster endpoint // kusto_client: KustoClient, @@ -34,10 +35,10 @@ impl QueuedIngestClient { .resource_manager .secured_ready_for_aggregation_queues() .await?; - println!("queues: {:#?}", ingestion_queues); + // println!("queues: {:#?}", ingestion_queues); let auth_context = self.resource_manager.authorization_context().await?; - println!("auth_context: {:#?}\n", auth_context); + // println!("auth_context: {:#?}\n", auth_context); let message = QueuedIngestionMessage::new( blob_descriptor.clone(), @@ -45,21 +46,21 @@ impl QueuedIngestClient { auth_context, ); - println!("message as struct: {:#?}\n", message); + // println!("message as struct: {:#?}\n", message); // TODO: pick a random queue from the queue clients returned by the resource manager let queue_client = ingestion_queues.first().unwrap().clone(); - println!("queue_client: {:#?}\n", queue_client); + // println!("queue_client: {:#?}\n", queue_client); let message = serde_json::to_string(&message).unwrap(); - println!("message as string: {}\n", message); + // println!("message as string: {}\n", message); // Base64 encode the ingestion message let message = base64::encode(&message); - println!("message as base64 encoded string: {}\n", message); + // println!("message as base64 encoded string: {}\n", message); let resp = queue_client.put_message(message).await?; - println!("resp: {:#?}\n", resp); + // println!("resp: {:#?}\n", resp); Ok(IngestionResult::new( IngestionStatus::Queued, diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index 8f1fc9f..e57ade3 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -1,7 +1,4 @@ -use std::{ - sync::Arc, - time::{Duration, Instant}, -}; +use std::{sync::Arc, time::Duration}; pub mod authorization_context; pub mod cache; @@ -12,47 +9,59 @@ use anyhow::{Ok, Result}; use azure_kusto_data::prelude::KustoClient; use tokio::sync::RwLock; +use azure_data_tables::prelude::TableClient; +use azure_storage_blobs::prelude::ContainerClient; use azure_storage_queues::QueueClient; use self::{ authorization_context::AuthorizationContext, cache::{Cached, Refreshing}, + resource_uri::ResourceUri, }; use self::ingest_client_resources::RawIngestClientResources; pub(crate) const RESOURCE_REFRESH_PERIOD: Duration = Duration::from_secs(60 * 60); +pub struct InnerIngestClientResources { + kusto_response: Option, + secured_ready_for_aggregation_queues: Vec, + temp_storage: Vec, + ingestions_status_tables: Vec, + successful_ingestions_queues: Vec, + failed_ingestions_queues: Vec, +} + +impl InnerIngestClientResources { + pub fn new() -> Self { + Self { + kusto_response: None, + secured_ready_for_aggregation_queues: Vec::new(), + temp_storage: Vec::new(), + ingestions_status_tables: Vec::new(), + successful_ingestions_queues: Vec::new(), + failed_ingestions_queues: Vec::new(), + } + } +} + pub struct IngestClientResources { client: KustoClient, - kusto_response: Refreshing>, - secured_ready_for_aggregation_queues: Refreshing>, - // secured_ready_for_aggregation_queues: Vec, - // failed_ingestions_queues: Vec, - // successful_ingestions_queues: Vec, - // temp_storage: Vec, - // ingestions_status_tables: Vec, + resources: Refreshing, } impl IngestClientResources { pub fn new(client: KustoClient) -> Self { Self { client, - kusto_response: Arc::new(RwLock::new(Cached::new(None, RESOURCE_REFRESH_PERIOD))), - secured_ready_for_aggregation_queues: Arc::new(RwLock::new(Cached::new( - Vec::new(), + resources: Arc::new(RwLock::new(Cached::new( + InnerIngestClientResources::new(), RESOURCE_REFRESH_PERIOD, ))), - // secured_ready_for_aggregation_queues: Vec::new(), - // failed_ingestions_queues: Vec::new(), - // successful_ingestions_queues: Vec::new(), - // temp_storage: Vec::new(), - // ingestions_status_tables: Vec::new(), - // last_update: None, } } - // TODO: Logic to get the Kusto identity token from Kusto management endpoint - handle validation here + // TODO: Logic to get the Kusto identity token from Kusto management endpoint - handle any validation of the response from the query here async fn execute_kql_mgmt_query(client: KustoClient) -> Result { let results = client .execute_command("NetDefaultDB", ".get ingestion resources", None) @@ -63,67 +72,161 @@ impl IngestClientResources { RawIngestClientResources::try_from(table) } - async fn get(&self) -> Result<(RawIngestClientResources, Instant)> { - let kusto_response = self.kusto_response.read().await; - if !kusto_response.is_expired() { - if let Some(inner_value) = kusto_response.get() { - return Ok(( - inner_value.clone(), - kusto_response.get_last_updated().clone(), - )); + fn create_clients_vec(resource_uris: &Vec) -> Vec + where + T: From, + { + resource_uris.iter().map(|uri| T::from(uri.clone())).collect() + } + + fn update_clients_vec( + current_resources: Vec, + resource_uris: Vec, + ) -> Vec + where + T: From, + { + if !current_resources.is_empty() { + Self::create_clients_vec(&resource_uris) + } else { + current_resources + } + } + + // 1. Get the kusto response + // 2. Update the kusto response, and the dependent resources if they are not empty, do this by a hashmap on the URI returned + // 3. Update the time + // 4. Return the kusto response + // As such, at any one time it is guaranteed that anything that has been queried before will be available and up to date + // Anything that has not been queried before will be available to create, but not as Azure clients until explicitly queried + async fn update_from_kusto(&self) -> Result { + let resources = self.resources.read().await; + if !resources.is_expired() { + if let Some(ref inner_value) = resources.get().kusto_response { + return Ok(inner_value.clone()); } } - // otherwise, drop the read lock and get a write lock to refresh the token - drop(kusto_response); - let mut kusto_response = self.kusto_response.write().await; - - // check again in case another thread refreshed the token while we were - // waiting on the write lock - if let Some(inner_value) = kusto_response.get() { - return Ok(( - inner_value.clone(), - kusto_response.get_last_updated().clone(), - )); + // otherwise, drop the read lock and get a write lock to refresh the kusto response + drop(resources); + let mut resources = self.resources.write().await; + + // check again in case another thread refreshed the while we were waiting on the write lock + if let Some(inner_value) = &resources.get().kusto_response { + return Ok(inner_value.clone()); } let raw_ingest_client_resources = Self::execute_kql_mgmt_query(self.client.clone()).await?; - let last_updated = Instant::now(); - kusto_response.update_with_time( - Some(raw_ingest_client_resources.clone()), - last_updated.clone(), + let mut_resources = resources.get_mut(); + + mut_resources.kusto_response = Some(raw_ingest_client_resources.clone()); + + // This is ugly... the logic is to check whether we have already created clients previously, and if so, updating them + mut_resources.secured_ready_for_aggregation_queues = Self::update_clients_vec( + mut_resources.secured_ready_for_aggregation_queues.clone(), + raw_ingest_client_resources.secured_ready_for_aggregation_queues.clone(), ); - - Ok((raw_ingest_client_resources, last_updated)) + mut_resources.temp_storage = Self::update_clients_vec( + mut_resources.temp_storage.clone(), + raw_ingest_client_resources.temp_storage.clone(), + ); + mut_resources.ingestions_status_tables = Self::update_clients_vec( + mut_resources.ingestions_status_tables.clone(), + raw_ingest_client_resources.ingestions_status_tables.clone(), + ); + mut_resources.successful_ingestions_queues = Self::update_clients_vec( + mut_resources.successful_ingestions_queues.clone(), + raw_ingest_client_resources.successful_ingestions_queues.clone(), + ); + mut_resources.failed_ingestions_queues = Self::update_clients_vec( + mut_resources.failed_ingestions_queues.clone(), + raw_ingest_client_resources.failed_ingestions_queues.clone(), + ); + Ok(raw_ingest_client_resources) } - pub async fn get_ingestion_queues(&self) -> Result> { - let secured_ready_for_aggregation_queues = - self.secured_ready_for_aggregation_queues.read().await; - - if !secured_ready_for_aggregation_queues.is_expired() { - let vecs = secured_ready_for_aggregation_queues.get(); + // Logic here + // Get a read lock, try and return the secured ready for aggregation queues + // If they are not empty, return them + // Otherwise, drop the read lock and get a write lock + // Check again if they are empty, if not return them assuming something has changed in between + // Otherwise, get the kusto response, create the queues + // Store the queues, and also return them + pub async fn get_clients( + &self, + field_fn: F, + create_client_vec_fn: Fx, + set_value: Fy, + ) -> Result> + where + F: Fn(&InnerIngestClientResources) -> &Vec, + Fx: Fn(&RawIngestClientResources) -> &Vec, + Fy: Fn(&mut InnerIngestClientResources, &Vec), + T: From + Clone, + { + let resources = self.resources.read().await; + if !resources.is_expired() { + let vecs = field_fn(resources.get()); if !vecs.is_empty() { return Ok(vecs.clone()); } } - drop(secured_ready_for_aggregation_queues); - let mut secured_ready_for_aggregation_queues = - self.secured_ready_for_aggregation_queues.write().await; + drop(resources); - let vecs = secured_ready_for_aggregation_queues.get(); + let raw_ingest_client_resources = self.update_from_kusto().await?; + + let mut resources = self.resources.write().await; + let vecs = field_fn(resources.get_mut()); if !vecs.is_empty() { return Ok(vecs.clone()); } - let (raw_ingest_client_resources, last_updated) = self.get().await?; - let queue_uris = raw_ingest_client_resources.secured_ready_for_aggregation_queues; - let queue_clients: Vec = - queue_uris.iter().map(|q| QueueClient::from(q)).collect(); + // First time, so create the resources outside + let mut_resources = resources.get_mut(); + let new_resources = Self::create_clients_vec(create_client_vec_fn(&raw_ingest_client_resources)); + set_value(mut_resources, &new_resources); - secured_ready_for_aggregation_queues.update_with_time(queue_clients.clone(), last_updated); + Ok(new_resources) + } - Ok(queue_clients) + pub async fn get_secured_ready_for_aggregation_queues(&self) -> Result> { + self.get_clients( + |resources| &resources.secured_ready_for_aggregation_queues, + |resources| &resources.secured_ready_for_aggregation_queues, + |mut_resources, new_resources| mut_resources.secured_ready_for_aggregation_queues = new_resources.clone(), + ).await + } + + pub async fn get_temp_storage(&self) -> Result> { + self.get_clients( + |resources| &resources.temp_storage, + |resources| &resources.temp_storage, + |mut_resources, new_resources| mut_resources.temp_storage = new_resources.clone(), + ).await + } + + pub async fn get_ingestions_status_tables(&self) -> Result> { + self.get_clients( + |resources| &resources.ingestions_status_tables, + |resources| &resources.ingestions_status_tables, + |mut_resources, new_resources| mut_resources.ingestions_status_tables = new_resources.clone(), + ).await + } + + pub async fn get_successful_ingestions_queues(&self) -> Result> { + self.get_clients( + |resources| &resources.successful_ingestions_queues, + |resources| &resources.successful_ingestions_queues, + |mut_resources, new_resources| mut_resources.successful_ingestions_queues = new_resources.clone(), + ).await + } + + pub async fn get_failed_ingestions_queues(&self) -> Result> { + self.get_clients( + |resources| &resources.failed_ingestions_queues, + |resources| &resources.failed_ingestions_queues, + |mut_resources, new_resources| mut_resources.failed_ingestions_queues = new_resources.clone(), + ).await } } @@ -143,57 +246,30 @@ impl ResourceManager { } pub async fn secured_ready_for_aggregation_queues(&self) -> Result> { - self.ingest_client_resources.get_ingestion_queues().await + self.ingest_client_resources + .get_secured_ready_for_aggregation_queues() + .await } - // pub async fn failed_ingestions_queues(&mut self) -> Result> { - // // TODO: proper refresh and caching logic so we don't need to generate new clients every time - // self.ingest_client_resources - // .get_ingest_client_resources() - // .await?; - - // let queue_uris = self - // .ingest_client_resources - // .failed_ingestions_queues - // .clone(); - - // Ok(queue_uris.iter().map(|q| QueueClient::from(q)).collect()) - // } - - // pub async fn successful_ingestions_queues(&mut self) -> Result> { - // // TODO: proper refresh and caching logic so we don't need to generate new clients every time - // self.ingest_client_resources - // .get_ingest_client_resources() - // .await?; - - // let queue_uris = self - // .ingest_client_resources - // .successful_ingestions_queues - // .clone(); - - // Ok(queue_uris.iter().map(|q| QueueClient::from(q)).collect()) - // } - - // pub async fn temp_storage(&mut self) -> Result> { - // // TODO: proper refresh and caching logic so we don't need to generate new clients every time - // self.ingest_client_resources - // .get_ingest_client_resources() - // .await?; + pub async fn temp_storage(&self) -> Result> { + self.ingest_client_resources.get_temp_storage().await + } - // let container_uris = self.ingest_client_resources.temp_storage.clone(); + pub async fn ingestions_status_tables(&self) -> Result> { + self.ingest_client_resources.get_ingestions_status_tables().await + } - // Ok(container_uris - // .iter() - // .map(|c| ContainerClient::from(c)) - // .collect()) - // } + pub async fn successful_ingestions_queues(&self) -> Result> { + self.ingest_client_resources + .get_successful_ingestions_queues() + .await + } - // pub async fn ingestions_status_tables( - // &mut self, - // client: KustoClient, - // ) -> Result> { - // unimplemented!() - // } + pub async fn failed_ingestions_queues(&self) -> Result> { + self.ingest_client_resources + .get_failed_ingestions_queues() + .await + } // pub fn retrieve_service_type(self) -> ServiceType { // unimplemented!() diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs index f65182c..397a990 100644 --- a/azure-kusto-ingest/src/resource_manager/authorization_context.rs +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -8,6 +8,8 @@ use super::cache::{Cached, Refreshing}; use super::RESOURCE_REFRESH_PERIOD; pub type KustoIdentityToken = String; + +/// Logic to obtain a Kusto identity token from the management endpoint #[derive(Debug, Clone)] pub struct AuthorizationContext { client: KustoClient, @@ -22,17 +24,14 @@ impl AuthorizationContext { } } - // Logic to get the Kusto identity token from Kusto management endpoint - handle validation here + /// Executes a KQL query to get the Kusto identity token from the management endpoint async fn execute_kql_mgmt_query(client: KustoClient) -> Result { let results = client .execute_command("NetDefaultDB", ".get kusto identity token", None) .await?; - // TODO: any other checks, plus error handling - let table = results.tables.first().unwrap(); - - println!("table: {:#?}", table); - // TODO: any other checks, plus error handling + // TODO: improve validation checks here + let table = results.tables.first().unwrap(); let kusto_identity_token = table .rows .first() @@ -47,25 +46,24 @@ impl AuthorizationContext { return Err(anyhow::anyhow!("Kusto identity token is empty")); } - println!("kusto_identity_token: {:#?}", kusto_identity_token); - Ok(kusto_identity_token) } - // handle caching here + /// Fetches the latest Kusto identity token, either from the cache or by executing a KQL query pub async fn get(&self) -> Result { + // First, attempt to get the return the token from the cache let auth_context_cache = self.auth_context_cache.read().await; if !auth_context_cache.is_expired() { if let Some(inner_value) = auth_context_cache.get() { return Ok(inner_value.clone()); } } - // otherwise, drop the read lock and get a write lock to refresh the token + // Drop the read lock and get a write lock to refresh the token drop(auth_context_cache); let mut auth_context_cache = self.auth_context_cache.write().await; - // check again in case another thread refreshed the token while we were - // waiting on the write lock + // Again attempt to return from cache, check is done in case another thread + // refreshed the token while we were waiting on the write lock if let Some(inner_value) = auth_context_cache.get() { return Ok(inner_value.clone()); } diff --git a/azure-kusto-ingest/src/resource_manager/cache.rs b/azure-kusto-ingest/src/resource_manager/cache.rs index a0334c0..b850a4e 100644 --- a/azure-kusto-ingest/src/resource_manager/cache.rs +++ b/azure-kusto-ingest/src/resource_manager/cache.rs @@ -5,6 +5,8 @@ use std::{ use tokio::sync::RwLock; +/// Wrapper around a value that allows for storing when the value was last updated, +/// as well as the period after which it should be refreshed (i.e. expired) #[derive(Debug, Clone)] pub struct Cached { inner: T, @@ -25,8 +27,8 @@ impl Cached { &self.inner } - pub fn get_last_updated(&self) -> &Instant { - &self.last_updated + pub fn get_mut(&mut self) -> &mut T { + &mut self.inner } pub fn is_expired(&self) -> bool { @@ -37,11 +39,6 @@ impl Cached { self.inner = inner; self.last_updated = Instant::now(); } - - pub fn update_with_time(&mut self, inner: T, last_updated: Instant) { - self.inner = inner; - self.last_updated = last_updated; - } } pub type Refreshing = Arc>>; diff --git a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs index 91315ef..000356b 100644 --- a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs +++ b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs @@ -24,7 +24,6 @@ impl RawIngestClientResources { .position(|c| c.column_name == "ResourceTypeName") .unwrap(); - println!("table: {:#?}", table); let resource_uris: Result> = table .rows .iter() diff --git a/azure-kusto-ingest/src/resource_manager/resource_uri.rs b/azure-kusto-ingest/src/resource_manager/resource_uri.rs index 7d766e8..94e14f7 100644 --- a/azure-kusto-ingest/src/resource_manager/resource_uri.rs +++ b/azure-kusto-ingest/src/resource_manager/resource_uri.rs @@ -1,3 +1,4 @@ +use azure_data_tables::{clients::TableServiceClientBuilder, prelude::TableClient}; use azure_storage::StorageCredentials; use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient}; use azure_storage_queues::{QueueClient, QueueServiceClientBuilder}; @@ -5,6 +6,7 @@ use url::Url; use anyhow::Result; +/// Parsing logic of resource URIs as returned by the Kusto management endpoint #[derive(Debug, Clone)] pub struct ResourceUri { uri: String, @@ -13,6 +15,24 @@ pub struct ResourceUri { sas_token: StorageCredentials, } +impl ResourceUri { + pub fn uri(&self) -> &str { + self.uri.as_str() + } + + pub fn service_uri(&self) -> &str { + self.service_uri.as_str() + } + + pub fn object_name(&self) -> &str { + self.object_name.as_str() + } + + pub fn sas_token(&self) -> &StorageCredentials { + &self.sas_token + } +} + impl TryFrom for ResourceUri { type Error = anyhow::Error; @@ -44,26 +64,8 @@ impl TryFrom for ResourceUri { } } -impl ResourceUri { - pub fn uri(&self) -> &str { - self.uri.as_str() - } - - pub fn service_uri(&self) -> &str { - self.service_uri.as_str() - } - - pub fn object_name(&self) -> &str { - self.object_name.as_str() - } - - pub fn sas_token(&self) -> &StorageCredentials { - &self.sas_token - } -} - -impl From<&ResourceUri> for QueueClient { - fn from(resource_uri: &ResourceUri) -> Self { +impl From for QueueClient { + fn from(resource_uri: ResourceUri) -> Self { let queue_service = QueueServiceClientBuilder::with_location(azure_storage::CloudLocation::Custom { uri: resource_uri.service_uri().to_string(), @@ -75,8 +77,8 @@ impl From<&ResourceUri> for QueueClient { } } -impl From<&ResourceUri> for ContainerClient { - fn from(resource_uri: &ResourceUri) -> Self { +impl From for ContainerClient { + fn from(resource_uri: ResourceUri) -> Self { ClientBuilder::with_location(azure_storage::CloudLocation::Custom { uri: resource_uri.service_uri().to_string(), credentials: resource_uri.sas_token().clone(), @@ -84,3 +86,16 @@ impl From<&ResourceUri> for ContainerClient { .container_client(resource_uri.object_name()) } } + +impl From for TableClient { + fn from(resource_uri: ResourceUri) -> Self { + let table_service = + TableServiceClientBuilder::with_location(azure_storage::CloudLocation::Custom { + uri: resource_uri.service_uri().to_string(), + credentials: resource_uri.sas_token().clone(), + }) + .build(); + + table_service.table_client(resource_uri.object_name()) + } +} From b9f2e056f38d027f02b745b444368caf0b253a42 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Wed, 23 Aug 2023 11:39:22 +0100 Subject: [PATCH 04/44] Allow configurability of ClientOptions for different service types --- azure-kusto-ingest/src/queued_ingest.rs | 33 ++++++++++++-- azure-kusto-ingest/src/resource_manager.rs | 44 +++++++++++++------ .../src/resource_manager/resource_uri.rs | 35 +++++++++------ 3 files changed, 82 insertions(+), 30 deletions(-) diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index c9220fa..63f5c7e 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use anyhow::Result; -use azure_core::base64; +use azure_core::{base64, ClientOptions}; use azure_kusto_data::prelude::KustoClient; use crate::descriptors::{BlobDescriptor, FileDescriptor, StreamDescriptor}; @@ -10,16 +10,43 @@ use crate::ingestion_properties::IngestionProperties; use crate::resource_manager::ResourceManager; use crate::result::{IngestionResult, IngestionStatus}; +#[derive(Clone, Default)] +pub struct QueuedIngestClientOptions { + pub queue_service: ClientOptions, + pub blob_service: ClientOptions, + pub table_service: ClientOptions, +} + +impl QueuedIngestClientOptions { + pub fn new() -> Self { + Self::default() + } +} + +impl From for QueuedIngestClientOptions { + /// Creates a `QueuedIngestClientOptions` struct where all the client options are the same + fn from(client_options: ClientOptions) -> Self { + Self { + queue_service: client_options.clone(), + blob_service: client_options.clone(), + table_service: client_options, + } + } +} + #[derive(Clone)] pub struct QueuedIngestClient { // The KustoClient is used to get the ingestion resources, it should be a client against the ingestion cluster endpoint - // kusto_client: KustoClient, resource_manager: Arc, } impl QueuedIngestClient { pub fn new(kusto_client: KustoClient) -> Self { - let resource_manager = Arc::new(ResourceManager::new(kusto_client)); + Self::new_with_client_options(kusto_client, QueuedIngestClientOptions::default()) + } + + pub fn new_with_client_options(kusto_client: KustoClient, options: QueuedIngestClientOptions) -> Self { + let resource_manager = Arc::new(ResourceManager::new(kusto_client, options)); Self { resource_manager } } diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index e57ade3..e168815 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -6,6 +6,7 @@ pub mod ingest_client_resources; pub mod resource_uri; use anyhow::{Ok, Result}; +use azure_core::ClientOptions; use azure_kusto_data::prelude::KustoClient; use tokio::sync::RwLock; @@ -13,10 +14,12 @@ use azure_data_tables::prelude::TableClient; use azure_storage_blobs::prelude::ContainerClient; use azure_storage_queues::QueueClient; +use crate::queued_ingest::QueuedIngestClientOptions; + use self::{ authorization_context::AuthorizationContext, cache::{Cached, Refreshing}, - resource_uri::ResourceUri, + resource_uri::{ResourceUri, ClientFromResourceUri}, }; use self::ingest_client_resources::RawIngestClientResources; @@ -48,16 +51,18 @@ impl InnerIngestClientResources { pub struct IngestClientResources { client: KustoClient, resources: Refreshing, + client_options: QueuedIngestClientOptions, } impl IngestClientResources { - pub fn new(client: KustoClient) -> Self { + pub fn new(client: KustoClient, client_options: QueuedIngestClientOptions) -> Self { Self { client, resources: Arc::new(RwLock::new(Cached::new( InnerIngestClientResources::new(), RESOURCE_REFRESH_PERIOD, ))), + client_options } } @@ -68,26 +73,26 @@ impl IngestClientResources { .await?; let table = results.tables.first().unwrap(); - println!("table: {:#?}", table); RawIngestClientResources::try_from(table) } - fn create_clients_vec(resource_uris: &Vec) -> Vec + fn create_clients_vec(resource_uris: &Vec, client_options: ClientOptions) -> Vec where - T: From, + T: ClientFromResourceUri, { - resource_uris.iter().map(|uri| T::from(uri.clone())).collect() + resource_uris.iter().map(|uri| T::create_client(uri.clone(), client_options.clone())).collect() } fn update_clients_vec( current_resources: Vec, resource_uris: Vec, + client_options: ClientOptions ) -> Vec where - T: From, + T: ClientFromResourceUri, { if !current_resources.is_empty() { - Self::create_clients_vec(&resource_uris) + Self::create_clients_vec(&resource_uris, client_options) } else { current_resources } @@ -124,22 +129,27 @@ impl IngestClientResources { mut_resources.secured_ready_for_aggregation_queues = Self::update_clients_vec( mut_resources.secured_ready_for_aggregation_queues.clone(), raw_ingest_client_resources.secured_ready_for_aggregation_queues.clone(), + self.client_options.queue_service.clone(), ); mut_resources.temp_storage = Self::update_clients_vec( mut_resources.temp_storage.clone(), raw_ingest_client_resources.temp_storage.clone(), + self.client_options.blob_service.clone(), ); mut_resources.ingestions_status_tables = Self::update_clients_vec( mut_resources.ingestions_status_tables.clone(), raw_ingest_client_resources.ingestions_status_tables.clone(), + self.client_options.table_service.clone(), ); mut_resources.successful_ingestions_queues = Self::update_clients_vec( mut_resources.successful_ingestions_queues.clone(), raw_ingest_client_resources.successful_ingestions_queues.clone(), + self.client_options.queue_service.clone(), ); mut_resources.failed_ingestions_queues = Self::update_clients_vec( mut_resources.failed_ingestions_queues.clone(), raw_ingest_client_resources.failed_ingestions_queues.clone(), + self.client_options.queue_service.clone(), ); Ok(raw_ingest_client_resources) } @@ -156,12 +166,13 @@ impl IngestClientResources { field_fn: F, create_client_vec_fn: Fx, set_value: Fy, + client_options: ClientOptions ) -> Result> where F: Fn(&InnerIngestClientResources) -> &Vec, Fx: Fn(&RawIngestClientResources) -> &Vec, Fy: Fn(&mut InnerIngestClientResources, &Vec), - T: From + Clone, + T: ClientFromResourceUri + Clone, { let resources = self.resources.read().await; if !resources.is_expired() { @@ -183,7 +194,7 @@ impl IngestClientResources { // First time, so create the resources outside let mut_resources = resources.get_mut(); - let new_resources = Self::create_clients_vec(create_client_vec_fn(&raw_ingest_client_resources)); + let new_resources = Self::create_clients_vec(create_client_vec_fn(&raw_ingest_client_resources), client_options); set_value(mut_resources, &new_resources); Ok(new_resources) @@ -194,6 +205,7 @@ impl IngestClientResources { |resources| &resources.secured_ready_for_aggregation_queues, |resources| &resources.secured_ready_for_aggregation_queues, |mut_resources, new_resources| mut_resources.secured_ready_for_aggregation_queues = new_resources.clone(), + self.client_options.queue_service.clone(), ).await } @@ -202,6 +214,7 @@ impl IngestClientResources { |resources| &resources.temp_storage, |resources| &resources.temp_storage, |mut_resources, new_resources| mut_resources.temp_storage = new_resources.clone(), + self.client_options.blob_service.clone(), ).await } @@ -210,22 +223,25 @@ impl IngestClientResources { |resources| &resources.ingestions_status_tables, |resources| &resources.ingestions_status_tables, |mut_resources, new_resources| mut_resources.ingestions_status_tables = new_resources.clone(), + self.client_options.table_service.clone(), ).await } - + pub async fn get_successful_ingestions_queues(&self) -> Result> { self.get_clients( |resources| &resources.successful_ingestions_queues, |resources| &resources.successful_ingestions_queues, |mut_resources, new_resources| mut_resources.successful_ingestions_queues = new_resources.clone(), + self.client_options.queue_service.clone(), ).await } - + pub async fn get_failed_ingestions_queues(&self) -> Result> { self.get_clients( |resources| &resources.failed_ingestions_queues, |resources| &resources.failed_ingestions_queues, |mut_resources, new_resources| mut_resources.failed_ingestions_queues = new_resources.clone(), + self.client_options.queue_service.clone(), ).await } } @@ -238,9 +254,9 @@ pub struct ResourceManager { } impl ResourceManager { - pub fn new(client: KustoClient) -> Self { + pub fn new(client: KustoClient, client_options: QueuedIngestClientOptions) -> Self { Self { - ingest_client_resources: Arc::new(IngestClientResources::new(client.clone())), + ingest_client_resources: Arc::new(IngestClientResources::new(client.clone(), client_options)), authorization_context: Arc::new(AuthorizationContext::new(client)), } } diff --git a/azure-kusto-ingest/src/resource_manager/resource_uri.rs b/azure-kusto-ingest/src/resource_manager/resource_uri.rs index 94e14f7..c932053 100644 --- a/azure-kusto-ingest/src/resource_manager/resource_uri.rs +++ b/azure-kusto-ingest/src/resource_manager/resource_uri.rs @@ -1,3 +1,4 @@ +use azure_core::ClientOptions; use azure_data_tables::{clients::TableServiceClientBuilder, prelude::TableClient}; use azure_storage::StorageCredentials; use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient}; @@ -41,18 +42,19 @@ impl TryFrom for ResourceUri { let parsed_uri = Url::parse(&uri)?; println!("parsed_uri: {:#?}", parsed_uri); - let service_uri = parsed_uri.scheme().to_string() - + "://" - + parsed_uri.host_str().expect("We should get result here"); + let service_uri = match parsed_uri.host_str() { + Some(host_str) => parsed_uri.scheme().to_string() + "://" + host_str, + None => return Err(anyhow::anyhow!("Host is missing in the URI")), + }; let object_name = parsed_uri .path() .trim_start() .trim_start_matches("/") .to_string(); - let sas_token = parsed_uri - .query() - .expect("Returned URI should contain SAS token as query") - .to_string(); + let sas_token = match parsed_uri.query() { + Some(query) => query.to_string(), + None => return Err(anyhow::anyhow!("SAS token is missing in the URI as a query parameter")), + }; let sas_token = StorageCredentials::sas_token(sas_token)?; Ok(Self { @@ -64,36 +66,43 @@ impl TryFrom for ResourceUri { } } -impl From for QueueClient { - fn from(resource_uri: ResourceUri) -> Self { +pub trait ClientFromResourceUri { + fn create_client(resource_uri: ResourceUri, client_options: ClientOptions) -> Self; +} + +impl ClientFromResourceUri for QueueClient { + fn create_client(resource_uri: ResourceUri, client_options: ClientOptions) -> Self { let queue_service = QueueServiceClientBuilder::with_location(azure_storage::CloudLocation::Custom { uri: resource_uri.service_uri().to_string(), credentials: resource_uri.sas_token().clone(), }) + .client_options(client_options) .build(); queue_service.queue_client(resource_uri.object_name()) } } -impl From for ContainerClient { - fn from(resource_uri: ResourceUri) -> Self { +impl ClientFromResourceUri for ContainerClient { + fn create_client(resource_uri: ResourceUri, client_options: ClientOptions) -> Self { ClientBuilder::with_location(azure_storage::CloudLocation::Custom { uri: resource_uri.service_uri().to_string(), credentials: resource_uri.sas_token().clone(), }) + .client_options(client_options) .container_client(resource_uri.object_name()) } } -impl From for TableClient { - fn from(resource_uri: ResourceUri) -> Self { +impl ClientFromResourceUri for TableClient { + fn create_client(resource_uri: ResourceUri, client_options: ClientOptions) -> Self { let table_service = TableServiceClientBuilder::with_location(azure_storage::CloudLocation::Custom { uri: resource_uri.service_uri().to_string(), credentials: resource_uri.sas_token().clone(), }) + .client_options(client_options) .build(); table_service.table_client(resource_uri.object_name()) From 0469d586ae036554be0f7cf99fee69cf357e5dfa Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Wed, 23 Aug 2023 11:48:31 +0100 Subject: [PATCH 05/44] cargo fmt --- azure-kusto-ingest/src/descriptors.rs | 12 +-- azure-kusto-ingest/src/lib.rs | 6 +- azure-kusto-ingest/src/queued_ingest.rs | 5 +- azure-kusto-ingest/src/resource_manager.rs | 81 +++++++++++++------ .../resource_manager/authorization_context.rs | 2 +- .../src/resource_manager/cache.rs | 2 +- .../src/resource_manager/resource_uri.rs | 6 +- azure-kusto-ingest/src/result.rs | 4 +- 8 files changed, 78 insertions(+), 40 deletions(-) diff --git a/azure-kusto-ingest/src/descriptors.rs b/azure-kusto-ingest/src/descriptors.rs index 9d37658..d9a2b16 100644 --- a/azure-kusto-ingest/src/descriptors.rs +++ b/azure-kusto-ingest/src/descriptors.rs @@ -1,4 +1,4 @@ -use std::{io::Read, path::PathBuf, fmt::format}; +use std::{fmt::format, io::Read, path::PathBuf}; use azure_storage::StorageCredentials; use url::Url; @@ -10,7 +10,7 @@ pub enum BlobAuth { // adds `;managed_identity=` to the blob path UserAssignedManagedIdentity(String), // adds `;managed_identity=system` to the blob path - SystemAssignedManagedIdentity + SystemAssignedManagedIdentity, } #[derive(Clone, Debug)] @@ -18,7 +18,7 @@ pub struct BlobDescriptor { uri: Url, pub(crate) size: Option, pub(crate) source_id: Uuid, - blob_auth: Option + blob_auth: Option, } impl BlobDescriptor { @@ -47,13 +47,13 @@ impl BlobDescriptor { let mut uri = self.uri.clone(); uri.set_query(Some("sas_token")); uri.to_string() - }, + } Some(BlobAuth::UserAssignedManagedIdentity(object_id)) => { format!("{};managed_identity={}", self.uri, object_id) - }, + } Some(BlobAuth::SystemAssignedManagedIdentity) => { format!("{};managed_identity=system", self.uri) - }, + } None => self.uri.to_string(), } } diff --git a/azure-kusto-ingest/src/lib.rs b/azure-kusto-ingest/src/lib.rs index 1a250ae..d117f44 100644 --- a/azure-kusto-ingest/src/lib.rs +++ b/azure-kusto-ingest/src/lib.rs @@ -1,7 +1,7 @@ +pub mod data_format; pub mod descriptors; +pub(crate) mod ingestion_blob_info; pub mod ingestion_properties; pub mod queued_ingest; -pub(crate) mod result; pub(crate) mod resource_manager; -pub mod data_format; -pub(crate) mod ingestion_blob_info; \ No newline at end of file +pub(crate) mod result; diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index 63f5c7e..bc0e326 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -45,7 +45,10 @@ impl QueuedIngestClient { Self::new_with_client_options(kusto_client, QueuedIngestClientOptions::default()) } - pub fn new_with_client_options(kusto_client: KustoClient, options: QueuedIngestClientOptions) -> Self { + pub fn new_with_client_options( + kusto_client: KustoClient, + options: QueuedIngestClientOptions, + ) -> Self { let resource_manager = Arc::new(ResourceManager::new(kusto_client, options)); Self { resource_manager } diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index e168815..140ad37 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -19,7 +19,7 @@ use crate::queued_ingest::QueuedIngestClientOptions; use self::{ authorization_context::AuthorizationContext, cache::{Cached, Refreshing}, - resource_uri::{ResourceUri, ClientFromResourceUri}, + resource_uri::{ClientFromResourceUri, ResourceUri}, }; use self::ingest_client_resources::RawIngestClientResources; @@ -62,7 +62,7 @@ impl IngestClientResources { InnerIngestClientResources::new(), RESOURCE_REFRESH_PERIOD, ))), - client_options + client_options, } } @@ -76,17 +76,23 @@ impl IngestClientResources { RawIngestClientResources::try_from(table) } - fn create_clients_vec(resource_uris: &Vec, client_options: ClientOptions) -> Vec + fn create_clients_vec( + resource_uris: &Vec, + client_options: ClientOptions, + ) -> Vec where T: ClientFromResourceUri, { - resource_uris.iter().map(|uri| T::create_client(uri.clone(), client_options.clone())).collect() + resource_uris + .iter() + .map(|uri| T::create_client(uri.clone(), client_options.clone())) + .collect() } fn update_clients_vec( current_resources: Vec, resource_uris: Vec, - client_options: ClientOptions + client_options: ClientOptions, ) -> Vec where T: ClientFromResourceUri, @@ -124,11 +130,13 @@ impl IngestClientResources { let mut_resources = resources.get_mut(); mut_resources.kusto_response = Some(raw_ingest_client_resources.clone()); - + // This is ugly... the logic is to check whether we have already created clients previously, and if so, updating them mut_resources.secured_ready_for_aggregation_queues = Self::update_clients_vec( mut_resources.secured_ready_for_aggregation_queues.clone(), - raw_ingest_client_resources.secured_ready_for_aggregation_queues.clone(), + raw_ingest_client_resources + .secured_ready_for_aggregation_queues + .clone(), self.client_options.queue_service.clone(), ); mut_resources.temp_storage = Self::update_clients_vec( @@ -143,7 +151,9 @@ impl IngestClientResources { ); mut_resources.successful_ingestions_queues = Self::update_clients_vec( mut_resources.successful_ingestions_queues.clone(), - raw_ingest_client_resources.successful_ingestions_queues.clone(), + raw_ingest_client_resources + .successful_ingestions_queues + .clone(), self.client_options.queue_service.clone(), ); mut_resources.failed_ingestions_queues = Self::update_clients_vec( @@ -166,7 +176,7 @@ impl IngestClientResources { field_fn: F, create_client_vec_fn: Fx, set_value: Fy, - client_options: ClientOptions + client_options: ClientOptions, ) -> Result> where F: Fn(&InnerIngestClientResources) -> &Vec, @@ -194,7 +204,10 @@ impl IngestClientResources { // First time, so create the resources outside let mut_resources = resources.get_mut(); - let new_resources = Self::create_clients_vec(create_client_vec_fn(&raw_ingest_client_resources), client_options); + let new_resources = Self::create_clients_vec( + create_client_vec_fn(&raw_ingest_client_resources), + client_options, + ); set_value(mut_resources, &new_resources); Ok(new_resources) @@ -204,45 +217,58 @@ impl IngestClientResources { self.get_clients( |resources| &resources.secured_ready_for_aggregation_queues, |resources| &resources.secured_ready_for_aggregation_queues, - |mut_resources, new_resources| mut_resources.secured_ready_for_aggregation_queues = new_resources.clone(), + |mut_resources, new_resources| { + mut_resources.secured_ready_for_aggregation_queues = new_resources.clone() + }, self.client_options.queue_service.clone(), - ).await + ) + .await } - + pub async fn get_temp_storage(&self) -> Result> { self.get_clients( |resources| &resources.temp_storage, |resources| &resources.temp_storage, |mut_resources, new_resources| mut_resources.temp_storage = new_resources.clone(), self.client_options.blob_service.clone(), - ).await + ) + .await } - + pub async fn get_ingestions_status_tables(&self) -> Result> { self.get_clients( |resources| &resources.ingestions_status_tables, |resources| &resources.ingestions_status_tables, - |mut_resources, new_resources| mut_resources.ingestions_status_tables = new_resources.clone(), + |mut_resources, new_resources| { + mut_resources.ingestions_status_tables = new_resources.clone() + }, self.client_options.table_service.clone(), - ).await + ) + .await } - + pub async fn get_successful_ingestions_queues(&self) -> Result> { self.get_clients( |resources| &resources.successful_ingestions_queues, |resources| &resources.successful_ingestions_queues, - |mut_resources, new_resources| mut_resources.successful_ingestions_queues = new_resources.clone(), + |mut_resources, new_resources| { + mut_resources.successful_ingestions_queues = new_resources.clone() + }, self.client_options.queue_service.clone(), - ).await + ) + .await } - + pub async fn get_failed_ingestions_queues(&self) -> Result> { self.get_clients( |resources| &resources.failed_ingestions_queues, |resources| &resources.failed_ingestions_queues, - |mut_resources, new_resources| mut_resources.failed_ingestions_queues = new_resources.clone(), + |mut_resources, new_resources| { + mut_resources.failed_ingestions_queues = new_resources.clone() + }, self.client_options.queue_service.clone(), - ).await + ) + .await } } @@ -256,7 +282,10 @@ pub struct ResourceManager { impl ResourceManager { pub fn new(client: KustoClient, client_options: QueuedIngestClientOptions) -> Self { Self { - ingest_client_resources: Arc::new(IngestClientResources::new(client.clone(), client_options)), + ingest_client_resources: Arc::new(IngestClientResources::new( + client.clone(), + client_options, + )), authorization_context: Arc::new(AuthorizationContext::new(client)), } } @@ -272,7 +301,9 @@ impl ResourceManager { } pub async fn ingestions_status_tables(&self) -> Result> { - self.ingest_client_resources.get_ingestions_status_tables().await + self.ingest_client_resources + .get_ingestions_status_tables() + .await } pub async fn successful_ingestions_queues(&self) -> Result> { diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs index 397a990..f46b658 100644 --- a/azure-kusto-ingest/src/resource_manager/authorization_context.rs +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -62,7 +62,7 @@ impl AuthorizationContext { drop(auth_context_cache); let mut auth_context_cache = self.auth_context_cache.write().await; - // Again attempt to return from cache, check is done in case another thread + // Again attempt to return from cache, check is done in case another thread // refreshed the token while we were waiting on the write lock if let Some(inner_value) = auth_context_cache.get() { return Ok(inner_value.clone()); diff --git a/azure-kusto-ingest/src/resource_manager/cache.rs b/azure-kusto-ingest/src/resource_manager/cache.rs index b850a4e..03a5215 100644 --- a/azure-kusto-ingest/src/resource_manager/cache.rs +++ b/azure-kusto-ingest/src/resource_manager/cache.rs @@ -5,7 +5,7 @@ use std::{ use tokio::sync::RwLock; -/// Wrapper around a value that allows for storing when the value was last updated, +/// Wrapper around a value that allows for storing when the value was last updated, /// as well as the period after which it should be refreshed (i.e. expired) #[derive(Debug, Clone)] pub struct Cached { diff --git a/azure-kusto-ingest/src/resource_manager/resource_uri.rs b/azure-kusto-ingest/src/resource_manager/resource_uri.rs index c932053..b469ea5 100644 --- a/azure-kusto-ingest/src/resource_manager/resource_uri.rs +++ b/azure-kusto-ingest/src/resource_manager/resource_uri.rs @@ -53,7 +53,11 @@ impl TryFrom for ResourceUri { .to_string(); let sas_token = match parsed_uri.query() { Some(query) => query.to_string(), - None => return Err(anyhow::anyhow!("SAS token is missing in the URI as a query parameter")), + None => { + return Err(anyhow::anyhow!( + "SAS token is missing in the URI as a query parameter" + )) + } }; let sas_token = StorageCredentials::sas_token(sas_token)?; diff --git a/azure-kusto-ingest/src/result.rs b/azure-kusto-ingest/src/result.rs index 64fb47c..8a4338c 100644 --- a/azure-kusto-ingest/src/result.rs +++ b/azure-kusto-ingest/src/result.rs @@ -4,7 +4,7 @@ pub enum IngestionStatus { // The ingestion was queued. Queued, // The ingestion was successfully streamed - Success + Success, } // The result of an ingestion. @@ -18,7 +18,7 @@ pub struct IngestionResult { // The source id of the ingestion. source_id: Uuid, // The blob uri of the ingestion, if exists. - blob_uri: Option + blob_uri: Option, } impl IngestionResult { From ffecd4b15e0d2d96e89f69a34740f8dd3a1f60a3 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Wed, 23 Aug 2023 12:28:11 +0100 Subject: [PATCH 06/44] cargo clippy fixes exc. unused --- azure-kusto-ingest/src/data_format.rs | 7 +--- azure-kusto-ingest/src/descriptors.rs | 32 ++++++++++---- azure-kusto-ingest/src/ingestion_blob_info.rs | 8 ++-- azure-kusto-ingest/src/queued_ingest.rs | 17 ++------ azure-kusto-ingest/src/resource_manager.rs | 5 +-- .../src/resource_manager/resource_uri.rs | 42 +++++++------------ azure-kusto-ingest/src/result.rs | 8 ++-- 7 files changed, 53 insertions(+), 66 deletions(-) diff --git a/azure-kusto-ingest/src/data_format.rs b/azure-kusto-ingest/src/data_format.rs index 0a985e1..532e97c 100644 --- a/azure-kusto-ingest/src/data_format.rs +++ b/azure-kusto-ingest/src/data_format.rs @@ -1,8 +1,9 @@ use serde::Serialize; -#[derive(Serialize, Clone, Debug)] +#[derive(Serialize, Clone, Debug, Default)] pub enum IngestionMappingKind { #[serde(rename = "Csv")] + #[default] CSV, #[serde(rename = "Json")] JSON, @@ -57,10 +58,6 @@ pub enum DataFormat { } impl DataFormat { - pub fn default() -> Self { - DataFormat::CSV - } - pub fn ingestion_mapping_kind(self) -> IngestionMappingKind { match self { DataFormat::CSV => IngestionMappingKind::CSV, diff --git a/azure-kusto-ingest/src/descriptors.rs b/azure-kusto-ingest/src/descriptors.rs index d9a2b16..30077ac 100644 --- a/azure-kusto-ingest/src/descriptors.rs +++ b/azure-kusto-ingest/src/descriptors.rs @@ -1,19 +1,33 @@ -use std::{fmt::format, io::Read, path::PathBuf}; +use std::{io::Read, path::PathBuf}; -use azure_storage::StorageCredentials; +use azure_storage::shared_access_signature::SasToken; use url::Url; use uuid::Uuid; -#[derive(Clone, Debug)] pub enum BlobAuth { - SASToken(), - // adds `;managed_identity=` to the blob path + SASToken(Box), + /// adds `;managed_identity=` to the blob path UserAssignedManagedIdentity(String), - // adds `;managed_identity=system` to the blob path + /// adds `;managed_identity=system` to the blob path SystemAssignedManagedIdentity, } -#[derive(Clone, Debug)] +impl std::fmt::Debug for BlobAuth { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + BlobAuth::SASToken(_) => f.debug_struct("SASToken").finish(), + BlobAuth::UserAssignedManagedIdentity(object_id) => f + .debug_struct("UserAssignedManagedIdentity") + .field("object_id", object_id) + .finish(), + BlobAuth::SystemAssignedManagedIdentity => { + f.debug_struct("SystemAssignedManagedIdentity").finish() + } + } + } +} + +#[derive(Debug)] pub struct BlobDescriptor { uri: Url, pub(crate) size: Option, @@ -43,9 +57,9 @@ impl BlobDescriptor { pub fn uri(&self) -> String { match &self.blob_auth { - Some(BlobAuth::SASToken()) => { + Some(BlobAuth::SASToken(sas_token)) => { let mut uri = self.uri.clone(); - uri.set_query(Some("sas_token")); + uri.set_query(Some(sas_token.token().as_str())); uri.to_string() } Some(BlobAuth::UserAssignedManagedIdentity(object_id)) => { diff --git a/azure-kusto-ingest/src/ingestion_blob_info.rs b/azure-kusto-ingest/src/ingestion_blob_info.rs index a53d798..4659ae4 100644 --- a/azure-kusto-ingest/src/ingestion_blob_info.rs +++ b/azure-kusto-ingest/src/ingestion_blob_info.rs @@ -41,9 +41,9 @@ pub struct QueuedIngestionMessage { impl QueuedIngestionMessage { pub fn new( - blob_descriptor: BlobDescriptor, + blob_descriptor: &BlobDescriptor, ingestion_properties: &IngestionProperties, - auth_context: KustoIdentityToken, + authorization_context: KustoIdentityToken, ) -> Self { let additional_properties = AdditionalProperties { ingestion_mapping: None, @@ -59,13 +59,13 @@ impl QueuedIngestionMessage { tags: vec![], validation_policy: None, zip_pattern: None, - authorization_context: auth_context.clone(), + authorization_context, extra_additional_properties: HashMap::new(), }; Self { id: blob_descriptor.source_id, - blob_path: blob_descriptor.uri().to_string(), + blob_path: blob_descriptor.uri(), raw_data_size: blob_descriptor.size, database_name: ingestion_properties.database_name.clone(), table_name: ingestion_properties.table_name.clone(), diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index bc0e326..0996a79 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -17,14 +17,8 @@ pub struct QueuedIngestClientOptions { pub table_service: ClientOptions, } -impl QueuedIngestClientOptions { - pub fn new() -> Self { - Self::default() - } -} - impl From for QueuedIngestClientOptions { - /// Creates a `QueuedIngestClientOptions` struct where all the client options are the same + /// Creates a `QueuedIngestClientOptions` struct where the same [ClientOptions] are used for all services fn from(client_options: ClientOptions) -> Self { Self { queue_service: client_options.clone(), @@ -34,9 +28,9 @@ impl From for QueuedIngestClientOptions { } } +// The KustoClient is used to get the ingestion resources, it should be a client against the ingestion cluster endpoint #[derive(Clone)] pub struct QueuedIngestClient { - // The KustoClient is used to get the ingestion resources, it should be a client against the ingestion cluster endpoint resource_manager: Arc, } @@ -70,11 +64,8 @@ impl QueuedIngestClient { let auth_context = self.resource_manager.authorization_context().await?; // println!("auth_context: {:#?}\n", auth_context); - let message = QueuedIngestionMessage::new( - blob_descriptor.clone(), - ingestion_properties, - auth_context, - ); + let message = + QueuedIngestionMessage::new(&blob_descriptor, ingestion_properties, auth_context); // println!("message as struct: {:#?}\n", message); diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index 140ad37..556d4ce 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -76,10 +76,7 @@ impl IngestClientResources { RawIngestClientResources::try_from(table) } - fn create_clients_vec( - resource_uris: &Vec, - client_options: ClientOptions, - ) -> Vec + fn create_clients_vec(resource_uris: &[ResourceUri], client_options: ClientOptions) -> Vec where T: ClientFromResourceUri, { diff --git a/azure-kusto-ingest/src/resource_manager/resource_uri.rs b/azure-kusto-ingest/src/resource_manager/resource_uri.rs index b469ea5..0685149 100644 --- a/azure-kusto-ingest/src/resource_manager/resource_uri.rs +++ b/azure-kusto-ingest/src/resource_manager/resource_uri.rs @@ -10,17 +10,12 @@ use anyhow::Result; /// Parsing logic of resource URIs as returned by the Kusto management endpoint #[derive(Debug, Clone)] pub struct ResourceUri { - uri: String, service_uri: String, object_name: String, sas_token: StorageCredentials, } impl ResourceUri { - pub fn uri(&self) -> &str { - self.uri.as_str() - } - pub fn service_uri(&self) -> &str { self.service_uri.as_str() } @@ -38,9 +33,7 @@ impl TryFrom for ResourceUri { type Error = anyhow::Error; fn try_from(uri: String) -> Result { - println!("uri: {:#?}", uri); let parsed_uri = Url::parse(&uri)?; - println!("parsed_uri: {:#?}", parsed_uri); let service_uri = match parsed_uri.host_str() { Some(host_str) => parsed_uri.scheme().to_string() + "://" + host_str, @@ -49,7 +42,7 @@ impl TryFrom for ResourceUri { let object_name = parsed_uri .path() .trim_start() - .trim_start_matches("/") + .trim_start_matches('/') .to_string(); let sas_token = match parsed_uri.query() { Some(query) => query.to_string(), @@ -62,7 +55,6 @@ impl TryFrom for ResourceUri { let sas_token = StorageCredentials::sas_token(sas_token)?; Ok(Self { - uri, service_uri, object_name, sas_token, @@ -76,15 +68,13 @@ pub trait ClientFromResourceUri { impl ClientFromResourceUri for QueueClient { fn create_client(resource_uri: ResourceUri, client_options: ClientOptions) -> Self { - let queue_service = - QueueServiceClientBuilder::with_location(azure_storage::CloudLocation::Custom { - uri: resource_uri.service_uri().to_string(), - credentials: resource_uri.sas_token().clone(), - }) - .client_options(client_options) - .build(); - - queue_service.queue_client(resource_uri.object_name()) + QueueServiceClientBuilder::with_location(azure_storage::CloudLocation::Custom { + uri: resource_uri.service_uri().to_string(), + credentials: resource_uri.sas_token().clone(), + }) + .client_options(client_options) + .build() + .queue_client(resource_uri.object_name()) } } @@ -101,14 +91,12 @@ impl ClientFromResourceUri for ContainerClient { impl ClientFromResourceUri for TableClient { fn create_client(resource_uri: ResourceUri, client_options: ClientOptions) -> Self { - let table_service = - TableServiceClientBuilder::with_location(azure_storage::CloudLocation::Custom { - uri: resource_uri.service_uri().to_string(), - credentials: resource_uri.sas_token().clone(), - }) - .client_options(client_options) - .build(); - - table_service.table_client(resource_uri.object_name()) + TableServiceClientBuilder::with_location(azure_storage::CloudLocation::Custom { + uri: resource_uri.service_uri().to_string(), + credentials: resource_uri.sas_token().clone(), + }) + .client_options(client_options) + .build() + .table_client(resource_uri.object_name()) } } diff --git a/azure-kusto-ingest/src/result.rs b/azure-kusto-ingest/src/result.rs index 8a4338c..35a8dd3 100644 --- a/azure-kusto-ingest/src/result.rs +++ b/azure-kusto-ingest/src/result.rs @@ -24,15 +24,15 @@ pub struct IngestionResult { impl IngestionResult { pub fn new( status: IngestionStatus, - database: &String, - table: &String, + database: &str, + table: &str, source_id: Uuid, blob_uri: Option, ) -> Self { Self { status, - database: database.clone(), - table: table.clone(), + database: database.to_owned(), + table: table.to_owned(), source_id, blob_uri, } From 8f86a5f4762881c4c21204c924655f3bbad1c0aa Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Wed, 23 Aug 2023 15:38:53 +0100 Subject: [PATCH 07/44] minor improvements --- azure-kusto-ingest/Cargo.toml | 2 +- azure-kusto-ingest/src/data_format.rs | 21 +------ azure-kusto-ingest/src/ingestion_blob_info.rs | 59 ++++++++++--------- .../src/ingestion_properties.rs | 24 +++----- azure-kusto-ingest/src/queued_ingest.rs | 8 +-- .../resource_manager/authorization_context.rs | 2 +- 6 files changed, 47 insertions(+), 69 deletions(-) diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index e443f19..ca44670 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -20,5 +20,5 @@ azure_storage = "0.13" azure_storage_blobs = "0.13" azure_storage_queues = "0.13" azure_data_tables = "0.13" +chrono = { version = "0.4", features = ["serde"] } -time = { version = "0.3", features = ["serde"] } diff --git a/azure-kusto-ingest/src/data_format.rs b/azure-kusto-ingest/src/data_format.rs index 532e97c..bd47228 100644 --- a/azure-kusto-ingest/src/data_format.rs +++ b/azure-kusto-ingest/src/data_format.rs @@ -19,41 +19,26 @@ pub enum IngestionMappingKind { } /// All data formats supported by Kusto -#[derive(Serialize, Clone, Debug)] +#[derive(Serialize, Clone, Debug, Default)] +#[serde(rename_all = "lowercase")] pub enum DataFormat { - #[serde(rename = "apacheavro")] ApacheAvro, - #[serde(rename = "avro")] Avro, - #[serde(rename = "csv")] + #[default] CSV, - #[serde(rename = "json")] JSON, - #[serde(rename = "multijson")] MultiJSON, - #[serde(rename = "orc")] ORC, - #[serde(rename = "parquet")] Parquet, - #[serde(rename = "psv")] PSV, - #[serde(rename = "raw")] RAW, - #[serde(rename = "scsv")] SCSV, - #[serde(rename = "sohsv")] SOHsv, - #[serde(rename = "singlejson")] SingleJSON, - #[serde(rename = "sstream")] SStream, - #[serde(rename = "tsv")] TSV, - #[serde(rename = "tsve")] TSVe, - #[serde(rename = "txt")] TXT, - #[serde(rename = "w3clogfile")] W3CLOGFILE, } diff --git a/azure-kusto-ingest/src/ingestion_blob_info.rs b/azure-kusto-ingest/src/ingestion_blob_info.rs index 4659ae4..5293ee1 100644 --- a/azure-kusto-ingest/src/ingestion_blob_info.rs +++ b/azure-kusto-ingest/src/ingestion_blob_info.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; +use chrono::{DateTime, Utc}; use serde::Serialize; use crate::{ @@ -14,28 +15,32 @@ use crate::{ #[derive(Serialize, Debug)] #[serde(rename_all = "PascalCase")] pub struct QueuedIngestionMessage { + /// Message identifier (GUID) id: uuid::Uuid, + /// Path (URI) to the blob, including the SAS key granting permissions to read/write/delete it. + /// Permissions are required so that the ingestion service can delete the blob once it has completed ingesting the data. blob_path: String, + /// Target database name database_name: String, + /// Target table name table_name: String, + /// Size of the uncompressed data in bytes. Providing this value allows the ingestion service to optimize ingestion by potentially aggregating multiple blobs. This property is optional, but if not given, the service will access the blob just to retrieve the size. #[serde(skip_serializing_if = "Option::is_none")] raw_data_size: Option, + /// If set to `true`, the blob won't be deleted once ingestion is successfully completed. Default is `false` #[serde(skip_serializing_if = "Option::is_none")] retain_blob_on_success: Option, + /// If set to `true`, any aggregation will be skipped. Default is `false` #[serde(skip_serializing_if = "Option::is_none")] flush_immediately: Option, #[serde(skip_serializing_if = "Option::is_none")] ignore_size_limit: Option, // according to Go impl, the report level and method could be Option - report_level: ReportLevel, - report_method: ReportMethod, - // TODO: implement this - // #[serde(skip_serializing_if = "Option::is_none")]s - // #[serde(skip_serializing_if = "Option::is_none")] - // #[serde(with= "time::serde::iso8601")] - source_message_creation_time: String, - // The additional properties struct is modelled on: - // https://learn.microsoft.com/en-us/azure/data-explorer/ingestion-properties + #[serde(skip_serializing_if = "Option::is_none")] + report_level: Option, + #[serde(skip_serializing_if = "Option::is_none")] + report_method: Option, + source_message_creation_time: DateTime, additional_properties: AdditionalProperties, } @@ -74,8 +79,7 @@ impl QueuedIngestionMessage { report_level: ingestion_properties.report_level.clone(), report_method: ingestion_properties.report_method.clone(), ignore_size_limit: Some(false), - // TODO: configurability of creation time - source_message_creation_time: String::from("2023-08-16T13:30:04.639714"), + source_message_creation_time: Utc::now(), additional_properties, } } @@ -83,43 +87,42 @@ impl QueuedIngestionMessage { // The additional properties struct is modelled on: https://learn.microsoft.com/en-us/azure/data-explorer/ingestion-properties #[derive(Serialize, Clone, Debug)] -pub struct AdditionalProperties { +struct AdditionalProperties { #[serde(skip_serializing_if = "Option::is_none")] #[serde(rename = "ingestionMapping")] - pub ingestion_mapping: Option, + ingestion_mapping: Option, #[serde(skip_serializing_if = "Option::is_none")] #[serde(rename = "ingestionMappingReference")] - pub ingestion_mapping_reference: Option, + ingestion_mapping_reference: Option, #[serde(skip_serializing_if = "Option::is_none")] #[serde(rename = "creationTime")] - pub creation_time: Option, + creation_time: Option, #[serde(skip_serializing_if = "Option::is_none")] - pub extend_schema: Option, + extend_schema: Option, #[serde(skip_serializing_if = "Option::is_none")] - pub folder: Option, + folder: Option, #[serde(rename = "format")] - pub data_format: DataFormat, + data_format: DataFormat, #[serde(skip_serializing_if = "Option::is_none")] #[serde(rename = "ingestIfNotExists")] - pub ingest_if_not_exists: Option, + ingest_if_not_exists: Option, #[serde(skip_serializing_if = "Option::is_none")] #[serde(rename = "ignoreFirstRecord")] - pub ignore_first_record: Option, + ignore_first_record: Option, #[serde(skip_serializing_if = "Option::is_none")] - pub policy_ingestiontime: Option, + policy_ingestiontime: Option, #[serde(skip_serializing_if = "Option::is_none")] - pub recreate_schema: Option, + recreate_schema: Option, #[serde(skip_serializing_if = "Vec::is_empty")] - pub tags: Vec, + tags: Vec, #[serde(rename = "validationPolicy")] #[serde(skip_serializing_if = "Option::is_none")] - pub validation_policy: Option, + validation_policy: Option, #[serde(rename = "zipPattern")] #[serde(skip_serializing_if = "Option::is_none")] - pub zip_pattern: Option, - // TODO: the user shouldn't be able to set this, we should expose certain properties via IngestionProperties rather than just the AdditionalProperties struct + zip_pattern: Option, #[serde(rename = "authorizationContext")] - pub authorization_context: KustoIdentityToken, + authorization_context: KustoIdentityToken, #[serde(flatten)] - pub extra_additional_properties: HashMap, + extra_additional_properties: HashMap, } diff --git a/azure-kusto-ingest/src/ingestion_properties.rs b/azure-kusto-ingest/src/ingestion_properties.rs index ee573a8..563457e 100644 --- a/azure-kusto-ingest/src/ingestion_properties.rs +++ b/azure-kusto-ingest/src/ingestion_properties.rs @@ -2,7 +2,7 @@ use crate::data_format::{DataFormat, IngestionMappingKind}; use serde::Serialize; use serde_repr::Serialize_repr; -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Default)] pub struct IngestionProperties { pub database_name: String, pub table_name: String, @@ -19,19 +19,15 @@ pub struct IngestionProperties { pub drop_by_tags: Vec, pub flush_immediately: Option, pub ignore_first_record: bool, - pub report_level: ReportLevel, - pub report_method: ReportMethod, + pub report_level: Option, + pub report_method: Option, pub validation_policy: Option, - // TODO: don't expose AdditionalProperties to user... - // pub additional_properties: AdditionalProperties, - // pub additional_properties: AdditionalProperties, } #[derive(Serialize, Clone, Debug)] +#[serde(rename_all = "PascalCase")] pub struct ValidationPolicy { - #[serde(rename = "ValidationOptions")] validation_options: ValidationOptions, - #[serde(rename = "ValidationImplications")] validation_implications: ValidationImplications, } @@ -82,27 +78,21 @@ pub enum TransformationMethod { /// pre-created (it is recommended to create the mappings in advance and use ingestionMappingReference). /// To read more about mappings look here: https://docs.microsoft.com/en-us/azure/kusto/management/mappings #[derive(Serialize, Clone, Debug)] +#[serde(rename_all = "PascalCase")] pub struct ColumnMapping { - #[serde(rename = "Column")] column: String, // TODO: can this be an enum? - #[serde(rename = "DataType")] - datatype: String, - #[serde(rename = "Properties")] + data_type: String, properties: ColumnMappingProperties, } #[derive(Serialize, Clone, Debug)] +#[serde(rename_all = "PascalCase")] pub struct ColumnMappingProperties { - #[serde(rename = "Path")] path: Option, - #[serde(rename = "Transform")] transform: Option, - #[serde(rename = "Ordinal")] // TODO: This should get serialized to a string ordinal: Option, - #[serde(rename = "ConstValue")] const_value: Option, - #[serde(rename = "Field")] field: Option, } diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index 0996a79..76a0c9c 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -49,7 +49,7 @@ impl QueuedIngestClient { } pub async fn ingest_from_blob( - self, + &self, blob_descriptor: BlobDescriptor, ingestion_properties: &IngestionProperties, ) -> Result { @@ -93,7 +93,7 @@ impl QueuedIngestClient { } pub async fn ingest_from_file( - self, + &self, file_descriptor: FileDescriptor, ingestion_properties: IngestionProperties, ) -> Result { @@ -105,7 +105,7 @@ impl QueuedIngestClient { } pub async fn ingest_from_stream( - self, + &self, stream_descriptor: StreamDescriptor, ingestion_properties: IngestionProperties, ) -> Result { @@ -117,7 +117,7 @@ impl QueuedIngestClient { } async fn upload_from_different_descriptor( - self, + &self, descriptor: FileDescriptor, ingestion_properties: &IngestionProperties, ) -> Result { diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs index f46b658..eab99ea 100644 --- a/azure-kusto-ingest/src/resource_manager/authorization_context.rs +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -49,7 +49,7 @@ impl AuthorizationContext { Ok(kusto_identity_token) } - /// Fetches the latest Kusto identity token, either from the cache or by executing a KQL query + /// Fetches the latest Kusto identity token, either retrieving from cache if valid, or by executing a KQL query pub async fn get(&self) -> Result { // First, attempt to get the return the token from the cache let auth_context_cache = self.auth_context_cache.read().await; From a5ec8b8dabefcce478acc7927d14c3aff75e043e Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Wed, 23 Aug 2023 17:00:16 +0100 Subject: [PATCH 08/44] various minor improvements upon testing --- azure-kusto-ingest/src/descriptors.rs | 6 +-- azure-kusto-ingest/src/ingestion_blob_info.rs | 37 ++++++------------- .../src/ingestion_properties.rs | 2 + azure-kusto-ingest/src/queued_ingest.rs | 4 +- azure-kusto-ingest/src/result.rs | 3 ++ 5 files changed, 22 insertions(+), 30 deletions(-) diff --git a/azure-kusto-ingest/src/descriptors.rs b/azure-kusto-ingest/src/descriptors.rs index 30077ac..7468ea4 100644 --- a/azure-kusto-ingest/src/descriptors.rs +++ b/azure-kusto-ingest/src/descriptors.rs @@ -1,11 +1,11 @@ use std::{io::Read, path::PathBuf}; -use azure_storage::shared_access_signature::SasToken; use url::Url; use uuid::Uuid; pub enum BlobAuth { - SASToken(Box), + /// adds `?` to the blob path + SASToken(String), /// adds `;managed_identity=` to the blob path UserAssignedManagedIdentity(String), /// adds `;managed_identity=system` to the blob path @@ -59,7 +59,7 @@ impl BlobDescriptor { match &self.blob_auth { Some(BlobAuth::SASToken(sas_token)) => { let mut uri = self.uri.clone(); - uri.set_query(Some(sas_token.token().as_str())); + uri.set_query(Some(sas_token.as_str())); uri.to_string() } Some(BlobAuth::UserAssignedManagedIdentity(object_id)) => { diff --git a/azure-kusto-ingest/src/ingestion_blob_info.rs b/azure-kusto-ingest/src/ingestion_blob_info.rs index 5293ee1..e60c35c 100644 --- a/azure-kusto-ingest/src/ingestion_blob_info.rs +++ b/azure-kusto-ingest/src/ingestion_blob_info.rs @@ -50,22 +50,18 @@ impl QueuedIngestionMessage { ingestion_properties: &IngestionProperties, authorization_context: KustoIdentityToken, ) -> Self { + // TODO: processing of additional tags, ingest_by_tags, drop_by_tags into just tags + let additional_properties = AdditionalProperties { ingestion_mapping: None, ingestion_mapping_reference: None, - creation_time: None, - extend_schema: None, - folder: None, + creation_time: ingestion_properties.creation_time, data_format: ingestion_properties.data_format.clone(), ingest_if_not_exists: None, ignore_first_record: None, - policy_ingestiontime: None, - recreate_schema: None, tags: vec![], validation_policy: None, - zip_pattern: None, authorization_context, - extra_additional_properties: HashMap::new(), }; Self { @@ -88,41 +84,32 @@ impl QueuedIngestionMessage { // The additional properties struct is modelled on: https://learn.microsoft.com/en-us/azure/data-explorer/ingestion-properties #[derive(Serialize, Clone, Debug)] struct AdditionalProperties { + #[serde(rename = "authorizationContext")] + authorization_context: KustoIdentityToken, + #[serde(rename = "format")] + data_format: DataFormat, #[serde(skip_serializing_if = "Option::is_none")] #[serde(rename = "ingestionMapping")] ingestion_mapping: Option, #[serde(skip_serializing_if = "Option::is_none")] #[serde(rename = "ingestionMappingReference")] ingestion_mapping_reference: Option, + // TODO: is this required? + // #[serde(skip_serializing_if = "Option::is_none")] + // #[serde(rename = "ingestionMappingType")] + // ingestion_mapping_type: Option, #[serde(skip_serializing_if = "Option::is_none")] #[serde(rename = "creationTime")] - creation_time: Option, - #[serde(skip_serializing_if = "Option::is_none")] - extend_schema: Option, - #[serde(skip_serializing_if = "Option::is_none")] - folder: Option, - #[serde(rename = "format")] - data_format: DataFormat, + creation_time: Option>, #[serde(skip_serializing_if = "Option::is_none")] #[serde(rename = "ingestIfNotExists")] ingest_if_not_exists: Option, #[serde(skip_serializing_if = "Option::is_none")] #[serde(rename = "ignoreFirstRecord")] ignore_first_record: Option, - #[serde(skip_serializing_if = "Option::is_none")] - policy_ingestiontime: Option, - #[serde(skip_serializing_if = "Option::is_none")] - recreate_schema: Option, #[serde(skip_serializing_if = "Vec::is_empty")] tags: Vec, #[serde(rename = "validationPolicy")] #[serde(skip_serializing_if = "Option::is_none")] validation_policy: Option, - #[serde(rename = "zipPattern")] - #[serde(skip_serializing_if = "Option::is_none")] - zip_pattern: Option, - #[serde(rename = "authorizationContext")] - authorization_context: KustoIdentityToken, - #[serde(flatten)] - extra_additional_properties: HashMap, } diff --git a/azure-kusto-ingest/src/ingestion_properties.rs b/azure-kusto-ingest/src/ingestion_properties.rs index 563457e..5d4849a 100644 --- a/azure-kusto-ingest/src/ingestion_properties.rs +++ b/azure-kusto-ingest/src/ingestion_properties.rs @@ -1,4 +1,5 @@ use crate::data_format::{DataFormat, IngestionMappingKind}; +use chrono::{DateTime, Utc}; use serde::Serialize; use serde_repr::Serialize_repr; @@ -22,6 +23,7 @@ pub struct IngestionProperties { pub report_level: Option, pub report_method: Option, pub validation_policy: Option, + pub creation_time: Option>, } #[derive(Serialize, Clone, Debug)] diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index 76a0c9c..015f034 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -51,7 +51,7 @@ impl QueuedIngestClient { pub async fn ingest_from_blob( &self, blob_descriptor: BlobDescriptor, - ingestion_properties: &IngestionProperties, + ingestion_properties: IngestionProperties, ) -> Result { // The queues returned here should ideally be the storage queue client from azure-storage-queue // As such, it may be better for ResourceManager to return a struct that contains the storage queue client @@ -65,7 +65,7 @@ impl QueuedIngestClient { // println!("auth_context: {:#?}\n", auth_context); let message = - QueuedIngestionMessage::new(&blob_descriptor, ingestion_properties, auth_context); + QueuedIngestionMessage::new(&blob_descriptor, &ingestion_properties, auth_context); // println!("message as struct: {:#?}\n", message); diff --git a/azure-kusto-ingest/src/result.rs b/azure-kusto-ingest/src/result.rs index 35a8dd3..f5bbc0d 100644 --- a/azure-kusto-ingest/src/result.rs +++ b/azure-kusto-ingest/src/result.rs @@ -1,5 +1,6 @@ use uuid::Uuid; +#[derive(Debug)] pub enum IngestionStatus { // The ingestion was queued. Queued, @@ -7,7 +8,9 @@ pub enum IngestionStatus { Success, } + // The result of an ingestion. +#[derive(Debug)] pub struct IngestionResult { // Will be `Queued` if the ingestion is queued, or `Success` if the ingestion is streaming and successful. status: IngestionStatus, From ede9088dc469214a58e89de3d5033b6e3fe04276 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Thu, 24 Aug 2023 16:02:09 +0100 Subject: [PATCH 09/44] improvements --- azure-kusto-ingest/src/data_format.rs | 1 + azure-kusto-ingest/src/descriptors.rs | 72 ++++----- azure-kusto-ingest/src/ingestion_blob_info.rs | 4 +- .../src/ingestion_properties.rs | 11 +- azure-kusto-ingest/src/queued_ingest.rs | 79 +++------- azure-kusto-ingest/src/resource_manager.rs | 137 +++++++++--------- .../resource_manager/authorization_context.rs | 15 +- .../ingest_client_resources.rs | 42 ++++-- .../src/resource_manager/resource_uri.rs | 7 +- azure-kusto-ingest/src/result.rs | 1 - 10 files changed, 184 insertions(+), 185 deletions(-) diff --git a/azure-kusto-ingest/src/data_format.rs b/azure-kusto-ingest/src/data_format.rs index bd47228..03a42aa 100644 --- a/azure-kusto-ingest/src/data_format.rs +++ b/azure-kusto-ingest/src/data_format.rs @@ -19,6 +19,7 @@ pub enum IngestionMappingKind { } /// All data formats supported by Kusto +/// Default is [DataFormat::CSV] #[derive(Serialize, Clone, Debug, Default)] #[serde(rename_all = "lowercase")] pub enum DataFormat { diff --git a/azure-kusto-ingest/src/descriptors.rs b/azure-kusto-ingest/src/descriptors.rs index 7468ea4..89cde58 100644 --- a/azure-kusto-ingest/src/descriptors.rs +++ b/azure-kusto-ingest/src/descriptors.rs @@ -1,8 +1,8 @@ -use std::{io::Read, path::PathBuf}; - use url::Url; use uuid::Uuid; +/// Helper enum for authentication information on a blob +#[derive(Clone)] pub enum BlobAuth { /// adds `?` to the blob path SASToken(String), @@ -27,11 +27,13 @@ impl std::fmt::Debug for BlobAuth { } } -#[derive(Debug)] +/// Encapsulates the information related to a blob that is required to ingest from a blob +#[derive(Debug, Clone)] pub struct BlobDescriptor { uri: Url, pub(crate) size: Option, pub(crate) source_id: Uuid, + /// Authentication information for the blob; when [None], the uri is passed through as is blob_auth: Option, } @@ -50,11 +52,13 @@ impl BlobDescriptor { } } + /// Mutator to modify the authentication information of the BlobDescriptor pub fn with_blob_auth(mut self, blob_auth: BlobAuth) -> Self { self.blob_auth = Some(blob_auth); self } + /// Returns the uri with the authentication information added pub fn uri(&self) -> String { match &self.blob_auth { Some(BlobAuth::SASToken(sas_token)) => { @@ -73,40 +77,36 @@ impl BlobDescriptor { } } -#[derive(Clone, Debug)] -pub struct FileDescriptor { - pub path: PathBuf, - pub size: Option, - pub source_id: Uuid, -} +// #[derive(Clone, Debug)] +// pub struct FileDescriptor { +// pub path: PathBuf, +// pub size: Option, +// pub source_id: Uuid, +// } -impl FileDescriptor { - pub fn new(path: PathBuf, size: Option, source_id: Option) -> Self { - unimplemented!() - } -} +// impl FileDescriptor { +// pub fn new(path: PathBuf, size: Option, source_id: Option) -> Self { +// unimplemented!() +// } +// } // #[derive(Clone, Debug)] -pub struct StreamDescriptor { - stream: Box, - size: Option, - source_id: Uuid, - compressed: bool, - stream_name: String, -} - -impl StreamDescriptor { - pub fn new( - stream: Box, - size: Option, - source_id: Option, - compressed: bool, - stream_name: String, - ) -> Self { - unimplemented!() - } +// pub struct StreamDescriptor { +// stream: Box, +// size: Option, +// source_id: Uuid, +// compressed: bool, +// stream_name: String, +// } - pub fn from_file_descriptor(file_descriptor: FileDescriptor) -> Self { - unimplemented!() - } -} +// impl StreamDescriptor { +// pub fn new( +// stream: Box, +// size: Option, +// source_id: Option, +// compressed: bool, +// stream_name: String, +// ) -> Self { +// unimplemented!() +// } +// } diff --git a/azure-kusto-ingest/src/ingestion_blob_info.rs b/azure-kusto-ingest/src/ingestion_blob_info.rs index e60c35c..463b8ba 100644 --- a/azure-kusto-ingest/src/ingestion_blob_info.rs +++ b/azure-kusto-ingest/src/ingestion_blob_info.rs @@ -1,5 +1,3 @@ -use std::collections::HashMap; - use chrono::{DateTime, Utc}; use serde::Serialize; @@ -58,7 +56,7 @@ impl QueuedIngestionMessage { creation_time: ingestion_properties.creation_time, data_format: ingestion_properties.data_format.clone(), ingest_if_not_exists: None, - ignore_first_record: None, + ignore_first_record: ingestion_properties.ignore_first_record, tags: vec![], validation_policy: None, authorization_context, diff --git a/azure-kusto-ingest/src/ingestion_properties.rs b/azure-kusto-ingest/src/ingestion_properties.rs index 5d4849a..bb86dda 100644 --- a/azure-kusto-ingest/src/ingestion_properties.rs +++ b/azure-kusto-ingest/src/ingestion_properties.rs @@ -3,14 +3,18 @@ use chrono::{DateTime, Utc}; use serde::Serialize; use serde_repr::Serialize_repr; +/// Properties used when ingesting data into Kusto, allowing for customisation of the ingestion process #[derive(Clone, Debug, Default)] pub struct IngestionProperties { + /// Name of the database to ingest into pub database_name: String, + /// Name of the table to ingest into pub table_name: String, + /// Whether the blob is retained after ingestion, note that this requires extra permissions pub retain_blob_on_success: Option, + /// Format of the data being ingested pub data_format: DataFormat, - // I think we could make this neater by using some enum wizardry to enforce certain checks that are being done currently - // I'm thinking of something like we give an ingestion mapping enum, with + // TODO: ingestion mappings could likely be made neater by using enums to enforce checks pub ingestion_mapping: Option>, pub ingestion_mapping_type: Option, pub ingestion_mapping_reference: Option>, @@ -19,10 +23,11 @@ pub struct IngestionProperties { pub ingest_by_tags: Vec, pub drop_by_tags: Vec, pub flush_immediately: Option, - pub ignore_first_record: bool, + pub ignore_first_record: Option, pub report_level: Option, pub report_method: Option, pub validation_policy: Option, + /// Allows for configurability of the `creationTime` property pub creation_time: Option>, } diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index 015f034..3876eb1 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -4,12 +4,13 @@ use anyhow::Result; use azure_core::{base64, ClientOptions}; use azure_kusto_data::prelude::KustoClient; -use crate::descriptors::{BlobDescriptor, FileDescriptor, StreamDescriptor}; +use crate::descriptors::BlobDescriptor; use crate::ingestion_blob_info::QueuedIngestionMessage; use crate::ingestion_properties::IngestionProperties; use crate::resource_manager::ResourceManager; use crate::result::{IngestionResult, IngestionStatus}; +/// Allows configurability of ClientOptions for the storage clients used within [QueuedIngestClient] #[derive(Clone, Default)] pub struct QueuedIngestClientOptions { pub queue_service: ClientOptions, @@ -28,17 +29,20 @@ impl From for QueuedIngestClientOptions { } } -// The KustoClient is used to get the ingestion resources, it should be a client against the ingestion cluster endpoint +/// Client for ingesting data into Kusto using the queued flavour of ingestion #[derive(Clone)] pub struct QueuedIngestClient { resource_manager: Arc, } impl QueuedIngestClient { + /// Creates a new client from the given [KustoClient] pub fn new(kusto_client: KustoClient) -> Self { Self::new_with_client_options(kusto_client, QueuedIngestClientOptions::default()) } + /// Creates a new client from the given [KustoClient] and [QueuedIngestClientOptions] + /// This allows for customisation of the [ClientOptions] used for the storage clients pub fn new_with_client_options( kusto_client: KustoClient, options: QueuedIngestClientOptions, @@ -48,6 +52,7 @@ impl QueuedIngestClient { Self { resource_manager } } + /// Ingest a file into Kusto from Azure Blob Storage pub async fn ingest_from_blob( &self, blob_descriptor: BlobDescriptor, @@ -92,57 +97,21 @@ impl QueuedIngestClient { )) } - pub async fn ingest_from_file( - &self, - file_descriptor: FileDescriptor, - ingestion_properties: IngestionProperties, - ) -> Result { - unimplemented!() - // This function needs to upload the blob from the file, and then call on ingest_from_blob - - // self.ingest_from_blob(blob_descriptor, &ingestion_properties) - // .await - } - - pub async fn ingest_from_stream( - &self, - stream_descriptor: StreamDescriptor, - ingestion_properties: IngestionProperties, - ) -> Result { - unimplemented!() - // This function needs to upload the blob from the stream, and then call on ingest_from_blob - - // self.ingest_from_blob(blob_descriptor, &ingestion_properties) - // .await - } - - async fn upload_from_different_descriptor( - &self, - descriptor: FileDescriptor, - ingestion_properties: &IngestionProperties, - ) -> Result { - unimplemented!() - // WIP - // let blob_name = format!( - // "{database_name}_{table_name}_{source_id}_{stream_name}", - // database_name = ingestion_properties.database_name, - // table_name = ingestion_properties.table_name, - // source_id = descriptor.source_id, - // stream_name = descriptor.stream_name.to_str().unwrap().to_string() - // ); - - // let container_clients = self.resource_manager.temp_storage().await?; - // // TODO: pick a random container client from the container clients returned by the resource manager - // let container_client = container_clients.first().unwrap().clone(); - // let blob_client = container_client.blob_client(blob_name); - - // blob_client.put_block_blob(body) - - // blob_url = ""; - - // Ok(BlobDescriptor::new( - // blob_url, - // ingestion_properties.source_id, - // )) - } + // /// Ingest a local file into Kusto + // pub async fn ingest_from_file( + // &self, + // file_descriptor: FileDescriptor, + // ingestion_properties: IngestionProperties, + // ) -> Result { + // unimplemented!() + // } + + // /// Ingest a stream into Kusto + // pub async fn ingest_from_stream( + // &self, + // stream_descriptor: StreamDescriptor, + // ingestion_properties: IngestionProperties, + // ) -> Result { + // unimplemented!() + // } } diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index 556d4ce..57fdafc 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -67,11 +67,20 @@ impl IngestClientResources { } // TODO: Logic to get the Kusto identity token from Kusto management endpoint - handle any validation of the response from the query here + /// Executes a KQL management query that retrieves resource URIs for the various Azure resources used for ingestion async fn execute_kql_mgmt_query(client: KustoClient) -> Result { let results = client .execute_command("NetDefaultDB", ".get ingestion resources", None) .await?; - let table = results.tables.first().unwrap(); + + let table = match results.tables.first() { + Some(a) => a, + None => { + return Err(anyhow::anyhow!( + "Kusto expected a table containing ingestion resource results, found no tables", + )) + } + }; RawIngestClientResources::try_from(table) } @@ -107,6 +116,7 @@ impl IngestClientResources { // 4. Return the kusto response // As such, at any one time it is guaranteed that anything that has been queried before will be available and up to date // Anything that has not been queried before will be available to create, but not as Azure clients until explicitly queried + /// async fn update_from_kusto(&self) -> Result { let resources = self.resources.read().await; if !resources.is_expired() { @@ -128,7 +138,6 @@ impl IngestClientResources { mut_resources.kusto_response = Some(raw_ingest_client_resources.clone()); - // This is ugly... the logic is to check whether we have already created clients previously, and if so, updating them mut_resources.secured_ready_for_aggregation_queues = Self::update_clients_vec( mut_resources.secured_ready_for_aggregation_queues.clone(), raw_ingest_client_resources @@ -222,61 +231,63 @@ impl IngestClientResources { .await } - pub async fn get_temp_storage(&self) -> Result> { - self.get_clients( - |resources| &resources.temp_storage, - |resources| &resources.temp_storage, - |mut_resources, new_resources| mut_resources.temp_storage = new_resources.clone(), - self.client_options.blob_service.clone(), - ) - .await - } + // pub async fn get_temp_storage(&self) -> Result> { + // self.get_clients( + // |resources| &resources.temp_storage, + // |resources| &resources.temp_storage, + // |mut_resources, new_resources| mut_resources.temp_storage = new_resources.clone(), + // self.client_options.blob_service.clone(), + // ) + // .await + // } - pub async fn get_ingestions_status_tables(&self) -> Result> { - self.get_clients( - |resources| &resources.ingestions_status_tables, - |resources| &resources.ingestions_status_tables, - |mut_resources, new_resources| { - mut_resources.ingestions_status_tables = new_resources.clone() - }, - self.client_options.table_service.clone(), - ) - .await - } + // pub async fn get_ingestions_status_tables(&self) -> Result> { + // self.get_clients( + // |resources| &resources.ingestions_status_tables, + // |resources| &resources.ingestions_status_tables, + // |mut_resources, new_resources| { + // mut_resources.ingestions_status_tables = new_resources.clone() + // }, + // self.client_options.table_service.clone(), + // ) + // .await + // } - pub async fn get_successful_ingestions_queues(&self) -> Result> { - self.get_clients( - |resources| &resources.successful_ingestions_queues, - |resources| &resources.successful_ingestions_queues, - |mut_resources, new_resources| { - mut_resources.successful_ingestions_queues = new_resources.clone() - }, - self.client_options.queue_service.clone(), - ) - .await - } + // pub async fn get_successful_ingestions_queues(&self) -> Result> { + // self.get_clients( + // |resources| &resources.successful_ingestions_queues, + // |resources| &resources.successful_ingestions_queues, + // |mut_resources, new_resources| { + // mut_resources.successful_ingestions_queues = new_resources.clone() + // }, + // self.client_options.queue_service.clone(), + // ) + // .await + // } - pub async fn get_failed_ingestions_queues(&self) -> Result> { - self.get_clients( - |resources| &resources.failed_ingestions_queues, - |resources| &resources.failed_ingestions_queues, - |mut_resources, new_resources| { - mut_resources.failed_ingestions_queues = new_resources.clone() - }, - self.client_options.queue_service.clone(), - ) - .await - } + // pub async fn get_failed_ingestions_queues(&self) -> Result> { + // self.get_clients( + // |resources| &resources.failed_ingestions_queues, + // |resources| &resources.failed_ingestions_queues, + // |mut_resources, new_resources| { + // mut_resources.failed_ingestions_queues = new_resources.clone() + // }, + // self.client_options.queue_service.clone(), + // ) + // .await + // } } pub type KustoIdentityToken = String; +/// ResourceManager is a struct that keeps track of all the resources required for ingestion using the queued flavour pub struct ResourceManager { ingest_client_resources: Arc, authorization_context: Arc, } impl ResourceManager { + /// Creates a new ResourceManager from the given [KustoClient] and the [QueuedIngestClientOptions] as provided by the user pub fn new(client: KustoClient, client_options: QueuedIngestClientOptions) -> Self { Self { ingest_client_resources: Arc::new(IngestClientResources::new( @@ -293,30 +304,26 @@ impl ResourceManager { .await } - pub async fn temp_storage(&self) -> Result> { - self.ingest_client_resources.get_temp_storage().await - } - - pub async fn ingestions_status_tables(&self) -> Result> { - self.ingest_client_resources - .get_ingestions_status_tables() - .await - } + // pub async fn temp_storage(&self) -> Result> { + // self.ingest_client_resources.get_temp_storage().await + // } - pub async fn successful_ingestions_queues(&self) -> Result> { - self.ingest_client_resources - .get_successful_ingestions_queues() - .await - } + // pub async fn ingestions_status_tables(&self) -> Result> { + // self.ingest_client_resources + // .get_ingestions_status_tables() + // .await + // } - pub async fn failed_ingestions_queues(&self) -> Result> { - self.ingest_client_resources - .get_failed_ingestions_queues() - .await - } + // pub async fn successful_ingestions_queues(&self) -> Result> { + // self.ingest_client_resources + // .get_successful_ingestions_queues() + // .await + // } - // pub fn retrieve_service_type(self) -> ServiceType { - // unimplemented!() + // pub async fn failed_ingestions_queues(&self) -> Result> { + // self.ingest_client_resources + // .get_failed_ingestions_queues() + // .await // } pub async fn authorization_context(&self) -> Result { diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs index eab99ea..5b1ccbc 100644 --- a/azure-kusto-ingest/src/resource_manager/authorization_context.rs +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -9,7 +9,7 @@ use super::RESOURCE_REFRESH_PERIOD; pub type KustoIdentityToken = String; -/// Logic to obtain a Kusto identity token from the management endpoint +/// Logic to obtain a Kusto identity token from the management endpoint. This auth token is a temporary token #[derive(Debug, Clone)] pub struct AuthorizationContext { client: KustoClient, @@ -30,8 +30,17 @@ impl AuthorizationContext { .execute_command("NetDefaultDB", ".get kusto identity token", None) .await?; - // TODO: improve validation checks here - let table = results.tables.first().unwrap(); + let table = match &results.tables[..] { + [a] => a, + _ => { + return Err(anyhow::anyhow!( + "Kusto Expected 1 table in results, found {}", + results.tables.len() + )) + } + }; + + // TODO: add more validation here let kusto_identity_token = table .rows .first() diff --git a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs index 000356b..de779d0 100644 --- a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs +++ b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs @@ -12,33 +12,42 @@ pub struct RawIngestClientResources { } impl RawIngestClientResources { - fn get_resource_by_name(table: &TableV1, resource_name: String) -> Result> { + fn get_resource_by_name(table: &TableV1, resource_name: String, err_if_not_found: bool) -> Result> { let storage_root_index = table .columns .iter() .position(|c| c.column_name == "StorageRoot") - .unwrap(); + .ok_or(anyhow::anyhow!( + "StorageRoot column is missing in the table" + ))?; + let resource_type_name_index = table .columns .iter() .position(|c| c.column_name == "ResourceTypeName") - .unwrap(); + .ok_or(anyhow::anyhow!( + "ResourceTypeName column is missing in the table" + ))?; - let resource_uris: Result> = table + let resource_uris: Vec> = table .rows .iter() .filter(|r| r[resource_type_name_index] == resource_name) .map(|r| { - ResourceUri::try_from( - r[storage_root_index] - .as_str() - .expect("We should get result here") - .to_string(), - ) + ResourceUri::try_from(r[storage_root_index].as_str().ok_or(anyhow::anyhow!( + "Response returned from Kusto could not be parsed as a string" + ))?) }) .collect(); + + if err_if_not_found && resource_uris.is_empty() { + return Err(anyhow::anyhow!( + "No {} resources found in the table", + resource_name + )); + } - resource_uris + resource_uris.into_iter().collect() } } @@ -46,15 +55,16 @@ impl TryFrom<&TableV1> for RawIngestClientResources { type Error = anyhow::Error; fn try_from(table: &TableV1) -> std::result::Result { + println!("table: {:?}", table); let secured_ready_for_aggregation_queues = - Self::get_resource_by_name(table, "SecuredReadyForAggregationQueue".to_string())?; + Self::get_resource_by_name(table, "SecuredReadyForAggregationQueue".to_string(), true)?; let failed_ingestions_queues = - Self::get_resource_by_name(table, "FailedIngestionsQueue".to_string())?; + Self::get_resource_by_name(table, "FailedIngestionsQueue".to_string(), true)?; let successful_ingestions_queues = - Self::get_resource_by_name(table, "SuccessfulIngestionsQueue".to_string())?; - let temp_storage = Self::get_resource_by_name(table, "TempStorage".to_string())?; + Self::get_resource_by_name(table, "SuccessfulIngestionsQueue".to_string(), true)?; + let temp_storage = Self::get_resource_by_name(table, "TempStorage".to_string(), true)?; let ingestions_status_tables = - Self::get_resource_by_name(table, "IngestionsStatusTable".to_string())?; + Self::get_resource_by_name(table, "IngestionsStatusTable".to_string(), true)?; Ok(Self { secured_ready_for_aggregation_queues, diff --git a/azure-kusto-ingest/src/resource_manager/resource_uri.rs b/azure-kusto-ingest/src/resource_manager/resource_uri.rs index 0685149..ae536b1 100644 --- a/azure-kusto-ingest/src/resource_manager/resource_uri.rs +++ b/azure-kusto-ingest/src/resource_manager/resource_uri.rs @@ -29,11 +29,11 @@ impl ResourceUri { } } -impl TryFrom for ResourceUri { +impl TryFrom<&str> for ResourceUri { type Error = anyhow::Error; - fn try_from(uri: String) -> Result { - let parsed_uri = Url::parse(&uri)?; + fn try_from(uri: &str) -> Result { + let parsed_uri = Url::parse(uri)?; let service_uri = match parsed_uri.host_str() { Some(host_str) => parsed_uri.scheme().to_string() + "://" + host_str, @@ -62,6 +62,7 @@ impl TryFrom for ResourceUri { } } +/// Trait to be used to create an Azure client from a resource URI with configurability of ClientOptions pub trait ClientFromResourceUri { fn create_client(resource_uri: ResourceUri, client_options: ClientOptions) -> Self; } diff --git a/azure-kusto-ingest/src/result.rs b/azure-kusto-ingest/src/result.rs index f5bbc0d..1391dcf 100644 --- a/azure-kusto-ingest/src/result.rs +++ b/azure-kusto-ingest/src/result.rs @@ -8,7 +8,6 @@ pub enum IngestionStatus { Success, } - // The result of an ingestion. #[derive(Debug)] pub struct IngestionResult { From 60545207a6b5dc31afaa1ee7e44c617f0c4c8f4e Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Fri, 25 Aug 2023 15:44:09 +0100 Subject: [PATCH 10/44] improve validation in authorization context --- .../resource_manager/authorization_context.rs | 36 +++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs index 5b1ccbc..f1fd600 100644 --- a/azure-kusto-ingest/src/resource_manager/authorization_context.rs +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -12,7 +12,9 @@ pub type KustoIdentityToken = String; /// Logic to obtain a Kusto identity token from the management endpoint. This auth token is a temporary token #[derive(Debug, Clone)] pub struct AuthorizationContext { + /// A client against a Kusto ingestion cluster client: KustoClient, + /// Cache of the Kusto identity token auth_context_cache: Refreshing>, } @@ -30,6 +32,7 @@ impl AuthorizationContext { .execute_command("NetDefaultDB", ".get kusto identity token", None) .await?; + // Check that there is only 1 table in the results returned by the query let table = match &results.tables[..] { [a] => a, _ => { @@ -40,16 +43,28 @@ impl AuthorizationContext { } }; - // TODO: add more validation here - let kusto_identity_token = table - .rows - .first() - .unwrap() - .first() - .unwrap() - .as_str() - .unwrap() - .to_string(); + // Check that a column in this table actually exists called `AuthorizationContext` + let index = table + .columns + .iter() + .position(|c| c.column_name == "AuthorizationContext") + .ok_or(anyhow::anyhow!( + "AuthorizationContext column is missing in the table" + ))?; + + // Check that there is only 1 row in the table, and that the value in the first row at the given index is not empty + let kusto_identity_token = match &table.rows[..] { + [row] => row.get(index).ok_or(anyhow::anyhow!( + "Kusto response did not contain a value in the first row at position {}", + index + ))?, + _ => { + return Err(anyhow::anyhow!( + "Kusto Expected 1 row in results, found {}", + table.rows.len() + )) + } + }.to_string(); if kusto_identity_token.chars().all(char::is_whitespace) { return Err(anyhow::anyhow!("Kusto identity token is empty")); @@ -77,6 +92,7 @@ impl AuthorizationContext { return Ok(inner_value.clone()); } + // Fetch new token from Kusto, update the cache, and return the token let token = Self::execute_kql_mgmt_query(self.client.clone()).await?; auth_context_cache.update(Some(token.clone())); From 82ee449626880f1b5f9ac25fb99051ae54ad7966 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Fri, 25 Aug 2023 15:56:46 +0100 Subject: [PATCH 11/44] move client options to separate file + other improvements --- azure-kusto-ingest/src/client_options.rs | 61 +++++++++++++++++++ azure-kusto-ingest/src/lib.rs | 1 + azure-kusto-ingest/src/queued_ingest.rs | 25 ++------ azure-kusto-ingest/src/resource_manager.rs | 2 +- .../resource_manager/authorization_context.rs | 3 +- .../ingest_client_resources.rs | 10 ++- 6 files changed, 77 insertions(+), 25 deletions(-) create mode 100644 azure-kusto-ingest/src/client_options.rs diff --git a/azure-kusto-ingest/src/client_options.rs b/azure-kusto-ingest/src/client_options.rs new file mode 100644 index 0000000..bb880c6 --- /dev/null +++ b/azure-kusto-ingest/src/client_options.rs @@ -0,0 +1,61 @@ +use azure_core::ClientOptions; + +/// Allows configurability of ClientOptions for the storage clients used within [QueuedIngestClient](crate::queued_ingest::QueuedIngestClient) +#[derive(Clone, Default)] +pub struct QueuedIngestClientOptions { + pub queue_service: ClientOptions, + pub blob_service: ClientOptions, + pub table_service: ClientOptions, +} + +impl From for QueuedIngestClientOptions { + /// Creates a `QueuedIngestClientOptions` struct where the same [ClientOptions] are used for all services + fn from(client_options: ClientOptions) -> Self { + Self { + queue_service: client_options.clone(), + blob_service: client_options.clone(), + table_service: client_options, + } + } +} + +/// Builder for [QueuedIngestClientOptions], call `build()` to create the [QueuedIngestClientOptions] +#[derive(Clone, Default)] +pub struct QueuedIngestClientOptionsBuilder { + queue_service: ClientOptions, + blob_service: ClientOptions, + table_service: ClientOptions, +} + +impl QueuedIngestClientOptionsBuilder { + pub fn new() -> Self { + Self { + queue_service: ClientOptions::default(), + blob_service: ClientOptions::default(), + table_service: ClientOptions::default(), + } + } + + pub fn with_queue_service(mut self, queue_service: ClientOptions) -> Self { + self.queue_service = queue_service; + self + } + + pub fn with_blob_service(mut self, blob_service: ClientOptions) -> Self { + self.blob_service = blob_service; + self + } + + pub fn with_table_service(mut self, table_service: ClientOptions) -> Self { + self.table_service = table_service; + self + } + + pub fn build(self) -> QueuedIngestClientOptions { + QueuedIngestClientOptions { + queue_service: self.queue_service, + blob_service: self.blob_service, + table_service: self.table_service, + } + } +} diff --git a/azure-kusto-ingest/src/lib.rs b/azure-kusto-ingest/src/lib.rs index d117f44..4cb9ce8 100644 --- a/azure-kusto-ingest/src/lib.rs +++ b/azure-kusto-ingest/src/lib.rs @@ -1,3 +1,4 @@ +pub mod client_options; pub mod data_format; pub mod descriptors; pub(crate) mod ingestion_blob_info; diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index 3876eb1..861f490 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -1,34 +1,16 @@ use std::sync::Arc; use anyhow::Result; -use azure_core::{base64, ClientOptions}; +use azure_core::base64; use azure_kusto_data::prelude::KustoClient; +use crate::client_options::QueuedIngestClientOptions; use crate::descriptors::BlobDescriptor; use crate::ingestion_blob_info::QueuedIngestionMessage; use crate::ingestion_properties::IngestionProperties; use crate::resource_manager::ResourceManager; use crate::result::{IngestionResult, IngestionStatus}; -/// Allows configurability of ClientOptions for the storage clients used within [QueuedIngestClient] -#[derive(Clone, Default)] -pub struct QueuedIngestClientOptions { - pub queue_service: ClientOptions, - pub blob_service: ClientOptions, - pub table_service: ClientOptions, -} - -impl From for QueuedIngestClientOptions { - /// Creates a `QueuedIngestClientOptions` struct where the same [ClientOptions] are used for all services - fn from(client_options: ClientOptions) -> Self { - Self { - queue_service: client_options.clone(), - blob_service: client_options.clone(), - table_service: client_options, - } - } -} - /// Client for ingesting data into Kusto using the queued flavour of ingestion #[derive(Clone)] pub struct QueuedIngestClient { @@ -47,6 +29,9 @@ impl QueuedIngestClient { kusto_client: KustoClient, options: QueuedIngestClientOptions, ) -> Self { + // TODO: add a validation check that the client provided is against the ingestion endpoint + // kusto_client.management_url() + let resource_manager = Arc::new(ResourceManager::new(kusto_client, options)); Self { resource_manager } diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index 57fdafc..20890ee 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -14,7 +14,7 @@ use azure_data_tables::prelude::TableClient; use azure_storage_blobs::prelude::ContainerClient; use azure_storage_queues::QueueClient; -use crate::queued_ingest::QueuedIngestClientOptions; +use crate::client_options::QueuedIngestClientOptions; use self::{ authorization_context::AuthorizationContext, diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs index f1fd600..1f2c094 100644 --- a/azure-kusto-ingest/src/resource_manager/authorization_context.rs +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -64,7 +64,8 @@ impl AuthorizationContext { table.rows.len() )) } - }.to_string(); + } + .to_string(); if kusto_identity_token.chars().all(char::is_whitespace) { return Err(anyhow::anyhow!("Kusto identity token is empty")); diff --git a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs index de779d0..db57c8c 100644 --- a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs +++ b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs @@ -12,7 +12,11 @@ pub struct RawIngestClientResources { } impl RawIngestClientResources { - fn get_resource_by_name(table: &TableV1, resource_name: String, err_if_not_found: bool) -> Result> { + fn get_resource_by_name( + table: &TableV1, + resource_name: String, + err_if_not_found: bool, + ) -> Result> { let storage_root_index = table .columns .iter() @@ -39,7 +43,7 @@ impl RawIngestClientResources { ))?) }) .collect(); - + if err_if_not_found && resource_uris.is_empty() { return Err(anyhow::anyhow!( "No {} resources found in the table", @@ -55,7 +59,7 @@ impl TryFrom<&TableV1> for RawIngestClientResources { type Error = anyhow::Error; fn try_from(table: &TableV1) -> std::result::Result { - println!("table: {:?}", table); + // println!("table: {:?}", table); let secured_ready_for_aggregation_queues = Self::get_resource_by_name(table, "SecuredReadyForAggregationQueue".to_string(), true)?; let failed_ingestions_queues = From 25109192d0b3f3d562fbe65706f2b3dc1eb1ae23 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Fri, 25 Aug 2023 15:59:27 +0100 Subject: [PATCH 12/44] remove duplicate definition of KustoIdentityToken --- azure-kusto-ingest/src/ingestion_blob_info.rs | 2 +- azure-kusto-ingest/src/resource_manager.rs | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/azure-kusto-ingest/src/ingestion_blob_info.rs b/azure-kusto-ingest/src/ingestion_blob_info.rs index 463b8ba..e7aa634 100644 --- a/azure-kusto-ingest/src/ingestion_blob_info.rs +++ b/azure-kusto-ingest/src/ingestion_blob_info.rs @@ -5,7 +5,7 @@ use crate::{ data_format::DataFormat, descriptors::BlobDescriptor, ingestion_properties::{IngestionProperties, ReportLevel, ReportMethod, ValidationPolicy}, - resource_manager::KustoIdentityToken, + resource_manager::authorization_context::KustoIdentityToken, }; // Basing the ingestion message on diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index 20890ee..e3cc98f 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -17,7 +17,7 @@ use azure_storage_queues::QueueClient; use crate::client_options::QueuedIngestClientOptions; use self::{ - authorization_context::AuthorizationContext, + authorization_context::{AuthorizationContext, KustoIdentityToken}, cache::{Cached, Refreshing}, resource_uri::{ClientFromResourceUri, ResourceUri}, }; @@ -278,8 +278,6 @@ impl IngestClientResources { // } } -pub type KustoIdentityToken = String; - /// ResourceManager is a struct that keeps track of all the resources required for ingestion using the queued flavour pub struct ResourceManager { ingest_client_resources: Arc, From eb955f855ce96fbfab763fd938ac0c3d0c93d6dd Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Fri, 25 Aug 2023 16:12:42 +0100 Subject: [PATCH 13/44] move ingest client resource around --- azure-kusto-ingest/src/resource_manager.rs | 263 +--------------- .../ingest_client_resources.rs | 297 +++++++++++++++++- 2 files changed, 282 insertions(+), 278 deletions(-) diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index e3cc98f..c648e9c 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -5,279 +5,20 @@ pub mod cache; pub mod ingest_client_resources; pub mod resource_uri; -use anyhow::{Ok, Result}; -use azure_core::ClientOptions; +use anyhow::Result; use azure_kusto_data::prelude::KustoClient; -use tokio::sync::RwLock; -use azure_data_tables::prelude::TableClient; -use azure_storage_blobs::prelude::ContainerClient; use azure_storage_queues::QueueClient; use crate::client_options::QueuedIngestClientOptions; use self::{ authorization_context::{AuthorizationContext, KustoIdentityToken}, - cache::{Cached, Refreshing}, - resource_uri::{ClientFromResourceUri, ResourceUri}, + ingest_client_resources::IngestClientResources, }; -use self::ingest_client_resources::RawIngestClientResources; - pub(crate) const RESOURCE_REFRESH_PERIOD: Duration = Duration::from_secs(60 * 60); -pub struct InnerIngestClientResources { - kusto_response: Option, - secured_ready_for_aggregation_queues: Vec, - temp_storage: Vec, - ingestions_status_tables: Vec, - successful_ingestions_queues: Vec, - failed_ingestions_queues: Vec, -} - -impl InnerIngestClientResources { - pub fn new() -> Self { - Self { - kusto_response: None, - secured_ready_for_aggregation_queues: Vec::new(), - temp_storage: Vec::new(), - ingestions_status_tables: Vec::new(), - successful_ingestions_queues: Vec::new(), - failed_ingestions_queues: Vec::new(), - } - } -} - -pub struct IngestClientResources { - client: KustoClient, - resources: Refreshing, - client_options: QueuedIngestClientOptions, -} - -impl IngestClientResources { - pub fn new(client: KustoClient, client_options: QueuedIngestClientOptions) -> Self { - Self { - client, - resources: Arc::new(RwLock::new(Cached::new( - InnerIngestClientResources::new(), - RESOURCE_REFRESH_PERIOD, - ))), - client_options, - } - } - - // TODO: Logic to get the Kusto identity token from Kusto management endpoint - handle any validation of the response from the query here - /// Executes a KQL management query that retrieves resource URIs for the various Azure resources used for ingestion - async fn execute_kql_mgmt_query(client: KustoClient) -> Result { - let results = client - .execute_command("NetDefaultDB", ".get ingestion resources", None) - .await?; - - let table = match results.tables.first() { - Some(a) => a, - None => { - return Err(anyhow::anyhow!( - "Kusto expected a table containing ingestion resource results, found no tables", - )) - } - }; - - RawIngestClientResources::try_from(table) - } - - fn create_clients_vec(resource_uris: &[ResourceUri], client_options: ClientOptions) -> Vec - where - T: ClientFromResourceUri, - { - resource_uris - .iter() - .map(|uri| T::create_client(uri.clone(), client_options.clone())) - .collect() - } - - fn update_clients_vec( - current_resources: Vec, - resource_uris: Vec, - client_options: ClientOptions, - ) -> Vec - where - T: ClientFromResourceUri, - { - if !current_resources.is_empty() { - Self::create_clients_vec(&resource_uris, client_options) - } else { - current_resources - } - } - - // 1. Get the kusto response - // 2. Update the kusto response, and the dependent resources if they are not empty, do this by a hashmap on the URI returned - // 3. Update the time - // 4. Return the kusto response - // As such, at any one time it is guaranteed that anything that has been queried before will be available and up to date - // Anything that has not been queried before will be available to create, but not as Azure clients until explicitly queried - /// - async fn update_from_kusto(&self) -> Result { - let resources = self.resources.read().await; - if !resources.is_expired() { - if let Some(ref inner_value) = resources.get().kusto_response { - return Ok(inner_value.clone()); - } - } - // otherwise, drop the read lock and get a write lock to refresh the kusto response - drop(resources); - let mut resources = self.resources.write().await; - - // check again in case another thread refreshed the while we were waiting on the write lock - if let Some(inner_value) = &resources.get().kusto_response { - return Ok(inner_value.clone()); - } - - let raw_ingest_client_resources = Self::execute_kql_mgmt_query(self.client.clone()).await?; - let mut_resources = resources.get_mut(); - - mut_resources.kusto_response = Some(raw_ingest_client_resources.clone()); - - mut_resources.secured_ready_for_aggregation_queues = Self::update_clients_vec( - mut_resources.secured_ready_for_aggregation_queues.clone(), - raw_ingest_client_resources - .secured_ready_for_aggregation_queues - .clone(), - self.client_options.queue_service.clone(), - ); - mut_resources.temp_storage = Self::update_clients_vec( - mut_resources.temp_storage.clone(), - raw_ingest_client_resources.temp_storage.clone(), - self.client_options.blob_service.clone(), - ); - mut_resources.ingestions_status_tables = Self::update_clients_vec( - mut_resources.ingestions_status_tables.clone(), - raw_ingest_client_resources.ingestions_status_tables.clone(), - self.client_options.table_service.clone(), - ); - mut_resources.successful_ingestions_queues = Self::update_clients_vec( - mut_resources.successful_ingestions_queues.clone(), - raw_ingest_client_resources - .successful_ingestions_queues - .clone(), - self.client_options.queue_service.clone(), - ); - mut_resources.failed_ingestions_queues = Self::update_clients_vec( - mut_resources.failed_ingestions_queues.clone(), - raw_ingest_client_resources.failed_ingestions_queues.clone(), - self.client_options.queue_service.clone(), - ); - Ok(raw_ingest_client_resources) - } - - // Logic here - // Get a read lock, try and return the secured ready for aggregation queues - // If they are not empty, return them - // Otherwise, drop the read lock and get a write lock - // Check again if they are empty, if not return them assuming something has changed in between - // Otherwise, get the kusto response, create the queues - // Store the queues, and also return them - pub async fn get_clients( - &self, - field_fn: F, - create_client_vec_fn: Fx, - set_value: Fy, - client_options: ClientOptions, - ) -> Result> - where - F: Fn(&InnerIngestClientResources) -> &Vec, - Fx: Fn(&RawIngestClientResources) -> &Vec, - Fy: Fn(&mut InnerIngestClientResources, &Vec), - T: ClientFromResourceUri + Clone, - { - let resources = self.resources.read().await; - if !resources.is_expired() { - let vecs = field_fn(resources.get()); - if !vecs.is_empty() { - return Ok(vecs.clone()); - } - } - - drop(resources); - - let raw_ingest_client_resources = self.update_from_kusto().await?; - - let mut resources = self.resources.write().await; - let vecs = field_fn(resources.get_mut()); - if !vecs.is_empty() { - return Ok(vecs.clone()); - } - - // First time, so create the resources outside - let mut_resources = resources.get_mut(); - let new_resources = Self::create_clients_vec( - create_client_vec_fn(&raw_ingest_client_resources), - client_options, - ); - set_value(mut_resources, &new_resources); - - Ok(new_resources) - } - - pub async fn get_secured_ready_for_aggregation_queues(&self) -> Result> { - self.get_clients( - |resources| &resources.secured_ready_for_aggregation_queues, - |resources| &resources.secured_ready_for_aggregation_queues, - |mut_resources, new_resources| { - mut_resources.secured_ready_for_aggregation_queues = new_resources.clone() - }, - self.client_options.queue_service.clone(), - ) - .await - } - - // pub async fn get_temp_storage(&self) -> Result> { - // self.get_clients( - // |resources| &resources.temp_storage, - // |resources| &resources.temp_storage, - // |mut_resources, new_resources| mut_resources.temp_storage = new_resources.clone(), - // self.client_options.blob_service.clone(), - // ) - // .await - // } - - // pub async fn get_ingestions_status_tables(&self) -> Result> { - // self.get_clients( - // |resources| &resources.ingestions_status_tables, - // |resources| &resources.ingestions_status_tables, - // |mut_resources, new_resources| { - // mut_resources.ingestions_status_tables = new_resources.clone() - // }, - // self.client_options.table_service.clone(), - // ) - // .await - // } - - // pub async fn get_successful_ingestions_queues(&self) -> Result> { - // self.get_clients( - // |resources| &resources.successful_ingestions_queues, - // |resources| &resources.successful_ingestions_queues, - // |mut_resources, new_resources| { - // mut_resources.successful_ingestions_queues = new_resources.clone() - // }, - // self.client_options.queue_service.clone(), - // ) - // .await - // } - - // pub async fn get_failed_ingestions_queues(&self) -> Result> { - // self.get_clients( - // |resources| &resources.failed_ingestions_queues, - // |resources| &resources.failed_ingestions_queues, - // |mut_resources, new_resources| { - // mut_resources.failed_ingestions_queues = new_resources.clone() - // }, - // self.client_options.queue_service.clone(), - // ) - // .await - // } -} - /// ResourceManager is a struct that keeps track of all the resources required for ingestion using the queued flavour pub struct ResourceManager { ingest_client_resources: Arc, diff --git a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs index db57c8c..1b3ef42 100644 --- a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs +++ b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs @@ -1,6 +1,15 @@ -use super::resource_uri::ResourceUri; +use std::sync::Arc; + +use crate::client_options::QueuedIngestClientOptions; + +use super::{resource_uri::{ResourceUri, ClientFromResourceUri}, cache::{Refreshing, Cached}, RESOURCE_REFRESH_PERIOD}; use anyhow::Result; -use azure_kusto_data::models::TableV1; +use azure_core::ClientOptions; +use azure_data_tables::prelude::TableClient; +use azure_kusto_data::{models::TableV1, prelude::KustoClient}; +use azure_storage_blobs::prelude::ContainerClient; +use azure_storage_queues::QueueClient; +use tokio::sync::RwLock; #[derive(Debug, Clone)] pub struct RawIngestClientResources { @@ -12,26 +21,28 @@ pub struct RawIngestClientResources { } impl RawIngestClientResources { + /// Helper to get a column index from a table + // TODO: this could be moved upstream - would likely result in a change to the API of this function to return an Option + // As such, error handling would still need to be done at use + fn get_column_index(table: &TableV1, column_name: &str) -> Result { + table + .columns + .iter() + .position(|c| c.column_name == column_name) + .ok_or(anyhow::anyhow!( + "{} column is missing in the table", + column_name + )) + } + + /// Helper to get a resource URI from a table fn get_resource_by_name( table: &TableV1, resource_name: String, err_if_not_found: bool, ) -> Result> { - let storage_root_index = table - .columns - .iter() - .position(|c| c.column_name == "StorageRoot") - .ok_or(anyhow::anyhow!( - "StorageRoot column is missing in the table" - ))?; - - let resource_type_name_index = table - .columns - .iter() - .position(|c| c.column_name == "ResourceTypeName") - .ok_or(anyhow::anyhow!( - "ResourceTypeName column is missing in the table" - ))?; + let storage_root_index = Self::get_column_index(table, "StorageRoot")?; + let resource_type_name_index = Self::get_column_index(table, "ResourceTypeName")?; let resource_uris: Vec> = table .rows @@ -79,3 +90,255 @@ impl TryFrom<&TableV1> for RawIngestClientResources { }) } } + +pub struct InnerIngestClientResources { + kusto_response: Option, + secured_ready_for_aggregation_queues: Vec, + temp_storage: Vec, + ingestions_status_tables: Vec, + successful_ingestions_queues: Vec, + failed_ingestions_queues: Vec, +} + +impl InnerIngestClientResources { + pub fn new() -> Self { + Self { + kusto_response: None, + secured_ready_for_aggregation_queues: Vec::new(), + temp_storage: Vec::new(), + ingestions_status_tables: Vec::new(), + successful_ingestions_queues: Vec::new(), + failed_ingestions_queues: Vec::new(), + } + } +} + +pub struct IngestClientResources { + client: KustoClient, + resources: Refreshing, + client_options: QueuedIngestClientOptions, +} + +impl IngestClientResources { + pub fn new(client: KustoClient, client_options: QueuedIngestClientOptions) -> Self { + Self { + client, + resources: Arc::new(RwLock::new(Cached::new( + InnerIngestClientResources::new(), + RESOURCE_REFRESH_PERIOD, + ))), + client_options, + } + } + + // TODO: Logic to get the Kusto identity token from Kusto management endpoint - handle any validation of the response from the query here + /// Executes a KQL management query that retrieves resource URIs for the various Azure resources used for ingestion + async fn execute_kql_mgmt_query(client: KustoClient) -> Result { + let results = client + .execute_command("NetDefaultDB", ".get ingestion resources", None) + .await?; + + let table = match results.tables.first() { + Some(a) => a, + None => { + return Err(anyhow::anyhow!( + "Kusto expected a table containing ingestion resource results, found no tables", + )) + } + }; + + RawIngestClientResources::try_from(table) + } + + fn create_clients_vec(resource_uris: &[ResourceUri], client_options: ClientOptions) -> Vec + where + T: ClientFromResourceUri, + { + resource_uris + .iter() + .map(|uri| T::create_client(uri.clone(), client_options.clone())) + .collect() + } + + fn update_clients_vec( + current_resources: Vec, + resource_uris: Vec, + client_options: ClientOptions, + ) -> Vec + where + T: ClientFromResourceUri, + { + if !current_resources.is_empty() { + Self::create_clients_vec(&resource_uris, client_options) + } else { + current_resources + } + } + + // 1. Get the kusto response + // 2. Update the kusto response, and the dependent resources if they are not empty, do this by a hashmap on the URI returned + // 3. Update the time + // 4. Return the kusto response + // As such, at any one time it is guaranteed that anything that has been queried before will be available and up to date + // Anything that has not been queried before will be available to create, but not as Azure clients until explicitly queried + /// + async fn update_from_kusto(&self) -> Result { + let resources = self.resources.read().await; + if !resources.is_expired() { + if let Some(ref inner_value) = resources.get().kusto_response { + return Ok(inner_value.clone()); + } + } + // otherwise, drop the read lock and get a write lock to refresh the kusto response + drop(resources); + let mut resources = self.resources.write().await; + + // check again in case another thread refreshed the while we were waiting on the write lock + if let Some(inner_value) = &resources.get().kusto_response { + return Ok(inner_value.clone()); + } + + let raw_ingest_client_resources = Self::execute_kql_mgmt_query(self.client.clone()).await?; + let mut_resources = resources.get_mut(); + + mut_resources.kusto_response = Some(raw_ingest_client_resources.clone()); + + mut_resources.secured_ready_for_aggregation_queues = Self::update_clients_vec( + mut_resources.secured_ready_for_aggregation_queues.clone(), + raw_ingest_client_resources + .secured_ready_for_aggregation_queues + .clone(), + self.client_options.queue_service.clone(), + ); + mut_resources.temp_storage = Self::update_clients_vec( + mut_resources.temp_storage.clone(), + raw_ingest_client_resources.temp_storage.clone(), + self.client_options.blob_service.clone(), + ); + mut_resources.ingestions_status_tables = Self::update_clients_vec( + mut_resources.ingestions_status_tables.clone(), + raw_ingest_client_resources.ingestions_status_tables.clone(), + self.client_options.table_service.clone(), + ); + mut_resources.successful_ingestions_queues = Self::update_clients_vec( + mut_resources.successful_ingestions_queues.clone(), + raw_ingest_client_resources + .successful_ingestions_queues + .clone(), + self.client_options.queue_service.clone(), + ); + mut_resources.failed_ingestions_queues = Self::update_clients_vec( + mut_resources.failed_ingestions_queues.clone(), + raw_ingest_client_resources.failed_ingestions_queues.clone(), + self.client_options.queue_service.clone(), + ); + Ok(raw_ingest_client_resources) + } + + // Logic here + // Get a read lock, try and return the secured ready for aggregation queues + // If they are not empty, return them + // Otherwise, drop the read lock and get a write lock + // Check again if they are empty, if not return them assuming something has changed in between + // Otherwise, get the kusto response, create the queues + // Store the queues, and also return them + pub async fn get_clients( + &self, + field_fn: F, + create_client_vec_fn: Fx, + set_value: Fy, + client_options: ClientOptions, + ) -> Result> + where + F: Fn(&InnerIngestClientResources) -> &Vec, + Fx: Fn(&RawIngestClientResources) -> &Vec, + Fy: Fn(&mut InnerIngestClientResources, &Vec), + T: ClientFromResourceUri + Clone, + { + let resources = self.resources.read().await; + if !resources.is_expired() { + let vecs = field_fn(resources.get()); + if !vecs.is_empty() { + return Ok(vecs.clone()); + } + } + + drop(resources); + + let raw_ingest_client_resources = self.update_from_kusto().await?; + + let mut resources = self.resources.write().await; + let vecs = field_fn(resources.get_mut()); + if !vecs.is_empty() { + return Ok(vecs.clone()); + } + + // First time, so create the resources outside + let mut_resources = resources.get_mut(); + let new_resources = Self::create_clients_vec( + create_client_vec_fn(&raw_ingest_client_resources), + client_options, + ); + set_value(mut_resources, &new_resources); + + Ok(new_resources) + } + + pub async fn get_secured_ready_for_aggregation_queues(&self) -> Result> { + self.get_clients( + |resources| &resources.secured_ready_for_aggregation_queues, + |resources| &resources.secured_ready_for_aggregation_queues, + |mut_resources, new_resources| { + mut_resources.secured_ready_for_aggregation_queues = new_resources.clone() + }, + self.client_options.queue_service.clone(), + ) + .await + } + + // pub async fn get_temp_storage(&self) -> Result> { + // self.get_clients( + // |resources| &resources.temp_storage, + // |resources| &resources.temp_storage, + // |mut_resources, new_resources| mut_resources.temp_storage = new_resources.clone(), + // self.client_options.blob_service.clone(), + // ) + // .await + // } + + // pub async fn get_ingestions_status_tables(&self) -> Result> { + // self.get_clients( + // |resources| &resources.ingestions_status_tables, + // |resources| &resources.ingestions_status_tables, + // |mut_resources, new_resources| { + // mut_resources.ingestions_status_tables = new_resources.clone() + // }, + // self.client_options.table_service.clone(), + // ) + // .await + // } + + // pub async fn get_successful_ingestions_queues(&self) -> Result> { + // self.get_clients( + // |resources| &resources.successful_ingestions_queues, + // |resources| &resources.successful_ingestions_queues, + // |mut_resources, new_resources| { + // mut_resources.successful_ingestions_queues = new_resources.clone() + // }, + // self.client_options.queue_service.clone(), + // ) + // .await + // } + + // pub async fn get_failed_ingestions_queues(&self) -> Result> { + // self.get_clients( + // |resources| &resources.failed_ingestions_queues, + // |resources| &resources.failed_ingestions_queues, + // |mut_resources, new_resources| { + // mut_resources.failed_ingestions_queues = new_resources.clone() + // }, + // self.client_options.queue_service.clone(), + // ) + // .await + // } +} From df96e97e0b6e39076440a7e2e45df8c61a177a1b Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Tue, 29 Aug 2023 16:41:53 +0100 Subject: [PATCH 14/44] simplify --- azure-kusto-ingest/Cargo.toml | 1 - azure-kusto-ingest/src/queued_ingest.rs | 16 +- azure-kusto-ingest/src/resource_manager.rs | 30 +- .../resource_manager/authorization_context.rs | 13 +- .../src/resource_manager/cache.rs | 4 - .../ingest_client_resources.rs | 362 ++++-------------- .../src/resource_manager/resource_uri.rs | 13 - azure-kusto-ingest/src/result.rs | 10 +- 8 files changed, 111 insertions(+), 338 deletions(-) diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index ca44670..7c251cf 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -19,6 +19,5 @@ azure_core = "0.13" azure_storage = "0.13" azure_storage_blobs = "0.13" azure_storage_queues = "0.13" -azure_data_tables = "0.13" chrono = { version = "0.4", features = ["serde"] } diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index 861f490..abd9fdd 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -69,17 +69,17 @@ impl QueuedIngestClient { let message = base64::encode(&message); // println!("message as base64 encoded string: {}\n", message); - let resp = queue_client.put_message(message).await?; + let _resp = queue_client.put_message(message).await?; // println!("resp: {:#?}\n", resp); - Ok(IngestionResult::new( - IngestionStatus::Queued, - &ingestion_properties.database_name, - &ingestion_properties.table_name, - blob_descriptor.source_id, - Some(blob_descriptor.uri()), - )) + Ok(IngestionResult { + status: IngestionStatus::Queued, + database: ingestion_properties.database_name, + table: ingestion_properties.table_name, + source_id: blob_descriptor.source_id, + blob_uri: Some(blob_descriptor.uri()), + }) } // /// Ingest a local file into Kusto diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index c648e9c..53f81c7 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -17,7 +17,7 @@ use self::{ ingest_client_resources::IngestClientResources, }; -pub(crate) const RESOURCE_REFRESH_PERIOD: Duration = Duration::from_secs(60 * 60); +pub const RESOURCE_REFRESH_PERIOD: Duration = Duration::from_secs(60 * 60); /// ResourceManager is a struct that keeps track of all the resources required for ingestion using the queued flavour pub struct ResourceManager { @@ -38,31 +38,15 @@ impl ResourceManager { } pub async fn secured_ready_for_aggregation_queues(&self) -> Result> { - self.ingest_client_resources - .get_secured_ready_for_aggregation_queues() - .await + Ok(self + .ingest_client_resources + .get() + .await? + .secured_ready_for_aggregation_queues) } // pub async fn temp_storage(&self) -> Result> { - // self.ingest_client_resources.get_temp_storage().await - // } - - // pub async fn ingestions_status_tables(&self) -> Result> { - // self.ingest_client_resources - // .get_ingestions_status_tables() - // .await - // } - - // pub async fn successful_ingestions_queues(&self) -> Result> { - // self.ingest_client_resources - // .get_successful_ingestions_queues() - // .await - // } - - // pub async fn failed_ingestions_queues(&self) -> Result> { - // self.ingest_client_resources - // .get_failed_ingestions_queues() - // .await + // Ok(self.ingest_client_resources.get().await?.temp_storage) // } pub async fn authorization_context(&self) -> Result { diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs index 1f2c094..f52fc7d 100644 --- a/azure-kusto-ingest/src/resource_manager/authorization_context.rs +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -27,8 +27,9 @@ impl AuthorizationContext { } /// Executes a KQL query to get the Kusto identity token from the management endpoint - async fn execute_kql_mgmt_query(client: KustoClient) -> Result { - let results = client + async fn execute_kql_mgmt_query(&self) -> Result { + let results = self + .client .execute_command("NetDefaultDB", ".get kusto identity token", None) .await?; @@ -89,12 +90,14 @@ impl AuthorizationContext { // Again attempt to return from cache, check is done in case another thread // refreshed the token while we were waiting on the write lock - if let Some(inner_value) = auth_context_cache.get() { - return Ok(inner_value.clone()); + if !auth_context_cache.is_expired() { + if let Some(inner_value) = auth_context_cache.get() { + return Ok(inner_value.clone()); + } } // Fetch new token from Kusto, update the cache, and return the token - let token = Self::execute_kql_mgmt_query(self.client.clone()).await?; + let token = self.execute_kql_mgmt_query().await?; auth_context_cache.update(Some(token.clone())); Ok(token) diff --git a/azure-kusto-ingest/src/resource_manager/cache.rs b/azure-kusto-ingest/src/resource_manager/cache.rs index 03a5215..a304996 100644 --- a/azure-kusto-ingest/src/resource_manager/cache.rs +++ b/azure-kusto-ingest/src/resource_manager/cache.rs @@ -27,10 +27,6 @@ impl Cached { &self.inner } - pub fn get_mut(&mut self) -> &mut T { - &mut self.inner - } - pub fn is_expired(&self) -> bool { self.last_updated.elapsed() > self.refresh_period } diff --git a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs index 1b3ef42..cd155a5 100644 --- a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs +++ b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs @@ -2,120 +2,97 @@ use std::sync::Arc; use crate::client_options::QueuedIngestClientOptions; -use super::{resource_uri::{ResourceUri, ClientFromResourceUri}, cache::{Refreshing, Cached}, RESOURCE_REFRESH_PERIOD}; +use super::{ + cache::{Cached, Refreshing}, + resource_uri::{ClientFromResourceUri, ResourceUri}, + RESOURCE_REFRESH_PERIOD, +}; use anyhow::Result; use azure_core::ClientOptions; -use azure_data_tables::prelude::TableClient; use azure_kusto_data::{models::TableV1, prelude::KustoClient}; use azure_storage_blobs::prelude::ContainerClient; use azure_storage_queues::QueueClient; use tokio::sync::RwLock; -#[derive(Debug, Clone)] -pub struct RawIngestClientResources { - pub secured_ready_for_aggregation_queues: Vec, - pub failed_ingestions_queues: Vec, - pub successful_ingestions_queues: Vec, - pub temp_storage: Vec, - pub ingestions_status_tables: Vec, +/// Helper to get a column index from a table +// TODO: this could be moved upstream into Kusto Data - would likely result in a change to the API of this function to return an Option +fn get_column_index(table: &TableV1, column_name: &str) -> Result { + table + .columns + .iter() + .position(|c| c.column_name == column_name) + .ok_or(anyhow::anyhow!( + "{} column is missing in the table", + column_name + )) } -impl RawIngestClientResources { - /// Helper to get a column index from a table - // TODO: this could be moved upstream - would likely result in a change to the API of this function to return an Option - // As such, error handling would still need to be done at use - fn get_column_index(table: &TableV1, column_name: &str) -> Result { - table - .columns - .iter() - .position(|c| c.column_name == column_name) - .ok_or(anyhow::anyhow!( - "{} column is missing in the table", - column_name - )) - } +/// Helper to get a resource URI from a table, erroring if there are no resources of the given name +fn get_resource_by_name(table: &TableV1, resource_name: String) -> Result> { + let storage_root_index = get_column_index(table, "StorageRoot")?; + let resource_type_name_index = get_column_index(table, "ResourceTypeName")?; + + let resource_uris: Vec> = table + .rows + .iter() + .filter(|r| r[resource_type_name_index] == resource_name) + .map(|r| { + ResourceUri::try_from(r[storage_root_index].as_str().ok_or(anyhow::anyhow!( + "Response returned from Kusto could not be parsed as a string" + ))?) + }) + .collect(); - /// Helper to get a resource URI from a table - fn get_resource_by_name( - table: &TableV1, - resource_name: String, - err_if_not_found: bool, - ) -> Result> { - let storage_root_index = Self::get_column_index(table, "StorageRoot")?; - let resource_type_name_index = Self::get_column_index(table, "ResourceTypeName")?; + if resource_uris.is_empty() { + return Err(anyhow::anyhow!( + "No {} resources found in the table", + resource_name + )); + } - let resource_uris: Vec> = table - .rows - .iter() - .filter(|r| r[resource_type_name_index] == resource_name) - .map(|r| { - ResourceUri::try_from(r[storage_root_index].as_str().ok_or(anyhow::anyhow!( - "Response returned from Kusto could not be parsed as a string" - ))?) - }) - .collect(); + resource_uris.into_iter().collect() +} - if err_if_not_found && resource_uris.is_empty() { - return Err(anyhow::anyhow!( - "No {} resources found in the table", - resource_name - )); - } +/// Helper to turn a vector of resource URIs into a vector of Azure clients of type T with the provided [ClientOptions] +fn create_clients_vec(resource_uris: &[ResourceUri], client_options: &ClientOptions) -> Vec +where + T: ClientFromResourceUri, +{ + resource_uris + .iter() + .map(|uri| T::create_client(uri.clone(), client_options.clone())) + .collect() +} - resource_uris.into_iter().collect() - } +/// Storage of the clients required for ingestion +#[derive(Debug, Clone)] +pub struct InnerIngestClientResources { + pub secured_ready_for_aggregation_queues: Vec, + pub temp_storage: Vec, } -impl TryFrom<&TableV1> for RawIngestClientResources { +impl TryFrom<(&TableV1, &QueuedIngestClientOptions)> for InnerIngestClientResources { type Error = anyhow::Error; - fn try_from(table: &TableV1) -> std::result::Result { - // println!("table: {:?}", table); + /// Attempts to create a new InnerIngestClientResources from the given [TableV1] and [QueuedIngestClientOptions] + fn try_from((table, client_options): (&TableV1, &QueuedIngestClientOptions)) -> Result { let secured_ready_for_aggregation_queues = - Self::get_resource_by_name(table, "SecuredReadyForAggregationQueue".to_string(), true)?; - let failed_ingestions_queues = - Self::get_resource_by_name(table, "FailedIngestionsQueue".to_string(), true)?; - let successful_ingestions_queues = - Self::get_resource_by_name(table, "SuccessfulIngestionsQueue".to_string(), true)?; - let temp_storage = Self::get_resource_by_name(table, "TempStorage".to_string(), true)?; - let ingestions_status_tables = - Self::get_resource_by_name(table, "IngestionsStatusTable".to_string(), true)?; + get_resource_by_name(table, "SecuredReadyForAggregationQueue".to_string())?; + let temp_storage = get_resource_by_name(table, "TempStorage".to_string())?; Ok(Self { - secured_ready_for_aggregation_queues, - failed_ingestions_queues, - successful_ingestions_queues, - temp_storage, - ingestions_status_tables, + secured_ready_for_aggregation_queues: create_clients_vec( + &secured_ready_for_aggregation_queues, + &client_options.queue_service, + ), + temp_storage: create_clients_vec(&temp_storage, &client_options.blob_service), }) } } -pub struct InnerIngestClientResources { - kusto_response: Option, - secured_ready_for_aggregation_queues: Vec, - temp_storage: Vec, - ingestions_status_tables: Vec, - successful_ingestions_queues: Vec, - failed_ingestions_queues: Vec, -} - -impl InnerIngestClientResources { - pub fn new() -> Self { - Self { - kusto_response: None, - secured_ready_for_aggregation_queues: Vec::new(), - temp_storage: Vec::new(), - ingestions_status_tables: Vec::new(), - successful_ingestions_queues: Vec::new(), - failed_ingestions_queues: Vec::new(), - } - } -} - pub struct IngestClientResources { client: KustoClient, - resources: Refreshing, + resources: Refreshing>, client_options: QueuedIngestClientOptions, } @@ -123,222 +100,49 @@ impl IngestClientResources { pub fn new(client: KustoClient, client_options: QueuedIngestClientOptions) -> Self { Self { client, - resources: Arc::new(RwLock::new(Cached::new( - InnerIngestClientResources::new(), - RESOURCE_REFRESH_PERIOD, - ))), + resources: Arc::new(RwLock::new(Cached::new(None, RESOURCE_REFRESH_PERIOD))), client_options, } } - // TODO: Logic to get the Kusto identity token from Kusto management endpoint - handle any validation of the response from the query here + /// Gets the latest resources from Kusto, updating the cached resources if they are expired /// Executes a KQL management query that retrieves resource URIs for the various Azure resources used for ingestion - async fn execute_kql_mgmt_query(client: KustoClient) -> Result { - let results = client + async fn execute_kql_mgmt_query(&self) -> Result { + let results = self + .client .execute_command("NetDefaultDB", ".get ingestion resources", None) .await?; - let table = match results.tables.first() { - Some(a) => a, - None => { - return Err(anyhow::anyhow!( - "Kusto expected a table containing ingestion resource results, found no tables", - )) - } - }; - - RawIngestClientResources::try_from(table) - } - - fn create_clients_vec(resource_uris: &[ResourceUri], client_options: ClientOptions) -> Vec - where - T: ClientFromResourceUri, - { - resource_uris - .iter() - .map(|uri| T::create_client(uri.clone(), client_options.clone())) - .collect() - } + let new_resources = results.tables.first().ok_or(anyhow::anyhow!( + "Kusto expected a table containing ingestion resource results, found no tables", + ))?; - fn update_clients_vec( - current_resources: Vec, - resource_uris: Vec, - client_options: ClientOptions, - ) -> Vec - where - T: ClientFromResourceUri, - { - if !current_resources.is_empty() { - Self::create_clients_vec(&resource_uris, client_options) - } else { - current_resources - } + InnerIngestClientResources::try_from((new_resources, &self.client_options)) } - // 1. Get the kusto response - // 2. Update the kusto response, and the dependent resources if they are not empty, do this by a hashmap on the URI returned - // 3. Update the time - // 4. Return the kusto response - // As such, at any one time it is guaranteed that anything that has been queried before will be available and up to date - // Anything that has not been queried before will be available to create, but not as Azure clients until explicitly queried - /// - async fn update_from_kusto(&self) -> Result { + /// Gets the latest resources either from cache, or fetching from Kusto and updating the cached resources + pub async fn get(&self) -> Result { let resources = self.resources.read().await; if !resources.is_expired() { - if let Some(ref inner_value) = resources.get().kusto_response { + if let Some(inner_value) = resources.get() { return Ok(inner_value.clone()); } } + // otherwise, drop the read lock and get a write lock to refresh the kusto response drop(resources); let mut resources = self.resources.write().await; - // check again in case another thread refreshed the while we were waiting on the write lock - if let Some(inner_value) = &resources.get().kusto_response { - return Ok(inner_value.clone()); - } - - let raw_ingest_client_resources = Self::execute_kql_mgmt_query(self.client.clone()).await?; - let mut_resources = resources.get_mut(); - - mut_resources.kusto_response = Some(raw_ingest_client_resources.clone()); - - mut_resources.secured_ready_for_aggregation_queues = Self::update_clients_vec( - mut_resources.secured_ready_for_aggregation_queues.clone(), - raw_ingest_client_resources - .secured_ready_for_aggregation_queues - .clone(), - self.client_options.queue_service.clone(), - ); - mut_resources.temp_storage = Self::update_clients_vec( - mut_resources.temp_storage.clone(), - raw_ingest_client_resources.temp_storage.clone(), - self.client_options.blob_service.clone(), - ); - mut_resources.ingestions_status_tables = Self::update_clients_vec( - mut_resources.ingestions_status_tables.clone(), - raw_ingest_client_resources.ingestions_status_tables.clone(), - self.client_options.table_service.clone(), - ); - mut_resources.successful_ingestions_queues = Self::update_clients_vec( - mut_resources.successful_ingestions_queues.clone(), - raw_ingest_client_resources - .successful_ingestions_queues - .clone(), - self.client_options.queue_service.clone(), - ); - mut_resources.failed_ingestions_queues = Self::update_clients_vec( - mut_resources.failed_ingestions_queues.clone(), - raw_ingest_client_resources.failed_ingestions_queues.clone(), - self.client_options.queue_service.clone(), - ); - Ok(raw_ingest_client_resources) - } - - // Logic here - // Get a read lock, try and return the secured ready for aggregation queues - // If they are not empty, return them - // Otherwise, drop the read lock and get a write lock - // Check again if they are empty, if not return them assuming something has changed in between - // Otherwise, get the kusto response, create the queues - // Store the queues, and also return them - pub async fn get_clients( - &self, - field_fn: F, - create_client_vec_fn: Fx, - set_value: Fy, - client_options: ClientOptions, - ) -> Result> - where - F: Fn(&InnerIngestClientResources) -> &Vec, - Fx: Fn(&RawIngestClientResources) -> &Vec, - Fy: Fn(&mut InnerIngestClientResources, &Vec), - T: ClientFromResourceUri + Clone, - { - let resources = self.resources.read().await; + // check again in case another thread refreshed while we were waiting on the write lock if !resources.is_expired() { - let vecs = field_fn(resources.get()); - if !vecs.is_empty() { - return Ok(vecs.clone()); + if let Some(inner_value) = resources.get() { + return Ok(inner_value.clone()); } } - drop(resources); - - let raw_ingest_client_resources = self.update_from_kusto().await?; - - let mut resources = self.resources.write().await; - let vecs = field_fn(resources.get_mut()); - if !vecs.is_empty() { - return Ok(vecs.clone()); - } - - // First time, so create the resources outside - let mut_resources = resources.get_mut(); - let new_resources = Self::create_clients_vec( - create_client_vec_fn(&raw_ingest_client_resources), - client_options, - ); - set_value(mut_resources, &new_resources); + let new_resources = self.execute_kql_mgmt_query().await?; + resources.update(Some(new_resources.clone())); Ok(new_resources) } - - pub async fn get_secured_ready_for_aggregation_queues(&self) -> Result> { - self.get_clients( - |resources| &resources.secured_ready_for_aggregation_queues, - |resources| &resources.secured_ready_for_aggregation_queues, - |mut_resources, new_resources| { - mut_resources.secured_ready_for_aggregation_queues = new_resources.clone() - }, - self.client_options.queue_service.clone(), - ) - .await - } - - // pub async fn get_temp_storage(&self) -> Result> { - // self.get_clients( - // |resources| &resources.temp_storage, - // |resources| &resources.temp_storage, - // |mut_resources, new_resources| mut_resources.temp_storage = new_resources.clone(), - // self.client_options.blob_service.clone(), - // ) - // .await - // } - - // pub async fn get_ingestions_status_tables(&self) -> Result> { - // self.get_clients( - // |resources| &resources.ingestions_status_tables, - // |resources| &resources.ingestions_status_tables, - // |mut_resources, new_resources| { - // mut_resources.ingestions_status_tables = new_resources.clone() - // }, - // self.client_options.table_service.clone(), - // ) - // .await - // } - - // pub async fn get_successful_ingestions_queues(&self) -> Result> { - // self.get_clients( - // |resources| &resources.successful_ingestions_queues, - // |resources| &resources.successful_ingestions_queues, - // |mut_resources, new_resources| { - // mut_resources.successful_ingestions_queues = new_resources.clone() - // }, - // self.client_options.queue_service.clone(), - // ) - // .await - // } - - // pub async fn get_failed_ingestions_queues(&self) -> Result> { - // self.get_clients( - // |resources| &resources.failed_ingestions_queues, - // |resources| &resources.failed_ingestions_queues, - // |mut_resources, new_resources| { - // mut_resources.failed_ingestions_queues = new_resources.clone() - // }, - // self.client_options.queue_service.clone(), - // ) - // .await - // } } diff --git a/azure-kusto-ingest/src/resource_manager/resource_uri.rs b/azure-kusto-ingest/src/resource_manager/resource_uri.rs index ae536b1..caa594c 100644 --- a/azure-kusto-ingest/src/resource_manager/resource_uri.rs +++ b/azure-kusto-ingest/src/resource_manager/resource_uri.rs @@ -1,5 +1,4 @@ use azure_core::ClientOptions; -use azure_data_tables::{clients::TableServiceClientBuilder, prelude::TableClient}; use azure_storage::StorageCredentials; use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient}; use azure_storage_queues::{QueueClient, QueueServiceClientBuilder}; @@ -89,15 +88,3 @@ impl ClientFromResourceUri for ContainerClient { .container_client(resource_uri.object_name()) } } - -impl ClientFromResourceUri for TableClient { - fn create_client(resource_uri: ResourceUri, client_options: ClientOptions) -> Self { - TableServiceClientBuilder::with_location(azure_storage::CloudLocation::Custom { - uri: resource_uri.service_uri().to_string(), - credentials: resource_uri.sas_token().clone(), - }) - .client_options(client_options) - .build() - .table_client(resource_uri.object_name()) - } -} diff --git a/azure-kusto-ingest/src/result.rs b/azure-kusto-ingest/src/result.rs index 1391dcf..6350de4 100644 --- a/azure-kusto-ingest/src/result.rs +++ b/azure-kusto-ingest/src/result.rs @@ -12,15 +12,15 @@ pub enum IngestionStatus { #[derive(Debug)] pub struct IngestionResult { // Will be `Queued` if the ingestion is queued, or `Success` if the ingestion is streaming and successful. - status: IngestionStatus, + pub status: IngestionStatus, // The name of the database where the ingestion was performed. - database: String, + pub database: String, // The name of the table where the ingestion was performed. - table: String, + pub table: String, // The source id of the ingestion. - source_id: Uuid, + pub source_id: Uuid, // The blob uri of the ingestion, if exists. - blob_uri: Option, + pub blob_uri: Option, } impl IngestionResult { From f9da7f2146396c40a411c8c0492b9346797942ea Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Wed, 30 Aug 2023 10:41:43 +0100 Subject: [PATCH 15/44] add some unit tests --- .../src/resource_manager/cache.rs | 40 ++++++ .../src/resource_manager/resource_uri.rs | 119 ++++++++++++++++-- 2 files changed, 151 insertions(+), 8 deletions(-) diff --git a/azure-kusto-ingest/src/resource_manager/cache.rs b/azure-kusto-ingest/src/resource_manager/cache.rs index a304996..061343c 100644 --- a/azure-kusto-ingest/src/resource_manager/cache.rs +++ b/azure-kusto-ingest/src/resource_manager/cache.rs @@ -38,3 +38,43 @@ impl Cached { } pub type Refreshing = Arc>>; + +#[cfg(test)] +mod tests { + use super::*; + use std::time::Duration; + + #[test] + fn test_cached_get() { + let value = "hello"; + let cached = Cached::new(value.to_string(), Duration::from_secs(60)); + + assert_eq!(cached.get(), value); + } + + #[test] + fn test_cached_is_expired() { + let value = "hello"; + let mut cached = Cached::new(value.to_string(), Duration::from_secs(60)); + + assert!(!cached.is_expired()); + + cached.last_updated = Instant::now() - Duration::from_secs(61); + + assert!(cached.is_expired()); + } + + #[test] + fn test_cached_update() { + let value = "hello"; + let mut cached = Cached::new(value.to_string(), Duration::from_secs(60)); + + assert_eq!(cached.get(), value); + + let new_value = "world"; + cached.update(new_value.to_string()); + + assert!(!cached.is_expired()); + assert_eq!(cached.get(), new_value); + } +} diff --git a/azure-kusto-ingest/src/resource_manager/resource_uri.rs b/azure-kusto-ingest/src/resource_manager/resource_uri.rs index caa594c..e78ca58 100644 --- a/azure-kusto-ingest/src/resource_manager/resource_uri.rs +++ b/azure-kusto-ingest/src/resource_manager/resource_uri.rs @@ -34,15 +34,26 @@ impl TryFrom<&str> for ResourceUri { fn try_from(uri: &str) -> Result { let parsed_uri = Url::parse(uri)?; - let service_uri = match parsed_uri.host_str() { - Some(host_str) => parsed_uri.scheme().to_string() + "://" + host_str, - None => return Err(anyhow::anyhow!("Host is missing in the URI")), + let scheme = match parsed_uri.scheme() { + "https" => "https".to_string(), + other_scheme => { + return Err(anyhow::anyhow!( + "URI scheme must be 'https', was '{other_scheme}'" + )) + } }; - let object_name = parsed_uri - .path() - .trim_start() - .trim_start_matches('/') - .to_string(); + + let service_uri = scheme + + "://" + + parsed_uri + .host_str() + .expect("Url::parse should always return a host for a URI"); + + let object_name = match parsed_uri.path().trim_start().trim_start_matches('/') { + "" => return Err(anyhow::anyhow!("Object name is missing in the URI")), + name => name.to_string(), + }; + let sas_token = match parsed_uri.query() { Some(query) => query.to_string(), None => { @@ -88,3 +99,95 @@ impl ClientFromResourceUri for ContainerClient { .container_client(resource_uri.object_name()) } } + +#[cfg(test)] +mod tests { + use super::*; + use std::convert::TryFrom; + + #[test] + fn resource_uri_try_from() { + let uri = "https://storageaccountname.blob.core.windows.com/containerobjectname?sas=token"; + let resource_uri = ResourceUri::try_from(uri).unwrap(); + + assert_eq!( + resource_uri.service_uri(), + "https://storageaccountname.blob.core.windows.com" + ); + assert_eq!(resource_uri.object_name(), "containerobjectname"); + + assert!(matches!( + resource_uri.sas_token(), + StorageCredentials::SASToken(_) + )); + + if let StorageCredentials::SASToken(sas_vec) = resource_uri.sas_token() { + assert_eq!(sas_vec.len(), 1); + assert_eq!(sas_vec[0].0, "sas"); + assert_eq!(sas_vec[0].1, "token"); + } + } + + #[test] + fn invalid_scheme() { + let uri = "http://storageaccountname.blob.core.windows.com/containerobjectname?sas=token"; + let resource_uri = ResourceUri::try_from(uri); + + assert!(resource_uri.is_err()); + } + + #[test] + fn missing_host_str() { + let uri = "https:"; + let resource_uri = ResourceUri::try_from(uri); + println!("{:#?}", resource_uri); + + assert!(resource_uri.is_err()); + } + + #[test] + fn missing_object_name() { + let uri = "https://storageaccountname.blob.core.windows.com/?sas=token"; + let resource_uri = ResourceUri::try_from(uri); + println!("{:#?}", resource_uri); + + assert!(resource_uri.is_err()); + } + + #[test] + fn missing_sas_token() { + let uri = "https://storageaccountname.blob.core.windows.com/containerobjectname"; + let resource_uri = ResourceUri::try_from(uri); + println!("{:#?}", resource_uri); + + assert!(resource_uri.is_err()); + } + + #[test] + fn queue_client_from_resource_uri() { + let resource_uri = ResourceUri { + service_uri: "https://mystorageaccount.queue.core.windows.net".to_string(), + object_name: "queuename".to_string(), + sas_token: StorageCredentials::SASToken(vec![("sas".to_string(), "token".to_string())]), + }; + + let client_options = ClientOptions::default(); + let queue_client = QueueClient::create_client(resource_uri, client_options); + + assert_eq!(queue_client.queue_name(), "queuename"); + } + + #[test] + fn container_client_from_resource_uri() { + let resource_uri = ResourceUri { + service_uri: "https://mystorageaccount.blob.core.windows.net".to_string(), + object_name: "containername".to_string(), + sas_token: StorageCredentials::SASToken(vec![("sas".to_string(), "token".to_string())]), + }; + + let client_options = ClientOptions::default(); + let container_client = ContainerClient::create_client(resource_uri, client_options); + + assert_eq!(container_client.container_name(), "containername"); + } +} From 5cb3a23da0e5293841ea62cdba99845dd65e43a6 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Wed, 30 Aug 2023 11:24:15 +0100 Subject: [PATCH 16/44] remove use of URL in blob descriptor --- azure-kusto-ingest/src/descriptors.rs | 45 +++------------------------ 1 file changed, 4 insertions(+), 41 deletions(-) diff --git a/azure-kusto-ingest/src/descriptors.rs b/azure-kusto-ingest/src/descriptors.rs index 89cde58..33e50df 100644 --- a/azure-kusto-ingest/src/descriptors.rs +++ b/azure-kusto-ingest/src/descriptors.rs @@ -1,7 +1,6 @@ -use url::Url; use uuid::Uuid; -/// Helper enum for authentication information on a blob +/// Helper for adding authentication information to a blob path in the format expected by Kusto #[derive(Clone)] pub enum BlobAuth { /// adds `?` to the blob path @@ -30,7 +29,7 @@ impl std::fmt::Debug for BlobAuth { /// Encapsulates the information related to a blob that is required to ingest from a blob #[derive(Debug, Clone)] pub struct BlobDescriptor { - uri: Url, + uri: String, pub(crate) size: Option, pub(crate) source_id: Uuid, /// Authentication information for the blob; when [None], the uri is passed through as is @@ -38,7 +37,7 @@ pub struct BlobDescriptor { } impl BlobDescriptor { - pub fn new(uri: Url, size: Option, source_id: Option) -> Self { + pub fn new(uri: String, size: Option, source_id: Option) -> Self { let source_id = match source_id { Some(source_id) => source_id, None => Uuid::new_v4(), @@ -62,9 +61,7 @@ impl BlobDescriptor { pub fn uri(&self) -> String { match &self.blob_auth { Some(BlobAuth::SASToken(sas_token)) => { - let mut uri = self.uri.clone(); - uri.set_query(Some(sas_token.as_str())); - uri.to_string() + format!("{}?{}", self.uri, sas_token.as_str()) } Some(BlobAuth::UserAssignedManagedIdentity(object_id)) => { format!("{};managed_identity={}", self.uri, object_id) @@ -76,37 +73,3 @@ impl BlobDescriptor { } } } - -// #[derive(Clone, Debug)] -// pub struct FileDescriptor { -// pub path: PathBuf, -// pub size: Option, -// pub source_id: Uuid, -// } - -// impl FileDescriptor { -// pub fn new(path: PathBuf, size: Option, source_id: Option) -> Self { -// unimplemented!() -// } -// } - -// #[derive(Clone, Debug)] -// pub struct StreamDescriptor { -// stream: Box, -// size: Option, -// source_id: Uuid, -// compressed: bool, -// stream_name: String, -// } - -// impl StreamDescriptor { -// pub fn new( -// stream: Box, -// size: Option, -// source_id: Option, -// compressed: bool, -// stream_name: String, -// ) -> Self { -// unimplemented!() -// } -// } From 9b4a9b2ce5a25880a27435519722a41a4c56e564 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Wed, 30 Aug 2023 11:59:37 +0100 Subject: [PATCH 17/44] add tests for descriptors --- azure-kusto-ingest/src/descriptors.rs | 66 +++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/azure-kusto-ingest/src/descriptors.rs b/azure-kusto-ingest/src/descriptors.rs index 33e50df..ed7f0e8 100644 --- a/azure-kusto-ingest/src/descriptors.rs +++ b/azure-kusto-ingest/src/descriptors.rs @@ -73,3 +73,69 @@ impl BlobDescriptor { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn blob_descriptor_with_no_auth_modification() { + let uri = "https://mystorageaccount.blob.core.windows.net/mycontainer/myblob"; + let blob_descriptor = BlobDescriptor::new(uri.to_string(), None, None); + + assert_eq!(blob_descriptor.uri(), uri); + } + + #[test] + fn blob_descriptor_with_sas_token() { + let uri = "https://mystorageaccount.blob.core.windows.net/mycontainer/myblob"; + let sas_token = "my_sas_token"; + let blob_descriptor = BlobDescriptor::new(uri.to_string(), None, None) + .with_blob_auth(BlobAuth::SASToken(sas_token.to_string())); + + assert_eq!(blob_descriptor.uri(), format!("{}?{}", uri, sas_token)); + } + + #[test] + fn blob_descriptor_with_user_assigned_managed_identity() { + let uri = "https://mystorageaccount.blob.core.windows.net/mycontainer/myblob"; + let object_id = "my_object_id"; + let blob_descriptor = BlobDescriptor::new(uri.to_string(), None, None) + .with_blob_auth(BlobAuth::UserAssignedManagedIdentity(object_id.to_string())); + + assert_eq!( + blob_descriptor.uri(), + format!("{};managed_identity={}", uri, object_id) + ); + } + + #[test] + fn blob_descriptor_with_system_assigned_managed_identity() { + let uri = "https://mystorageaccount.blob.core.windows.net/mycontainer/myblob"; + let blob_descriptor = BlobDescriptor::new(uri.to_string(), None, None) + .with_blob_auth(BlobAuth::SystemAssignedManagedIdentity); + + assert_eq!( + blob_descriptor.uri(), + format!("{};managed_identity=system", uri) + ); + } + + #[test] + fn blob_descriptor_with_size() { + let uri = "https://mystorageaccount.blob.core.windows.net/mycontainer/myblob"; + let size = 123; + let blob_descriptor = BlobDescriptor::new(uri.to_string(), Some(size), None); + + assert_eq!(blob_descriptor.size, Some(size)); + } + + #[test] + fn blob_descriptor_with_source_id() { + let uri = "https://mystorageaccount.blob.core.windows.net/mycontainer/myblob"; + let source_id = Uuid::new_v4(); + let blob_descriptor = BlobDescriptor::new(uri.to_string(), None, Some(source_id)); + + assert_eq!(blob_descriptor.source_id, source_id); + } +} From 691c255d9b06e5961f352ac6c5eac14c3b9098a1 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Wed, 30 Aug 2023 12:54:52 +0100 Subject: [PATCH 18/44] choose a random queue + more changes --- azure-kusto-ingest/Cargo.toml | 1 + azure-kusto-ingest/src/ingestion_blob_info.rs | 14 +++++---- azure-kusto-ingest/src/queued_ingest.rs | 29 +++++-------------- azure-kusto-ingest/src/resource_manager.rs | 1 + 4 files changed, 18 insertions(+), 27 deletions(-) diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index 7c251cf..aeebf86 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -20,4 +20,5 @@ azure_storage = "0.13" azure_storage_blobs = "0.13" azure_storage_queues = "0.13" chrono = { version = "0.4", features = ["serde"] } +rand = "0.8" diff --git a/azure-kusto-ingest/src/ingestion_blob_info.rs b/azure-kusto-ingest/src/ingestion_blob_info.rs index e7aa634..338c9c7 100644 --- a/azure-kusto-ingest/src/ingestion_blob_info.rs +++ b/azure-kusto-ingest/src/ingestion_blob_info.rs @@ -13,16 +13,18 @@ use crate::{ #[derive(Serialize, Debug)] #[serde(rename_all = "PascalCase")] pub struct QueuedIngestionMessage { - /// Message identifier (GUID) + /// Message identifier for this upload id: uuid::Uuid, /// Path (URI) to the blob, including the SAS key granting permissions to read/write/delete it. /// Permissions are required so that the ingestion service can delete the blob once it has completed ingesting the data. blob_path: String, - /// Target database name + // Name of the Kusto database the data will ingest into database_name: String, - /// Target table name + // Name of the Kusto table the the data will ingest into table_name: String, - /// Size of the uncompressed data in bytes. Providing this value allows the ingestion service to optimize ingestion by potentially aggregating multiple blobs. This property is optional, but if not given, the service will access the blob just to retrieve the size. + /// Size of the uncompressed data in bytes. + /// Providing this value allows the ingestion service to optimize ingestion by potentially aggregating multiple blobs. + /// Although this property is optional, it is recommended to provide the size as otherwise the service will access the blob just to retrieve the size. #[serde(skip_serializing_if = "Option::is_none")] raw_data_size: Option, /// If set to `true`, the blob won't be deleted once ingestion is successfully completed. Default is `false` @@ -31,11 +33,13 @@ pub struct QueuedIngestionMessage { /// If set to `true`, any aggregation will be skipped. Default is `false` #[serde(skip_serializing_if = "Option::is_none")] flush_immediately: Option, + /// Ignores the size limit for data ingestion #[serde(skip_serializing_if = "Option::is_none")] ignore_size_limit: Option, - // according to Go impl, the report level and method could be Option + /// Defines which if any ingestion states are reported #[serde(skip_serializing_if = "Option::is_none")] report_level: Option, + /// Defines which mechanisms are used to report the ingestion status #[serde(skip_serializing_if = "Option::is_none")] report_method: Option, source_message_creation_time: DateTime, diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index abd9fdd..8dd95b1 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use anyhow::Result; use azure_core::base64; use azure_kusto_data::prelude::KustoClient; +use rand::seq::SliceRandom; use crate::client_options::QueuedIngestClientOptions; use crate::descriptors::BlobDescriptor; @@ -56,21 +57,23 @@ impl QueuedIngestClient { let message = QueuedIngestionMessage::new(&blob_descriptor, &ingestion_properties, auth_context); - // println!("message as struct: {:#?}\n", message); - // TODO: pick a random queue from the queue clients returned by the resource manager - let queue_client = ingestion_queues.first().unwrap().clone(); + // Pick a random queue from the queue clients returned by the resource manager + let mut rng = rand::thread_rng(); + let queue_client = ingestion_queues + .choose(&mut rng) + .ok_or(anyhow::anyhow!("Failed to pick a random queue"))?; // println!("queue_client: {:#?}\n", queue_client); let message = serde_json::to_string(&message).unwrap(); // println!("message as string: {}\n", message); + // Base64 encode the ingestion message let message = base64::encode(&message); // println!("message as base64 encoded string: {}\n", message); let _resp = queue_client.put_message(message).await?; - // println!("resp: {:#?}\n", resp); Ok(IngestionResult { @@ -81,22 +84,4 @@ impl QueuedIngestClient { blob_uri: Some(blob_descriptor.uri()), }) } - - // /// Ingest a local file into Kusto - // pub async fn ingest_from_file( - // &self, - // file_descriptor: FileDescriptor, - // ingestion_properties: IngestionProperties, - // ) -> Result { - // unimplemented!() - // } - - // /// Ingest a stream into Kusto - // pub async fn ingest_from_stream( - // &self, - // stream_descriptor: StreamDescriptor, - // ingestion_properties: IngestionProperties, - // ) -> Result { - // unimplemented!() - // } } diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index 53f81c7..132f3fd 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -37,6 +37,7 @@ impl ResourceManager { } } + /// Returns the latest [QueueClient]s ready for posting ingestion messages to pub async fn secured_ready_for_aggregation_queues(&self) -> Result> { Ok(self .ingest_client_resources From c250a67a9e9fc2336bb3c18de11fa93e75811982 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Wed, 30 Aug 2023 14:14:10 +0100 Subject: [PATCH 19/44] simplify again to provide basic queued ingestion client supporting only identity mapping --- azure-kusto-ingest/Cargo.toml | 1 - azure-kusto-ingest/src/client_options.rs | 12 +-- azure-kusto-ingest/src/data_format.rs | 65 ------------- azure-kusto-ingest/src/descriptors.rs | 56 +++++------ azure-kusto-ingest/src/errors.rs | 6 -- azure-kusto-ingest/src/ingestion_blob_info.rs | 54 +---------- .../src/ingestion_properties.rs | 92 +------------------ azure-kusto-ingest/src/resource_manager.rs | 1 + 8 files changed, 37 insertions(+), 250 deletions(-) delete mode 100644 azure-kusto-ingest/src/errors.rs diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index aeebf86..4a41019 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -9,7 +9,6 @@ edition = "2021" uuid = {version = "1", features = ["v4", "serde"]} serde = { version = "1", features = ["serde_derive"] } serde_json = "1" -serde_repr = "0.1" url = "2" azure-kusto-data = {path = "../azure-kusto-data"} anyhow = "1.0.72" diff --git a/azure-kusto-ingest/src/client_options.rs b/azure-kusto-ingest/src/client_options.rs index bb880c6..c7b6bd8 100644 --- a/azure-kusto-ingest/src/client_options.rs +++ b/azure-kusto-ingest/src/client_options.rs @@ -5,7 +5,6 @@ use azure_core::ClientOptions; pub struct QueuedIngestClientOptions { pub queue_service: ClientOptions, pub blob_service: ClientOptions, - pub table_service: ClientOptions, } impl From for QueuedIngestClientOptions { @@ -13,8 +12,7 @@ impl From for QueuedIngestClientOptions { fn from(client_options: ClientOptions) -> Self { Self { queue_service: client_options.clone(), - blob_service: client_options.clone(), - table_service: client_options, + blob_service: client_options, } } } @@ -24,7 +22,6 @@ impl From for QueuedIngestClientOptions { pub struct QueuedIngestClientOptionsBuilder { queue_service: ClientOptions, blob_service: ClientOptions, - table_service: ClientOptions, } impl QueuedIngestClientOptionsBuilder { @@ -32,7 +29,6 @@ impl QueuedIngestClientOptionsBuilder { Self { queue_service: ClientOptions::default(), blob_service: ClientOptions::default(), - table_service: ClientOptions::default(), } } @@ -46,16 +42,10 @@ impl QueuedIngestClientOptionsBuilder { self } - pub fn with_table_service(mut self, table_service: ClientOptions) -> Self { - self.table_service = table_service; - self - } - pub fn build(self) -> QueuedIngestClientOptions { QueuedIngestClientOptions { queue_service: self.queue_service, blob_service: self.blob_service, - table_service: self.table_service, } } } diff --git a/azure-kusto-ingest/src/data_format.rs b/azure-kusto-ingest/src/data_format.rs index 03a42aa..655502f 100644 --- a/azure-kusto-ingest/src/data_format.rs +++ b/azure-kusto-ingest/src/data_format.rs @@ -1,23 +1,5 @@ use serde::Serialize; -#[derive(Serialize, Clone, Debug, Default)] -pub enum IngestionMappingKind { - #[serde(rename = "Csv")] - #[default] - CSV, - #[serde(rename = "Json")] - JSON, - Avro, - ApacheAvro, - Parquet, - SStream, - #[serde(rename = "Orc")] - ORC, - #[serde(rename = "W3CLogFile")] - W3CLOGFILE, - Unknown, -} - /// All data formats supported by Kusto /// Default is [DataFormat::CSV] #[derive(Serialize, Clone, Debug, Default)] @@ -42,50 +24,3 @@ pub enum DataFormat { TXT, W3CLOGFILE, } - -impl DataFormat { - pub fn ingestion_mapping_kind(self) -> IngestionMappingKind { - match self { - DataFormat::CSV => IngestionMappingKind::CSV, - DataFormat::TSV => IngestionMappingKind::CSV, - DataFormat::SCSV => IngestionMappingKind::CSV, - DataFormat::SOHsv => IngestionMappingKind::CSV, - DataFormat::PSV => IngestionMappingKind::CSV, - DataFormat::TXT => IngestionMappingKind::CSV, - DataFormat::TSVe => IngestionMappingKind::CSV, - DataFormat::JSON => IngestionMappingKind::JSON, - DataFormat::SingleJSON => IngestionMappingKind::JSON, - DataFormat::MultiJSON => IngestionMappingKind::JSON, - DataFormat::Avro => IngestionMappingKind::Avro, - DataFormat::ApacheAvro => IngestionMappingKind::ApacheAvro, - DataFormat::Parquet => IngestionMappingKind::Parquet, - DataFormat::SStream => IngestionMappingKind::SStream, - DataFormat::ORC => IngestionMappingKind::ORC, - DataFormat::RAW => IngestionMappingKind::CSV, - DataFormat::W3CLOGFILE => IngestionMappingKind::W3CLOGFILE, - } - } - - /// Binary formats should not be compressed - pub fn compressible(self) -> bool { - match self { - DataFormat::CSV => true, - DataFormat::TSV => true, - DataFormat::SCSV => true, - DataFormat::SOHsv => true, - DataFormat::PSV => true, - DataFormat::TXT => true, - DataFormat::TSVe => true, - DataFormat::JSON => true, - DataFormat::SingleJSON => true, - DataFormat::MultiJSON => true, - DataFormat::Avro => true, - DataFormat::ApacheAvro => true, - DataFormat::Parquet => false, - DataFormat::SStream => false, - DataFormat::ORC => false, - DataFormat::RAW => true, - DataFormat::W3CLOGFILE => true, - } - } -} diff --git a/azure-kusto-ingest/src/descriptors.rs b/azure-kusto-ingest/src/descriptors.rs index ed7f0e8..245d6e0 100644 --- a/azure-kusto-ingest/src/descriptors.rs +++ b/azure-kusto-ingest/src/descriptors.rs @@ -1,31 +1,5 @@ use uuid::Uuid; -/// Helper for adding authentication information to a blob path in the format expected by Kusto -#[derive(Clone)] -pub enum BlobAuth { - /// adds `?` to the blob path - SASToken(String), - /// adds `;managed_identity=` to the blob path - UserAssignedManagedIdentity(String), - /// adds `;managed_identity=system` to the blob path - SystemAssignedManagedIdentity, -} - -impl std::fmt::Debug for BlobAuth { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - BlobAuth::SASToken(_) => f.debug_struct("SASToken").finish(), - BlobAuth::UserAssignedManagedIdentity(object_id) => f - .debug_struct("UserAssignedManagedIdentity") - .field("object_id", object_id) - .finish(), - BlobAuth::SystemAssignedManagedIdentity => { - f.debug_struct("SystemAssignedManagedIdentity").finish() - } - } - } -} - /// Encapsulates the information related to a blob that is required to ingest from a blob #[derive(Debug, Clone)] pub struct BlobDescriptor { @@ -57,8 +31,8 @@ impl BlobDescriptor { self } - /// Returns the uri with the authentication information added - pub fn uri(&self) -> String { + /// Returns the uri with the authentication information added, ready to be serialized into the ingestion message + pub(crate) fn uri(&self) -> String { match &self.blob_auth { Some(BlobAuth::SASToken(sas_token)) => { format!("{}?{}", self.uri, sas_token.as_str()) @@ -74,6 +48,32 @@ impl BlobDescriptor { } } +/// Helper for adding authentication information to a blob path in the format expected by Kusto +#[derive(Clone)] +pub enum BlobAuth { + /// adds `?` to the blob path + SASToken(String), + /// adds `;managed_identity=` to the blob path + UserAssignedManagedIdentity(String), + /// adds `;managed_identity=system` to the blob path + SystemAssignedManagedIdentity, +} + +impl std::fmt::Debug for BlobAuth { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + BlobAuth::SASToken(_) => f.debug_struct("SASToken").finish(), + BlobAuth::UserAssignedManagedIdentity(object_id) => f + .debug_struct("UserAssignedManagedIdentity") + .field("object_id", object_id) + .finish(), + BlobAuth::SystemAssignedManagedIdentity => { + f.debug_struct("SystemAssignedManagedIdentity").finish() + } + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/azure-kusto-ingest/src/errors.rs b/azure-kusto-ingest/src/errors.rs deleted file mode 100644 index f833f07..0000000 --- a/azure-kusto-ingest/src/errors.rs +++ /dev/null @@ -1,6 +0,0 @@ -pub enum KustoClientError { - KustoMappingError, - KustoDuplicateMappingError, - KustoMissingMappingError, - KustoInvalidEndpointError, -} \ No newline at end of file diff --git a/azure-kusto-ingest/src/ingestion_blob_info.rs b/azure-kusto-ingest/src/ingestion_blob_info.rs index 338c9c7..3d5db79 100644 --- a/azure-kusto-ingest/src/ingestion_blob_info.rs +++ b/azure-kusto-ingest/src/ingestion_blob_info.rs @@ -2,9 +2,8 @@ use chrono::{DateTime, Utc}; use serde::Serialize; use crate::{ - data_format::DataFormat, - descriptors::BlobDescriptor, - ingestion_properties::{IngestionProperties, ReportLevel, ReportMethod, ValidationPolicy}, + data_format::DataFormat, descriptors::BlobDescriptor, + ingestion_properties::IngestionProperties, resource_manager::authorization_context::KustoIdentityToken, }; @@ -30,18 +29,9 @@ pub struct QueuedIngestionMessage { /// If set to `true`, the blob won't be deleted once ingestion is successfully completed. Default is `false` #[serde(skip_serializing_if = "Option::is_none")] retain_blob_on_success: Option, - /// If set to `true`, any aggregation will be skipped. Default is `false` + /// If set to `true`, any server side aggregation will be skipped. Default is `false` #[serde(skip_serializing_if = "Option::is_none")] flush_immediately: Option, - /// Ignores the size limit for data ingestion - #[serde(skip_serializing_if = "Option::is_none")] - ignore_size_limit: Option, - /// Defines which if any ingestion states are reported - #[serde(skip_serializing_if = "Option::is_none")] - report_level: Option, - /// Defines which mechanisms are used to report the ingestion status - #[serde(skip_serializing_if = "Option::is_none")] - report_method: Option, source_message_creation_time: DateTime, additional_properties: AdditionalProperties, } @@ -52,18 +42,9 @@ impl QueuedIngestionMessage { ingestion_properties: &IngestionProperties, authorization_context: KustoIdentityToken, ) -> Self { - // TODO: processing of additional tags, ingest_by_tags, drop_by_tags into just tags - let additional_properties = AdditionalProperties { - ingestion_mapping: None, - ingestion_mapping_reference: None, - creation_time: ingestion_properties.creation_time, - data_format: ingestion_properties.data_format.clone(), - ingest_if_not_exists: None, - ignore_first_record: ingestion_properties.ignore_first_record, - tags: vec![], - validation_policy: None, authorization_context, + data_format: ingestion_properties.data_format.clone(), }; Self { @@ -74,9 +55,6 @@ impl QueuedIngestionMessage { table_name: ingestion_properties.table_name.clone(), retain_blob_on_success: ingestion_properties.retain_blob_on_success, flush_immediately: ingestion_properties.flush_immediately, - report_level: ingestion_properties.report_level.clone(), - report_method: ingestion_properties.report_method.clone(), - ignore_size_limit: Some(false), source_message_creation_time: Utc::now(), additional_properties, } @@ -90,28 +68,4 @@ struct AdditionalProperties { authorization_context: KustoIdentityToken, #[serde(rename = "format")] data_format: DataFormat, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(rename = "ingestionMapping")] - ingestion_mapping: Option, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(rename = "ingestionMappingReference")] - ingestion_mapping_reference: Option, - // TODO: is this required? - // #[serde(skip_serializing_if = "Option::is_none")] - // #[serde(rename = "ingestionMappingType")] - // ingestion_mapping_type: Option, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(rename = "creationTime")] - creation_time: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(rename = "ingestIfNotExists")] - ingest_if_not_exists: Option, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(rename = "ignoreFirstRecord")] - ignore_first_record: Option, - #[serde(skip_serializing_if = "Vec::is_empty")] - tags: Vec, - #[serde(rename = "validationPolicy")] - #[serde(skip_serializing_if = "Option::is_none")] - validation_policy: Option, } diff --git a/azure-kusto-ingest/src/ingestion_properties.rs b/azure-kusto-ingest/src/ingestion_properties.rs index bb86dda..031505f 100644 --- a/azure-kusto-ingest/src/ingestion_properties.rs +++ b/azure-kusto-ingest/src/ingestion_properties.rs @@ -1,9 +1,7 @@ -use crate::data_format::{DataFormat, IngestionMappingKind}; +use crate::data_format::DataFormat; use chrono::{DateTime, Utc}; -use serde::Serialize; -use serde_repr::Serialize_repr; -/// Properties used when ingesting data into Kusto, allowing for customisation of the ingestion process +/// Properties of ingestion that can be used when ingesting data into Kusto allowing for customisation of the ingestion process #[derive(Clone, Debug, Default)] pub struct IngestionProperties { /// Name of the database to ingest into @@ -14,92 +12,8 @@ pub struct IngestionProperties { pub retain_blob_on_success: Option, /// Format of the data being ingested pub data_format: DataFormat, - // TODO: ingestion mappings could likely be made neater by using enums to enforce checks - pub ingestion_mapping: Option>, - pub ingestion_mapping_type: Option, - pub ingestion_mapping_reference: Option>, - pub additional_tags: Vec, - pub ingest_if_not_exists: Vec, - pub ingest_by_tags: Vec, - pub drop_by_tags: Vec, + /// If set to `true`, any aggregation will be skipped. Default is `false` pub flush_immediately: Option, - pub ignore_first_record: Option, - pub report_level: Option, - pub report_method: Option, - pub validation_policy: Option, /// Allows for configurability of the `creationTime` property pub creation_time: Option>, } - -#[derive(Serialize, Clone, Debug)] -#[serde(rename_all = "PascalCase")] -pub struct ValidationPolicy { - validation_options: ValidationOptions, - validation_implications: ValidationImplications, -} - -#[derive(Serialize_repr, Clone, Debug)] -#[repr(u8)] -pub enum ValidationOptions { - DoNotValidate = 0, - ValidateCsvInputConstantColumns = 1, - ValidateCsvInputColumnLevelOnly = 2, -} - -#[derive(Serialize_repr, Clone, Debug)] -#[repr(u8)] -pub enum ValidationImplications { - Fail = 0, - BestEffort = 1, -} - -#[derive(Serialize_repr, Clone, Debug)] -#[repr(u8)] -pub enum ReportLevel { - Failures = 0, - None = 1, - All = 2, -} - -#[derive(Serialize_repr, Clone, Debug)] -#[repr(u8)] -pub enum ReportMethod { - Queue = 0, - Table = 1, -} - -#[derive(Serialize, Clone, Debug)] -pub enum TransformationMethod { - PropertyBagArrayToDictionary, - SourceLocation, - SourceLineNumber, - DateTimeFromUnixSeconds, - DateTimeFromUnixMilliseconds, - DateTimeFromUnixMicroseconds, - DateTimeFromUnixNanoseconds, - DropMappedFields, - BytesAsBase64, -} - -/// Use this class to create mappings for IngestionProperties.ingestionMappings and utilize mappings that were not -/// pre-created (it is recommended to create the mappings in advance and use ingestionMappingReference). -/// To read more about mappings look here: https://docs.microsoft.com/en-us/azure/kusto/management/mappings -#[derive(Serialize, Clone, Debug)] -#[serde(rename_all = "PascalCase")] -pub struct ColumnMapping { - column: String, - // TODO: can this be an enum? - data_type: String, - properties: ColumnMappingProperties, -} - -#[derive(Serialize, Clone, Debug)] -#[serde(rename_all = "PascalCase")] -pub struct ColumnMappingProperties { - path: Option, - transform: Option, - // TODO: This should get serialized to a string - ordinal: Option, - const_value: Option, - field: Option, -} diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index 132f3fd..d1524c2 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -46,6 +46,7 @@ impl ResourceManager { .secured_ready_for_aggregation_queues) } + /// Returns the latest [ContainerClient]s for temporary storage, where local data can be uploaded to Kusto-owned storage prior to ingestion // pub async fn temp_storage(&self) -> Result> { // Ok(self.ingest_client_resources.get().await?.temp_storage) // } From 2951b13853781cb05f384a5cc83b51e55c197082 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Wed, 30 Aug 2023 14:32:54 +0100 Subject: [PATCH 20/44] remove more stuff --- azure-kusto-ingest/Cargo.toml | 22 ++++++------- azure-kusto-ingest/src/data_format.rs | 13 +++++++- azure-kusto-ingest/src/lib.rs | 1 - azure-kusto-ingest/src/queued_ingest.rs | 11 ++----- azure-kusto-ingest/src/result.rs | 42 ------------------------- 5 files changed, 25 insertions(+), 64 deletions(-) delete mode 100644 azure-kusto-ingest/src/result.rs diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index 4a41019..dfb13d3 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -6,18 +6,18 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -uuid = {version = "1", features = ["v4", "serde"]} -serde = { version = "1", features = ["serde_derive"] } -serde_json = "1" -url = "2" azure-kusto-data = {path = "../azure-kusto-data"} -anyhow = "1.0.72" -tokio = { version = "1", features = ["full"] } +# Azure SDK for Rust crates versions should be kept in sync +azure_core = "0.14" +azure_storage = "0.14" +azure_storage_blobs = "0.14" +azure_storage_queues = "0.14" -azure_core = "0.13" -azure_storage = "0.13" -azure_storage_blobs = "0.13" -azure_storage_queues = "0.13" +anyhow = "1" chrono = { version = "0.4", features = ["serde"] } rand = "0.8" - +serde = { version = "1", features = ["serde_derive"] } +serde_json = "1" +tokio = { version = "1", features = ["full"] } +url = "2" +uuid = {version = "1", features = ["v4", "serde"]} diff --git a/azure-kusto-ingest/src/data_format.rs b/azure-kusto-ingest/src/data_format.rs index 655502f..824ba1c 100644 --- a/azure-kusto-ingest/src/data_format.rs +++ b/azure-kusto-ingest/src/data_format.rs @@ -2,7 +2,7 @@ use serde::Serialize; /// All data formats supported by Kusto /// Default is [DataFormat::CSV] -#[derive(Serialize, Clone, Debug, Default)] +#[derive(Serialize, Clone, Debug, Default, PartialEq)] #[serde(rename_all = "lowercase")] pub enum DataFormat { ApacheAvro, @@ -24,3 +24,14 @@ pub enum DataFormat { TXT, W3CLOGFILE, } + +// Unit tests +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn data_format_default() { + assert_eq!(DataFormat::default(), DataFormat::CSV); + } +} diff --git a/azure-kusto-ingest/src/lib.rs b/azure-kusto-ingest/src/lib.rs index 4cb9ce8..eecc220 100644 --- a/azure-kusto-ingest/src/lib.rs +++ b/azure-kusto-ingest/src/lib.rs @@ -5,4 +5,3 @@ pub(crate) mod ingestion_blob_info; pub mod ingestion_properties; pub mod queued_ingest; pub(crate) mod resource_manager; -pub(crate) mod result; diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index 8dd95b1..5a882a0 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -10,7 +10,6 @@ use crate::descriptors::BlobDescriptor; use crate::ingestion_blob_info::QueuedIngestionMessage; use crate::ingestion_properties::IngestionProperties; use crate::resource_manager::ResourceManager; -use crate::result::{IngestionResult, IngestionStatus}; /// Client for ingesting data into Kusto using the queued flavour of ingestion #[derive(Clone)] @@ -43,7 +42,7 @@ impl QueuedIngestClient { &self, blob_descriptor: BlobDescriptor, ingestion_properties: IngestionProperties, - ) -> Result { + ) -> Result<()> { // The queues returned here should ideally be the storage queue client from azure-storage-queue // As such, it may be better for ResourceManager to return a struct that contains the storage queue client let ingestion_queues = self @@ -76,12 +75,6 @@ impl QueuedIngestClient { let _resp = queue_client.put_message(message).await?; // println!("resp: {:#?}\n", resp); - Ok(IngestionResult { - status: IngestionStatus::Queued, - database: ingestion_properties.database_name, - table: ingestion_properties.table_name, - source_id: blob_descriptor.source_id, - blob_uri: Some(blob_descriptor.uri()), - }) + Ok(()) } } diff --git a/azure-kusto-ingest/src/result.rs b/azure-kusto-ingest/src/result.rs deleted file mode 100644 index 6350de4..0000000 --- a/azure-kusto-ingest/src/result.rs +++ /dev/null @@ -1,42 +0,0 @@ -use uuid::Uuid; - -#[derive(Debug)] -pub enum IngestionStatus { - // The ingestion was queued. - Queued, - // The ingestion was successfully streamed - Success, -} - -// The result of an ingestion. -#[derive(Debug)] -pub struct IngestionResult { - // Will be `Queued` if the ingestion is queued, or `Success` if the ingestion is streaming and successful. - pub status: IngestionStatus, - // The name of the database where the ingestion was performed. - pub database: String, - // The name of the table where the ingestion was performed. - pub table: String, - // The source id of the ingestion. - pub source_id: Uuid, - // The blob uri of the ingestion, if exists. - pub blob_uri: Option, -} - -impl IngestionResult { - pub fn new( - status: IngestionStatus, - database: &str, - table: &str, - source_id: Uuid, - blob_uri: Option, - ) -> Self { - Self { - status, - database: database.to_owned(), - table: table.to_owned(), - source_id, - blob_uri, - } - } -} From 90f968ab516e81e189043e0a7ad7696212ba4d95 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Wed, 30 Aug 2023 15:26:23 +0100 Subject: [PATCH 21/44] add basic example + more tidying --- .../examples/ingest_from_blob.rs | 56 +++++++++++++++++++ azure-kusto-ingest/src/data_format.rs | 2 +- .../src/ingestion_properties.rs | 7 +-- 3 files changed, 60 insertions(+), 5 deletions(-) create mode 100644 azure-kusto-ingest/examples/ingest_from_blob.rs diff --git a/azure-kusto-ingest/examples/ingest_from_blob.rs b/azure-kusto-ingest/examples/ingest_from_blob.rs new file mode 100644 index 0000000..93fcd64 --- /dev/null +++ b/azure-kusto-ingest/examples/ingest_from_blob.rs @@ -0,0 +1,56 @@ +use anyhow::Result; +use azure_kusto_data::prelude::{ConnectionString, KustoClient, KustoClientOptions}; +use azure_kusto_ingest::data_format::DataFormat; +use azure_kusto_ingest::descriptors::{BlobAuth, BlobDescriptor}; +use azure_kusto_ingest::ingestion_properties::IngestionProperties; +use azure_kusto_ingest::queued_ingest::QueuedIngestClient; + +/// Example of ingesting data into Kusto from Azure Blob Storage using managed identities +/// +/// There are some steps that need to be taken to allow for managed identities to work: +/// - Permissions as the ingestor to initiate ingestion +/// https://learn.microsoft.com/en-us/azure/data-explorer/kusto/api/netfx/kusto-ingest-client-permissions +/// - Permissions for Kusto to access storage +/// https://learn.microsoft.com/en-us/azure/data-explorer/ingest-data-managed-identity +#[tokio::main] +async fn main() -> Result<()> { + let cluster_uri = "https://ingest-..kusto.windows.net"; + let user_mi_object_id = ""; + + // Create a Kusto client with managed identity authentication via the user assigned identity + // Note that this requires + let kusto_client = KustoClient::new( + ConnectionString::with_managed_identity_auth( + cluster_uri, + Some(user_mi_object_id.to_string()), + ), + KustoClientOptions::default(), + )?; + + // Create a queued ingest client + let queued_ingest_client = QueuedIngestClient::new(kusto_client); + + // Define ingestion properties + let ingestion_properties = IngestionProperties { + database_name: "".into(), + table_name: "".into(), + // Don't delete the blob on successful ingestion + retain_blob_on_success: Some(true), + // File format of the blob is Parquet + data_format: DataFormat::Parquet, + // Assume the server side default for flush_immediately + flush_immediately: None, + }; + + // Define the blob to ingest from + let blob_uri = "https://.blob.core.windows.net/"; + // Define the size of the blob if known, this improves ingestion performance as Kusto does not need to access the blob to determine the size + let blob_size = 123; + // Create the blob descriptor, also specifying that the blob should be accessed using the system assigned managed identity of the Kusto cluster + let blob_descriptor = BlobDescriptor::new(blob_uri.to_string(), Some(blob_size), None) + .with_blob_auth(BlobAuth::SystemAssignedManagedIdentity); + + queued_ingest_client + .ingest_from_blob(blob_descriptor, ingestion_properties) + .await +} diff --git a/azure-kusto-ingest/src/data_format.rs b/azure-kusto-ingest/src/data_format.rs index 824ba1c..6dd52e6 100644 --- a/azure-kusto-ingest/src/data_format.rs +++ b/azure-kusto-ingest/src/data_format.rs @@ -1,6 +1,6 @@ use serde::Serialize; -/// All data formats supported by Kusto +/// All data formats supported by Kusto. /// Default is [DataFormat::CSV] #[derive(Serialize, Clone, Debug, Default, PartialEq)] #[serde(rename_all = "lowercase")] diff --git a/azure-kusto-ingest/src/ingestion_properties.rs b/azure-kusto-ingest/src/ingestion_properties.rs index 031505f..1c99141 100644 --- a/azure-kusto-ingest/src/ingestion_properties.rs +++ b/azure-kusto-ingest/src/ingestion_properties.rs @@ -1,5 +1,4 @@ use crate::data_format::DataFormat; -use chrono::{DateTime, Utc}; /// Properties of ingestion that can be used when ingesting data into Kusto allowing for customisation of the ingestion process #[derive(Clone, Debug, Default)] @@ -8,12 +7,12 @@ pub struct IngestionProperties { pub database_name: String, /// Name of the table to ingest into pub table_name: String, - /// Whether the blob is retained after ingestion, note that this requires extra permissions + /// Whether the blob is retained after ingestion. + /// Note that the default when not provided is `false`, meaning that Kusto will attempt to delete the blob upon ingestion. + /// This will only be successful if provided sufficient permissions on the blob pub retain_blob_on_success: Option, /// Format of the data being ingested pub data_format: DataFormat, /// If set to `true`, any aggregation will be skipped. Default is `false` pub flush_immediately: Option, - /// Allows for configurability of the `creationTime` property - pub creation_time: Option>, } From 10d1d9cc3ecba73a61a5158737fee118a53a6ed9 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Wed, 30 Aug 2023 16:31:42 +0100 Subject: [PATCH 22/44] change to import from qualification --- azure-kusto-ingest/src/ingestion_blob_info.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/azure-kusto-ingest/src/ingestion_blob_info.rs b/azure-kusto-ingest/src/ingestion_blob_info.rs index 3d5db79..5b2fb80 100644 --- a/azure-kusto-ingest/src/ingestion_blob_info.rs +++ b/azure-kusto-ingest/src/ingestion_blob_info.rs @@ -1,5 +1,6 @@ use chrono::{DateTime, Utc}; use serde::Serialize; +use uuid::Uuid; use crate::{ data_format::DataFormat, descriptors::BlobDescriptor, @@ -13,7 +14,7 @@ use crate::{ #[serde(rename_all = "PascalCase")] pub struct QueuedIngestionMessage { /// Message identifier for this upload - id: uuid::Uuid, + id: Uuid, /// Path (URI) to the blob, including the SAS key granting permissions to read/write/delete it. /// Permissions are required so that the ingestion service can delete the blob once it has completed ingesting the data. blob_path: String, From 0983454277a5f29f1d9b680709f460998a028e8b Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Thu, 31 Aug 2023 09:59:50 +0100 Subject: [PATCH 23/44] change rng to allow multiple threads --- azure-kusto-ingest/src/queued_ingest.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index 5a882a0..57e78c9 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -3,7 +3,9 @@ use std::sync::Arc; use anyhow::Result; use azure_core::base64; use azure_kusto_data::prelude::KustoClient; +use rand::rngs::StdRng; use rand::seq::SliceRandom; +use rand::SeedableRng; use crate::client_options::QueuedIngestClientOptions; use crate::descriptors::BlobDescriptor; @@ -59,7 +61,7 @@ impl QueuedIngestClient { // println!("message as struct: {:#?}\n", message); // Pick a random queue from the queue clients returned by the resource manager - let mut rng = rand::thread_rng(); + let mut rng: StdRng = SeedableRng::from_entropy(); let queue_client = ingestion_queues .choose(&mut rng) .ok_or(anyhow::anyhow!("Failed to pick a random queue"))?; From b72d2a9aba1a329de993b4f50a7e5e60fcf50639 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Thu, 31 Aug 2023 10:53:31 +0100 Subject: [PATCH 24/44] handle authorization context token correctly --- .../src/resource_manager/authorization_context.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs index f52fc7d..c0c5a0b 100644 --- a/azure-kusto-ingest/src/resource_manager/authorization_context.rs +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -65,14 +65,20 @@ impl AuthorizationContext { table.rows.len() )) } - } - .to_string(); + }; + + // Convert the JSON string into a Rust string + let kusto_identity_token = kusto_identity_token + .as_str() + .ok_or(anyhow::anyhow!( + "Kusto response did not contain a string value" + ))?; if kusto_identity_token.chars().all(char::is_whitespace) { return Err(anyhow::anyhow!("Kusto identity token is empty")); } - Ok(kusto_identity_token) + Ok(kusto_identity_token.to_string()) } /// Fetches the latest Kusto identity token, either retrieving from cache if valid, or by executing a KQL query From e00f9438c720fe1430e921f8dbf52709c81782d7 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Fri, 1 Sep 2023 14:22:18 +0100 Subject: [PATCH 25/44] initial markups --- azure-kusto-ingest/Cargo.toml | 2 +- .../examples/ingest_from_blob.rs | 3 +-- azure-kusto-ingest/src/descriptors.rs | 22 +++++++++---------- .../resource_manager/authorization_context.rs | 10 ++++----- .../src/resource_manager/cache.rs | 2 +- 5 files changed, 18 insertions(+), 21 deletions(-) diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index dfb13d3..a9e9080 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -7,7 +7,7 @@ edition = "2021" [dependencies] azure-kusto-data = {path = "../azure-kusto-data"} -# Azure SDK for Rust crates versions should be kept in sync +# Azure SDK for Rust crates versions must be kept in sync azure_core = "0.14" azure_storage = "0.14" azure_storage_blobs = "0.14" diff --git a/azure-kusto-ingest/examples/ingest_from_blob.rs b/azure-kusto-ingest/examples/ingest_from_blob.rs index 93fcd64..574ad7b 100644 --- a/azure-kusto-ingest/examples/ingest_from_blob.rs +++ b/azure-kusto-ingest/examples/ingest_from_blob.rs @@ -18,7 +18,6 @@ async fn main() -> Result<()> { let user_mi_object_id = ""; // Create a Kusto client with managed identity authentication via the user assigned identity - // Note that this requires let kusto_client = KustoClient::new( ConnectionString::with_managed_identity_auth( cluster_uri, @@ -47,7 +46,7 @@ async fn main() -> Result<()> { // Define the size of the blob if known, this improves ingestion performance as Kusto does not need to access the blob to determine the size let blob_size = 123; // Create the blob descriptor, also specifying that the blob should be accessed using the system assigned managed identity of the Kusto cluster - let blob_descriptor = BlobDescriptor::new(blob_uri.to_string(), Some(blob_size), None) + let blob_descriptor = BlobDescriptor::new(blob_uri, Some(blob_size), None) .with_blob_auth(BlobAuth::SystemAssignedManagedIdentity); queued_ingest_client diff --git a/azure-kusto-ingest/src/descriptors.rs b/azure-kusto-ingest/src/descriptors.rs index 245d6e0..85804cb 100644 --- a/azure-kusto-ingest/src/descriptors.rs +++ b/azure-kusto-ingest/src/descriptors.rs @@ -11,14 +11,14 @@ pub struct BlobDescriptor { } impl BlobDescriptor { - pub fn new(uri: String, size: Option, source_id: Option) -> Self { + pub fn new(uri: impl Into, size: Option, source_id: Option) -> Self { let source_id = match source_id { Some(source_id) => source_id, None => Uuid::new_v4(), }; Self { - uri, + uri: uri.into(), size, source_id, blob_auth: None, @@ -81,7 +81,7 @@ mod tests { #[test] fn blob_descriptor_with_no_auth_modification() { let uri = "https://mystorageaccount.blob.core.windows.net/mycontainer/myblob"; - let blob_descriptor = BlobDescriptor::new(uri.to_string(), None, None); + let blob_descriptor = BlobDescriptor::new(uri, None, None); assert_eq!(blob_descriptor.uri(), uri); } @@ -90,34 +90,34 @@ mod tests { fn blob_descriptor_with_sas_token() { let uri = "https://mystorageaccount.blob.core.windows.net/mycontainer/myblob"; let sas_token = "my_sas_token"; - let blob_descriptor = BlobDescriptor::new(uri.to_string(), None, None) + let blob_descriptor = BlobDescriptor::new(uri, None, None) .with_blob_auth(BlobAuth::SASToken(sas_token.to_string())); - assert_eq!(blob_descriptor.uri(), format!("{}?{}", uri, sas_token)); + assert_eq!(blob_descriptor.uri(), format!("{uri}?{sas_token}")); } #[test] fn blob_descriptor_with_user_assigned_managed_identity() { let uri = "https://mystorageaccount.blob.core.windows.net/mycontainer/myblob"; let object_id = "my_object_id"; - let blob_descriptor = BlobDescriptor::new(uri.to_string(), None, None) + let blob_descriptor = BlobDescriptor::new(uri, None, None) .with_blob_auth(BlobAuth::UserAssignedManagedIdentity(object_id.to_string())); assert_eq!( blob_descriptor.uri(), - format!("{};managed_identity={}", uri, object_id) + format!("{uri};managed_identity={object_id}") ); } #[test] fn blob_descriptor_with_system_assigned_managed_identity() { let uri = "https://mystorageaccount.blob.core.windows.net/mycontainer/myblob"; - let blob_descriptor = BlobDescriptor::new(uri.to_string(), None, None) + let blob_descriptor = BlobDescriptor::new(uri, None, None) .with_blob_auth(BlobAuth::SystemAssignedManagedIdentity); assert_eq!( blob_descriptor.uri(), - format!("{};managed_identity=system", uri) + format!("{uri};managed_identity=system") ); } @@ -125,7 +125,7 @@ mod tests { fn blob_descriptor_with_size() { let uri = "https://mystorageaccount.blob.core.windows.net/mycontainer/myblob"; let size = 123; - let blob_descriptor = BlobDescriptor::new(uri.to_string(), Some(size), None); + let blob_descriptor = BlobDescriptor::new(uri, Some(size), None); assert_eq!(blob_descriptor.size, Some(size)); } @@ -134,7 +134,7 @@ mod tests { fn blob_descriptor_with_source_id() { let uri = "https://mystorageaccount.blob.core.windows.net/mycontainer/myblob"; let source_id = Uuid::new_v4(); - let blob_descriptor = BlobDescriptor::new(uri.to_string(), None, Some(source_id)); + let blob_descriptor = BlobDescriptor::new(uri, None, Some(source_id)); assert_eq!(blob_descriptor.source_id, source_id); } diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs index c0c5a0b..a5a030b 100644 --- a/azure-kusto-ingest/src/resource_manager/authorization_context.rs +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -68,11 +68,9 @@ impl AuthorizationContext { }; // Convert the JSON string into a Rust string - let kusto_identity_token = kusto_identity_token - .as_str() - .ok_or(anyhow::anyhow!( - "Kusto response did not contain a string value" - ))?; + let kusto_identity_token = kusto_identity_token.as_str().ok_or(anyhow::anyhow!( + "Kusto response did not contain a string value" + ))?; if kusto_identity_token.chars().all(char::is_whitespace) { return Err(anyhow::anyhow!("Kusto identity token is empty")); @@ -83,7 +81,7 @@ impl AuthorizationContext { /// Fetches the latest Kusto identity token, either retrieving from cache if valid, or by executing a KQL query pub async fn get(&self) -> Result { - // First, attempt to get the return the token from the cache + // Attempt to get the token from the cache let auth_context_cache = self.auth_context_cache.read().await; if !auth_context_cache.is_expired() { if let Some(inner_value) = auth_context_cache.get() { diff --git a/azure-kusto-ingest/src/resource_manager/cache.rs b/azure-kusto-ingest/src/resource_manager/cache.rs index 061343c..98a5e4e 100644 --- a/azure-kusto-ingest/src/resource_manager/cache.rs +++ b/azure-kusto-ingest/src/resource_manager/cache.rs @@ -28,7 +28,7 @@ impl Cached { } pub fn is_expired(&self) -> bool { - self.last_updated.elapsed() > self.refresh_period + self.last_updated.elapsed() >= self.refresh_period } pub fn update(&mut self, inner: T) { From 15288e51f9d5bd0694b17131b6236def31671b6f Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Fri, 1 Sep 2023 14:26:05 +0100 Subject: [PATCH 26/44] markup: remove accessor methods on ResourceUri --- .../src/resource_manager/resource_uri.rs | 44 +++++++------------ 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/azure-kusto-ingest/src/resource_manager/resource_uri.rs b/azure-kusto-ingest/src/resource_manager/resource_uri.rs index e78ca58..8fb2ef6 100644 --- a/azure-kusto-ingest/src/resource_manager/resource_uri.rs +++ b/azure-kusto-ingest/src/resource_manager/resource_uri.rs @@ -8,24 +8,10 @@ use anyhow::Result; /// Parsing logic of resource URIs as returned by the Kusto management endpoint #[derive(Debug, Clone)] -pub struct ResourceUri { - service_uri: String, - object_name: String, - sas_token: StorageCredentials, -} - -impl ResourceUri { - pub fn service_uri(&self) -> &str { - self.service_uri.as_str() - } - - pub fn object_name(&self) -> &str { - self.object_name.as_str() - } - - pub fn sas_token(&self) -> &StorageCredentials { - &self.sas_token - } +pub(crate) struct ResourceUri { + pub(crate) service_uri: String, + pub(crate) object_name: String, + pub(crate) sas_token: StorageCredentials, } impl TryFrom<&str> for ResourceUri { @@ -73,30 +59,30 @@ impl TryFrom<&str> for ResourceUri { } /// Trait to be used to create an Azure client from a resource URI with configurability of ClientOptions -pub trait ClientFromResourceUri { +pub(crate) trait ClientFromResourceUri { fn create_client(resource_uri: ResourceUri, client_options: ClientOptions) -> Self; } impl ClientFromResourceUri for QueueClient { fn create_client(resource_uri: ResourceUri, client_options: ClientOptions) -> Self { QueueServiceClientBuilder::with_location(azure_storage::CloudLocation::Custom { - uri: resource_uri.service_uri().to_string(), - credentials: resource_uri.sas_token().clone(), + uri: resource_uri.service_uri, + credentials: resource_uri.sas_token, }) .client_options(client_options) .build() - .queue_client(resource_uri.object_name()) + .queue_client(resource_uri.object_name) } } impl ClientFromResourceUri for ContainerClient { fn create_client(resource_uri: ResourceUri, client_options: ClientOptions) -> Self { ClientBuilder::with_location(azure_storage::CloudLocation::Custom { - uri: resource_uri.service_uri().to_string(), - credentials: resource_uri.sas_token().clone(), + uri: resource_uri.service_uri, + credentials: resource_uri.sas_token, }) .client_options(client_options) - .container_client(resource_uri.object_name()) + .container_client(resource_uri.object_name) } } @@ -111,17 +97,17 @@ mod tests { let resource_uri = ResourceUri::try_from(uri).unwrap(); assert_eq!( - resource_uri.service_uri(), + resource_uri.service_uri, "https://storageaccountname.blob.core.windows.com" ); - assert_eq!(resource_uri.object_name(), "containerobjectname"); + assert_eq!(resource_uri.object_name, "containerobjectname"); assert!(matches!( - resource_uri.sas_token(), + resource_uri.sas_token, StorageCredentials::SASToken(_) )); - if let StorageCredentials::SASToken(sas_vec) = resource_uri.sas_token() { + if let StorageCredentials::SASToken(sas_vec) = resource_uri.sas_token { assert_eq!(sas_vec.len(), 1); assert_eq!(sas_vec[0].0, "sas"); assert_eq!(sas_vec[0].1, "token"); From f03b21bcc4c141b74cc731ae178db00869c2f9e1 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Fri, 1 Sep 2023 18:12:42 +0100 Subject: [PATCH 27/44] more markups --- azure-kusto-ingest/src/ingestion_blob_info.rs | 25 ++++++++++++------- azure-kusto-ingest/src/queued_ingest.rs | 2 -- .../src/resource_manager/cache.rs | 22 ++++++++-------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/azure-kusto-ingest/src/ingestion_blob_info.rs b/azure-kusto-ingest/src/ingestion_blob_info.rs index 5b2fb80..a3c60a9 100644 --- a/azure-kusto-ingest/src/ingestion_blob_info.rs +++ b/azure-kusto-ingest/src/ingestion_blob_info.rs @@ -8,15 +8,18 @@ use crate::{ resource_manager::authorization_context::KustoIdentityToken, }; -// Basing the ingestion message on -// https://learn.microsoft.com/en-us/azure/data-explorer/kusto/api/netfx/kusto-ingest-client-rest#ingestion-message-internal-structure +/// Message to be serialized as JSON and sent to the ingestion queue +/// +/// Basing the ingestion message on +/// https://learn.microsoft.com/en-us/azure/data-explorer/kusto/api/netfx/kusto-ingest-client-rest#ingestion-message-internal-structure #[derive(Serialize, Debug)] #[serde(rename_all = "PascalCase")] -pub struct QueuedIngestionMessage { +pub(crate) struct QueuedIngestionMessage { /// Message identifier for this upload id: Uuid, - /// Path (URI) to the blob, including the SAS key granting permissions to read/write/delete it. - /// Permissions are required so that the ingestion service can delete the blob once it has completed ingesting the data. + /// Path (URI) to the blob. + /// This should include any SAS token required to access the blob, or hints to use managed identity auth. + /// Extra permissions are required if the `RetainBlobOnSuccess` option is not true so that the ingestion service can delete the blob once it has completed ingesting the data. blob_path: String, // Name of the Kusto database the data will ingest into database_name: String, @@ -27,18 +30,20 @@ pub struct QueuedIngestionMessage { /// Although this property is optional, it is recommended to provide the size as otherwise the service will access the blob just to retrieve the size. #[serde(skip_serializing_if = "Option::is_none")] raw_data_size: Option, - /// If set to `true`, the blob won't be deleted once ingestion is successfully completed. Default is `false` + /// If set to `true`, the blob won't be deleted once ingestion is successfully completed. + /// Default is `false` when this property is not specified. Note that this has implications on permissions required against the blob. #[serde(skip_serializing_if = "Option::is_none")] retain_blob_on_success: Option, - /// If set to `true`, any server side aggregation will be skipped. Default is `false` + /// If set to `true`, any server side aggregation will be skipped - thus overriding the batching policy. Default is `false`. #[serde(skip_serializing_if = "Option::is_none")] flush_immediately: Option, source_message_creation_time: DateTime, + // Extra properties added to the ingestion command additional_properties: AdditionalProperties, } impl QueuedIngestionMessage { - pub fn new( + pub(crate) fn new( blob_descriptor: &BlobDescriptor, ingestion_properties: &IngestionProperties, authorization_context: KustoIdentityToken, @@ -62,9 +67,11 @@ impl QueuedIngestionMessage { } } -// The additional properties struct is modelled on: https://learn.microsoft.com/en-us/azure/data-explorer/ingestion-properties +/// Additional properties to be added to the ingestion message +/// This struct is modelled on: https://learn.microsoft.com/en-us/azure/data-explorer/ingestion-properties #[derive(Serialize, Clone, Debug)] struct AdditionalProperties { + /// Authorization string obtained from Kusto to allow for ingestion #[serde(rename = "authorizationContext")] authorization_context: KustoIdentityToken, #[serde(rename = "format")] diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index 57e78c9..c9ad84a 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -45,8 +45,6 @@ impl QueuedIngestClient { blob_descriptor: BlobDescriptor, ingestion_properties: IngestionProperties, ) -> Result<()> { - // The queues returned here should ideally be the storage queue client from azure-storage-queue - // As such, it may be better for ResourceManager to return a struct that contains the storage queue client let ingestion_queues = self .resource_manager .secured_ready_for_aggregation_queues() diff --git a/azure-kusto-ingest/src/resource_manager/cache.rs b/azure-kusto-ingest/src/resource_manager/cache.rs index 98a5e4e..eea665a 100644 --- a/azure-kusto-ingest/src/resource_manager/cache.rs +++ b/azure-kusto-ingest/src/resource_manager/cache.rs @@ -47,34 +47,34 @@ mod tests { #[test] fn test_cached_get() { let value = "hello"; - let cached = Cached::new(value.to_string(), Duration::from_secs(60)); + let cached_string = Cached::new(value.to_string(), Duration::from_secs(60)); - assert_eq!(cached.get(), value); + assert_eq!(cached_string.get(), value); } #[test] fn test_cached_is_expired() { let value = "hello"; - let mut cached = Cached::new(value.to_string(), Duration::from_secs(60)); + let mut cached_string = Cached::new(value.to_string(), Duration::from_secs(60)); - assert!(!cached.is_expired()); + assert!(!cached_string.is_expired()); - cached.last_updated = Instant::now() - Duration::from_secs(61); + cached_string.last_updated = Instant::now() - Duration::from_secs(61); - assert!(cached.is_expired()); + assert!(cached_string.is_expired()); } #[test] fn test_cached_update() { let value = "hello"; - let mut cached = Cached::new(value.to_string(), Duration::from_secs(60)); + let mut cached_string = Cached::new(value.to_string(), Duration::from_secs(60)); - assert_eq!(cached.get(), value); + assert_eq!(cached_string.get(), value); let new_value = "world"; - cached.update(new_value.to_string()); + cached_string.update(new_value.to_string()); - assert!(!cached.is_expired()); - assert_eq!(cached.get(), new_value); + assert!(!cached_string.is_expired()); + assert_eq!(cached_string.get(), new_value); } } From 0387e31addb67941b45591a4c6e5563baf3adf69 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Mon, 4 Sep 2023 16:17:55 +0100 Subject: [PATCH 28/44] renaming --- azure-kusto-ingest/src/ingestion_blob_info.rs | 6 +++--- azure-kusto-ingest/src/queued_ingest.rs | 5 +---- azure-kusto-ingest/src/resource_manager.rs | 8 ++------ .../src/resource_manager/ingest_client_resources.rs | 12 +++++++----- 4 files changed, 13 insertions(+), 18 deletions(-) diff --git a/azure-kusto-ingest/src/ingestion_blob_info.rs b/azure-kusto-ingest/src/ingestion_blob_info.rs index a3c60a9..dc63d1b 100644 --- a/azure-kusto-ingest/src/ingestion_blob_info.rs +++ b/azure-kusto-ingest/src/ingestion_blob_info.rs @@ -17,7 +17,7 @@ use crate::{ pub(crate) struct QueuedIngestionMessage { /// Message identifier for this upload id: Uuid, - /// Path (URI) to the blob. + /// Path (URI) to the blob. /// This should include any SAS token required to access the blob, or hints to use managed identity auth. /// Extra permissions are required if the `RetainBlobOnSuccess` option is not true so that the ingestion service can delete the blob once it has completed ingesting the data. blob_path: String, @@ -30,11 +30,11 @@ pub(crate) struct QueuedIngestionMessage { /// Although this property is optional, it is recommended to provide the size as otherwise the service will access the blob just to retrieve the size. #[serde(skip_serializing_if = "Option::is_none")] raw_data_size: Option, - /// If set to `true`, the blob won't be deleted once ingestion is successfully completed. + /// If set to `true`, the blob won't be deleted once ingestion is successfully completed. /// Default is `false` when this property is not specified. Note that this has implications on permissions required against the blob. #[serde(skip_serializing_if = "Option::is_none")] retain_blob_on_success: Option, - /// If set to `true`, any server side aggregation will be skipped - thus overriding the batching policy. Default is `false`. + /// If set to `true`, any server side aggregation will be skipped - thus overriding the batching policy. Default is `false`. #[serde(skip_serializing_if = "Option::is_none")] flush_immediately: Option, source_message_creation_time: DateTime, diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index c9ad84a..b4f1def 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -45,10 +45,7 @@ impl QueuedIngestClient { blob_descriptor: BlobDescriptor, ingestion_properties: IngestionProperties, ) -> Result<()> { - let ingestion_queues = self - .resource_manager - .secured_ready_for_aggregation_queues() - .await?; + let ingestion_queues = self.resource_manager.ingestion_queues().await?; // println!("queues: {:#?}", ingestion_queues); let auth_context = self.resource_manager.authorization_context().await?; diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index d1524c2..c1c8b23 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -38,12 +38,8 @@ impl ResourceManager { } /// Returns the latest [QueueClient]s ready for posting ingestion messages to - pub async fn secured_ready_for_aggregation_queues(&self) -> Result> { - Ok(self - .ingest_client_resources - .get() - .await? - .secured_ready_for_aggregation_queues) + pub async fn ingestion_queues(&self) -> Result> { + Ok(self.ingest_client_resources.get().await?.ingestion_queues) } /// Returns the latest [ContainerClient]s for temporary storage, where local data can be uploaded to Kusto-owned storage prior to ingestion diff --git a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs index cd155a5..a023fba 100644 --- a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs +++ b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs @@ -67,8 +67,8 @@ where /// Storage of the clients required for ingestion #[derive(Debug, Clone)] pub struct InnerIngestClientResources { - pub secured_ready_for_aggregation_queues: Vec, - pub temp_storage: Vec, + pub ingestion_queues: Vec, + pub temp_storage_containers: Vec, } impl TryFrom<(&TableV1, &QueuedIngestClientOptions)> for InnerIngestClientResources { @@ -81,11 +81,14 @@ impl TryFrom<(&TableV1, &QueuedIngestClientOptions)> for InnerIngestClientResour let temp_storage = get_resource_by_name(table, "TempStorage".to_string())?; Ok(Self { - secured_ready_for_aggregation_queues: create_clients_vec( + ingestion_queues: create_clients_vec( &secured_ready_for_aggregation_queues, &client_options.queue_service, ), - temp_storage: create_clients_vec(&temp_storage, &client_options.blob_service), + temp_storage_containers: create_clients_vec( + &temp_storage, + &client_options.blob_service, + ), }) } } @@ -105,7 +108,6 @@ impl IngestClientResources { } } - /// Gets the latest resources from Kusto, updating the cached resources if they are expired /// Executes a KQL management query that retrieves resource URIs for the various Azure resources used for ingestion async fn execute_kql_mgmt_query(&self) -> Result { let results = self From 9773174426eeacf5d22cc7bef79f83fdf2c21751 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Mon, 4 Sep 2023 16:48:19 +0100 Subject: [PATCH 29/44] convert println into initial debug logs --- azure-kusto-ingest/Cargo.toml | 1 + azure-kusto-ingest/src/queued_ingest.rs | 17 +++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index a9e9080..b2dcd99 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -19,5 +19,6 @@ rand = "0.8" serde = { version = "1", features = ["serde_derive"] } serde_json = "1" tokio = { version = "1", features = ["full"] } +tracing = { version = "0.1", default-features = false, features = ["std"] } url = "2" uuid = {version = "1", features = ["v4", "serde"]} diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index b4f1def..43cc7ac 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -6,6 +6,7 @@ use azure_kusto_data::prelude::KustoClient; use rand::rngs::StdRng; use rand::seq::SliceRandom; use rand::SeedableRng; +use tracing::debug; use crate::client_options::QueuedIngestClientOptions; use crate::descriptors::BlobDescriptor; @@ -46,31 +47,31 @@ impl QueuedIngestClient { ingestion_properties: IngestionProperties, ) -> Result<()> { let ingestion_queues = self.resource_manager.ingestion_queues().await?; - // println!("queues: {:#?}", ingestion_queues); + debug!("ingestion queues: {:#?}", ingestion_queues); let auth_context = self.resource_manager.authorization_context().await?; - // println!("auth_context: {:#?}\n", auth_context); + debug!("auth_context: {:#?}\n", auth_context); let message = QueuedIngestionMessage::new(&blob_descriptor, &ingestion_properties, auth_context); - // println!("message as struct: {:#?}\n", message); + debug!("message: {:#?}\n", message); // Pick a random queue from the queue clients returned by the resource manager let mut rng: StdRng = SeedableRng::from_entropy(); let queue_client = ingestion_queues .choose(&mut rng) .ok_or(anyhow::anyhow!("Failed to pick a random queue"))?; - // println!("queue_client: {:#?}\n", queue_client); + debug!("randomly seeded queue_client: {:#?}\n", queue_client); let message = serde_json::to_string(&message).unwrap(); - // println!("message as string: {}\n", message); + debug!("message as string: {}\n", message); // Base64 encode the ingestion message let message = base64::encode(&message); - // println!("message as base64 encoded string: {}\n", message); + debug!("message as base64 encoded string: {}\n", message); - let _resp = queue_client.put_message(message).await?; - // println!("resp: {:#?}\n", resp); + let resp = queue_client.put_message(message).await?; + debug!("resp: {:#?}\n", resp); Ok(()) } From 65b94830b58df929d2abd6f0514852c0714bc596 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Mon, 4 Sep 2023 16:58:03 +0100 Subject: [PATCH 30/44] cleanup commented code related to container clients --- azure-kusto-ingest/src/resource_manager.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index c1c8b23..ac8f1bd 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -42,11 +42,7 @@ impl ResourceManager { Ok(self.ingest_client_resources.get().await?.ingestion_queues) } - /// Returns the latest [ContainerClient]s for temporary storage, where local data can be uploaded to Kusto-owned storage prior to ingestion - // pub async fn temp_storage(&self) -> Result> { - // Ok(self.ingest_client_resources.get().await?.temp_storage) - // } - + /// Returns the latest [KustoIdentityToken] to be added as an authorization context to ingestion messages pub async fn authorization_context(&self) -> Result { self.authorization_context.get().await } From af220f6d16d05917f803f3e4aac4f61664eb7190 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Mon, 4 Sep 2023 17:14:01 +0100 Subject: [PATCH 31/44] update function names --- .../src/resource_manager/authorization_context.rs | 4 ++-- .../src/resource_manager/ingest_client_resources.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs index a5a030b..7355deb 100644 --- a/azure-kusto-ingest/src/resource_manager/authorization_context.rs +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -27,7 +27,7 @@ impl AuthorizationContext { } /// Executes a KQL query to get the Kusto identity token from the management endpoint - async fn execute_kql_mgmt_query(&self) -> Result { + async fn query_kusto_identity_token(&self) -> Result { let results = self .client .execute_command("NetDefaultDB", ".get kusto identity token", None) @@ -101,7 +101,7 @@ impl AuthorizationContext { } // Fetch new token from Kusto, update the cache, and return the token - let token = self.execute_kql_mgmt_query().await?; + let token = self.query_kusto_identity_token().await?; auth_context_cache.update(Some(token.clone())); Ok(token) diff --git a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs index a023fba..7cb7b87 100644 --- a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs +++ b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs @@ -109,7 +109,7 @@ impl IngestClientResources { } /// Executes a KQL management query that retrieves resource URIs for the various Azure resources used for ingestion - async fn execute_kql_mgmt_query(&self) -> Result { + async fn query_ingestion_resources(&self) -> Result { let results = self .client .execute_command("NetDefaultDB", ".get ingestion resources", None) @@ -142,7 +142,7 @@ impl IngestClientResources { } } - let new_resources = self.execute_kql_mgmt_query().await?; + let new_resources = self.query_ingestion_resources().await?; resources.update(Some(new_resources.clone())); Ok(new_resources) From 9f0244a8cbe848b210073df85e48da4dc5b88acf Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Tue, 5 Sep 2023 09:07:01 +0100 Subject: [PATCH 32/44] renames and add warning about using ingestion endpoint --- azure-kusto-ingest/src/queued_ingest.rs | 15 ++++++++------- .../src/resource_manager/authorization_context.rs | 8 ++++---- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index 43cc7ac..13e9fff 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -21,23 +21,24 @@ pub struct QueuedIngestClient { } impl QueuedIngestClient { - /// Creates a new client from the given [KustoClient] + /// Creates a new client from the given [KustoClient]. + /// + /// **WARNING**: the [KustoClient] must be created with a connection string that points to the ingestion endpoint pub fn new(kusto_client: KustoClient) -> Self { Self::new_with_client_options(kusto_client, QueuedIngestClientOptions::default()) } /// Creates a new client from the given [KustoClient] and [QueuedIngestClientOptions] /// This allows for customisation of the [ClientOptions] used for the storage clients + /// + /// **WARNING**: the [KustoClient] must be created with a connection string that points to the ingestion endpoint pub fn new_with_client_options( kusto_client: KustoClient, options: QueuedIngestClientOptions, ) -> Self { - // TODO: add a validation check that the client provided is against the ingestion endpoint - // kusto_client.management_url() - - let resource_manager = Arc::new(ResourceManager::new(kusto_client, options)); - - Self { resource_manager } + Self { + resource_manager: Arc::new(ResourceManager::new(kusto_client, options)), + } } /// Ingest a file into Kusto from Azure Blob Storage diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs index 7355deb..adb4894 100644 --- a/azure-kusto-ingest/src/resource_manager/authorization_context.rs +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -84,8 +84,8 @@ impl AuthorizationContext { // Attempt to get the token from the cache let auth_context_cache = self.auth_context_cache.read().await; if !auth_context_cache.is_expired() { - if let Some(inner_value) = auth_context_cache.get() { - return Ok(inner_value.clone()); + if let Some(token) = auth_context_cache.get() { + return Ok(token.clone()); } } // Drop the read lock and get a write lock to refresh the token @@ -95,8 +95,8 @@ impl AuthorizationContext { // Again attempt to return from cache, check is done in case another thread // refreshed the token while we were waiting on the write lock if !auth_context_cache.is_expired() { - if let Some(inner_value) = auth_context_cache.get() { - return Ok(inner_value.clone()); + if let Some(token) = auth_context_cache.get() { + return Ok(token.clone()); } } From 26dc69a5d607de382fb5c7b8a1efd56a58010388 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Tue, 5 Sep 2023 11:00:23 +0100 Subject: [PATCH 33/44] comments --- azure-kusto-ingest/src/descriptors.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/azure-kusto-ingest/src/descriptors.rs b/azure-kusto-ingest/src/descriptors.rs index 85804cb..44cfc38 100644 --- a/azure-kusto-ingest/src/descriptors.rs +++ b/azure-kusto-ingest/src/descriptors.rs @@ -11,6 +11,12 @@ pub struct BlobDescriptor { } impl BlobDescriptor { + /// Create a new BlobDescriptor. + /// + /// Parameters: + /// - `uri`: the uri of the blob to ingest from, note you can use the optional helper method `with_blob_auth` to add authentication information to the uri + /// - `size`: although the size is not required, providing it is recommended as it allows Kusto to better plan the ingestion process + /// - `source_id`: optional, useful if tracking ingestion status, if not provided, a random uuid will be generated pub fn new(uri: impl Into, size: Option, source_id: Option) -> Self { let source_id = match source_id { Some(source_id) => source_id, @@ -31,7 +37,7 @@ impl BlobDescriptor { self } - /// Returns the uri with the authentication information added, ready to be serialized into the ingestion message + /// Returns the uri with the authentication information concatenated, ready to be serialized into the ingestion message pub(crate) fn uri(&self) -> String { match &self.blob_auth { Some(BlobAuth::SASToken(sas_token)) => { @@ -59,6 +65,7 @@ pub enum BlobAuth { SystemAssignedManagedIdentity, } +/// Custom impl of Debug to avoid leaking sensitive information impl std::fmt::Debug for BlobAuth { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { From d477e2b0b5cc3ce623e4dee8f4794c5cdacbd502 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Tue, 5 Sep 2023 11:21:26 +0100 Subject: [PATCH 34/44] markups --- azure-kusto-ingest/src/resource_manager.rs | 1 + .../resource_manager/authorization_context.rs | 50 +++++++++---------- .../src/resource_manager/cache.rs | 2 +- .../ingest_client_resources.rs | 21 ++------ .../src/resource_manager/utils.rs | 15 ++++++ 5 files changed, 45 insertions(+), 44 deletions(-) create mode 100644 azure-kusto-ingest/src/resource_manager/utils.rs diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index ac8f1bd..a91c041 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -4,6 +4,7 @@ pub mod authorization_context; pub mod cache; pub mod ingest_client_resources; pub mod resource_uri; +pub mod utils; use anyhow::Result; use azure_kusto_data::prelude::KustoClient; diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs index adb4894..d8709d6 100644 --- a/azure-kusto-ingest/src/resource_manager/authorization_context.rs +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -4,25 +4,26 @@ use anyhow::Result; use azure_kusto_data::prelude::KustoClient; use tokio::sync::RwLock; -use super::cache::{Cached, Refreshing}; +use super::cache::{Cached, ThreadSafeCachedValue}; +use super::utils::get_column_index; use super::RESOURCE_REFRESH_PERIOD; -pub type KustoIdentityToken = String; +pub(crate) type KustoIdentityToken = String; /// Logic to obtain a Kusto identity token from the management endpoint. This auth token is a temporary token #[derive(Debug, Clone)] -pub struct AuthorizationContext { +pub(crate) struct AuthorizationContext { /// A client against a Kusto ingestion cluster client: KustoClient, /// Cache of the Kusto identity token - auth_context_cache: Refreshing>, + token_cache: ThreadSafeCachedValue>, } impl AuthorizationContext { pub fn new(client: KustoClient) -> Self { Self { client, - auth_context_cache: Arc::new(RwLock::new(Cached::new(None, RESOURCE_REFRESH_PERIOD))), + token_cache: Arc::new(RwLock::new(Cached::new(None, RESOURCE_REFRESH_PERIOD))), } } @@ -45,64 +46,59 @@ impl AuthorizationContext { }; // Check that a column in this table actually exists called `AuthorizationContext` - let index = table - .columns - .iter() - .position(|c| c.column_name == "AuthorizationContext") - .ok_or(anyhow::anyhow!( - "AuthorizationContext column is missing in the table" - ))?; + let index = get_column_index(table, "AuthorizationContext")?; // Check that there is only 1 row in the table, and that the value in the first row at the given index is not empty - let kusto_identity_token = match &table.rows[..] { + let token = match &table.rows[..] { [row] => row.get(index).ok_or(anyhow::anyhow!( "Kusto response did not contain a value in the first row at position {}", index ))?, _ => { return Err(anyhow::anyhow!( - "Kusto Expected 1 row in results, found {}", + "Kusto expected 1 row in results, found {}", table.rows.len() )) } }; // Convert the JSON string into a Rust string - let kusto_identity_token = kusto_identity_token.as_str().ok_or(anyhow::anyhow!( - "Kusto response did not contain a string value" + let token = token.as_str().ok_or(anyhow::anyhow!( + "Kusto response did not contain a string value: {:?}", + token ))?; - if kusto_identity_token.chars().all(char::is_whitespace) { + if token.chars().all(char::is_whitespace) { return Err(anyhow::anyhow!("Kusto identity token is empty")); } - Ok(kusto_identity_token.to_string()) + Ok(token.to_string()) } /// Fetches the latest Kusto identity token, either retrieving from cache if valid, or by executing a KQL query - pub async fn get(&self) -> Result { + pub(crate) async fn get(&self) -> Result { // Attempt to get the token from the cache - let auth_context_cache = self.auth_context_cache.read().await; - if !auth_context_cache.is_expired() { - if let Some(token) = auth_context_cache.get() { + let token_cache = self.token_cache.read().await; + if !token_cache.is_expired() { + if let Some(token) = token_cache.get() { return Ok(token.clone()); } } // Drop the read lock and get a write lock to refresh the token - drop(auth_context_cache); - let mut auth_context_cache = self.auth_context_cache.write().await; + drop(token_cache); + let mut token_cache = self.token_cache.write().await; // Again attempt to return from cache, check is done in case another thread // refreshed the token while we were waiting on the write lock - if !auth_context_cache.is_expired() { - if let Some(token) = auth_context_cache.get() { + if !token_cache.is_expired() { + if let Some(token) = token_cache.get() { return Ok(token.clone()); } } // Fetch new token from Kusto, update the cache, and return the token let token = self.query_kusto_identity_token().await?; - auth_context_cache.update(Some(token.clone())); + token_cache.update(Some(token.clone())); Ok(token) } diff --git a/azure-kusto-ingest/src/resource_manager/cache.rs b/azure-kusto-ingest/src/resource_manager/cache.rs index eea665a..2170c31 100644 --- a/azure-kusto-ingest/src/resource_manager/cache.rs +++ b/azure-kusto-ingest/src/resource_manager/cache.rs @@ -37,7 +37,7 @@ impl Cached { } } -pub type Refreshing = Arc>>; +pub type ThreadSafeCachedValue = Arc>>; #[cfg(test)] mod tests { diff --git a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs index 7cb7b87..d1a85d2 100644 --- a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs +++ b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs @@ -3,8 +3,9 @@ use std::sync::Arc; use crate::client_options::QueuedIngestClientOptions; use super::{ - cache::{Cached, Refreshing}, + cache::{Cached, ThreadSafeCachedValue}, resource_uri::{ClientFromResourceUri, ResourceUri}, + utils::get_column_index, RESOURCE_REFRESH_PERIOD, }; use anyhow::Result; @@ -14,19 +15,6 @@ use azure_storage_blobs::prelude::ContainerClient; use azure_storage_queues::QueueClient; use tokio::sync::RwLock; -/// Helper to get a column index from a table -// TODO: this could be moved upstream into Kusto Data - would likely result in a change to the API of this function to return an Option -fn get_column_index(table: &TableV1, column_name: &str) -> Result { - table - .columns - .iter() - .position(|c| c.column_name == column_name) - .ok_or(anyhow::anyhow!( - "{} column is missing in the table", - column_name - )) -} - /// Helper to get a resource URI from a table, erroring if there are no resources of the given name fn get_resource_by_name(table: &TableV1, resource_name: String) -> Result> { let storage_root_index = get_column_index(table, "StorageRoot")?; @@ -38,7 +26,8 @@ fn get_resource_by_name(table: &TableV1, resource_name: String) -> Result for InnerIngestClientResour pub struct IngestClientResources { client: KustoClient, - resources: Refreshing>, + resources: ThreadSafeCachedValue>, client_options: QueuedIngestClientOptions, } diff --git a/azure-kusto-ingest/src/resource_manager/utils.rs b/azure-kusto-ingest/src/resource_manager/utils.rs new file mode 100644 index 0000000..62a7167 --- /dev/null +++ b/azure-kusto-ingest/src/resource_manager/utils.rs @@ -0,0 +1,15 @@ +use anyhow::Result; +use azure_kusto_data::models::TableV1; + +/// Helper to get a column index from a table +// TODO: this could be moved upstream into Kusto Data - would likely result in a change to the API of this function to return an Option +pub fn get_column_index(table: &TableV1, column_name: &str) -> Result { + table + .columns + .iter() + .position(|c| c.column_name == column_name) + .ok_or(anyhow::anyhow!( + "{} column is missing in the table", + column_name + )) +} From 909bd03645c9c70f72850d715c2b29bbcca57044 Mon Sep 17 00:00:00 2001 From: Krishan Mistry Date: Tue, 5 Sep 2023 11:48:52 +0100 Subject: [PATCH 35/44] use env vars in example --- .../examples/ingest_from_blob.rs | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/azure-kusto-ingest/examples/ingest_from_blob.rs b/azure-kusto-ingest/examples/ingest_from_blob.rs index 574ad7b..2948932 100644 --- a/azure-kusto-ingest/examples/ingest_from_blob.rs +++ b/azure-kusto-ingest/examples/ingest_from_blob.rs @@ -1,3 +1,5 @@ +use std::env; + use anyhow::Result; use azure_kusto_data::prelude::{ConnectionString, KustoClient, KustoClientOptions}; use azure_kusto_ingest::data_format::DataFormat; @@ -5,7 +7,8 @@ use azure_kusto_ingest::descriptors::{BlobAuth, BlobDescriptor}; use azure_kusto_ingest::ingestion_properties::IngestionProperties; use azure_kusto_ingest::queued_ingest::QueuedIngestClient; -/// Example of ingesting data into Kusto from Azure Blob Storage using managed identities +/// Example of ingesting data into Kusto from Azure Blob Storage using managed identities. +/// This example enforces that the Kusto cluster has a system assigned managed identity with access to the storage account /// /// There are some steps that need to be taken to allow for managed identities to work: /// - Permissions as the ingestor to initiate ingestion @@ -14,14 +17,14 @@ use azure_kusto_ingest::queued_ingest::QueuedIngestClient; /// https://learn.microsoft.com/en-us/azure/data-explorer/ingest-data-managed-identity #[tokio::main] async fn main() -> Result<()> { - let cluster_uri = "https://ingest-..kusto.windows.net"; - let user_mi_object_id = ""; + let cluster_ingest_uri = env::var("KUSTO_INGEST_URI").expect("Must define KUSTO_INGEST_URI"); + let user_mi_object_id = env::var("KUSTO_USER_MI_OBJECT_ID").expect("Must define KUSTO_USER_MI_OBJECT_ID"); // Create a Kusto client with managed identity authentication via the user assigned identity let kusto_client = KustoClient::new( ConnectionString::with_managed_identity_auth( - cluster_uri, - Some(user_mi_object_id.to_string()), + cluster_ingest_uri, + Some(user_mi_object_id), ), KustoClientOptions::default(), )?; @@ -31,8 +34,8 @@ async fn main() -> Result<()> { // Define ingestion properties let ingestion_properties = IngestionProperties { - database_name: "".into(), - table_name: "".into(), + database_name: env::var("KUSTO_DATABASE_NAME").expect("Must define KUSTO_DATABASE_NAME"), + table_name: env::var("KUSTO_TABLE_NAME").expect("Must define KUSTO_TABLE_NAME"), // Don't delete the blob on successful ingestion retain_blob_on_success: Some(true), // File format of the blob is Parquet @@ -42,11 +45,15 @@ async fn main() -> Result<()> { }; // Define the blob to ingest from - let blob_uri = "https://.blob.core.windows.net/"; + let blob_uri = env::var("BLOB_URI").expect("Must define BLOB_URI"); // Define the size of the blob if known, this improves ingestion performance as Kusto does not need to access the blob to determine the size - let blob_size = 123; + let blob_size: Option = match env::var("BLOB_SIZE") { + Ok(blob_size) => Some(blob_size.parse().expect("BLOB_SIZE must be a valid u64")), + Err(_) => None, + }; + // Create the blob descriptor, also specifying that the blob should be accessed using the system assigned managed identity of the Kusto cluster - let blob_descriptor = BlobDescriptor::new(blob_uri, Some(blob_size), None) + let blob_descriptor = BlobDescriptor::new(blob_uri, blob_size, None) .with_blob_auth(BlobAuth::SystemAssignedManagedIdentity); queued_ingest_client From 44f9e0822e8bb5fa6af59177160d618f147a7f0e Mon Sep 17 00:00:00 2001 From: Krishan Date: Tue, 12 Sep 2023 15:53:25 +0100 Subject: [PATCH 36/44] Update Azure SDK to 0.15 and remove time 0.1.45 (#3) --- azure-kusto-ingest/Cargo.toml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index b2dcd99..435699f 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -8,13 +8,13 @@ edition = "2021" [dependencies] azure-kusto-data = {path = "../azure-kusto-data"} # Azure SDK for Rust crates versions must be kept in sync -azure_core = "0.14" -azure_storage = "0.14" -azure_storage_blobs = "0.14" -azure_storage_queues = "0.14" +azure_core = "0.15" +azure_storage = "0.15" +azure_storage_blobs = "0.15" +azure_storage_queues = "0.15" anyhow = "1" -chrono = { version = "0.4", features = ["serde"] } +chrono = { version = "0.4", default-features = false, features = ["serde"] } rand = "0.8" serde = { version = "1", features = ["serde_derive"] } serde_json = "1" From 1f9cfa57615544c3e298b1c515475aa3bb7afeeb Mon Sep 17 00:00:00 2001 From: samtarver <133898687+samtarver@users.noreply.github.com> Date: Tue, 19 Sep 2023 10:53:58 +0100 Subject: [PATCH 37/44] Update azure deps to 0.15 --- azure-kusto-data/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-kusto-data/Cargo.toml b/azure-kusto-data/Cargo.toml index 7bd53f7..30bf3de 100644 --- a/azure-kusto-data/Cargo.toml +++ b/azure-kusto-data/Cargo.toml @@ -15,11 +15,11 @@ categories = ["api-bindings"] [dependencies] arrow-array = { version = "42", optional = true } arrow-schema = { version = "42", optional = true } -azure_core = { version = "0.13", features = [ +azure_core = { version = "0.15", features = [ "enable_reqwest", "enable_reqwest_gzip", ] } -azure_identity = "0.13.0" +azure_identity = "0.15" async-trait = "0.1.64" async-convert = "1.0.0" bytes = "1.4" From 74dfd6a28d166d639276c614303ff3301953230c Mon Sep 17 00:00:00 2001 From: Krishan Date: Mon, 6 Nov 2023 19:42:22 +0000 Subject: [PATCH 38/44] Azure SDK deps to 0.16 (#5) * Azure SDK for Rust dependencies to 0.16 * changes --- azure-kusto-data/Cargo.toml | 4 +-- azure-kusto-ingest/Cargo.toml | 8 ++--- .../examples/ingest_from_blob.rs | 8 ++--- .../src/resource_manager/resource_uri.rs | 35 ++++++++++++------- 4 files changed, 31 insertions(+), 24 deletions(-) diff --git a/azure-kusto-data/Cargo.toml b/azure-kusto-data/Cargo.toml index 30bf3de..6bed2bc 100644 --- a/azure-kusto-data/Cargo.toml +++ b/azure-kusto-data/Cargo.toml @@ -15,11 +15,11 @@ categories = ["api-bindings"] [dependencies] arrow-array = { version = "42", optional = true } arrow-schema = { version = "42", optional = true } -azure_core = { version = "0.15", features = [ +azure_core = { version = "0.16", features = [ "enable_reqwest", "enable_reqwest_gzip", ] } -azure_identity = "0.15" +azure_identity = "0.16" async-trait = "0.1.64" async-convert = "1.0.0" bytes = "1.4" diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index 435699f..c695da9 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -8,10 +8,10 @@ edition = "2021" [dependencies] azure-kusto-data = {path = "../azure-kusto-data"} # Azure SDK for Rust crates versions must be kept in sync -azure_core = "0.15" -azure_storage = "0.15" -azure_storage_blobs = "0.15" -azure_storage_queues = "0.15" +azure_core = "0.16" +azure_storage = "0.16" +azure_storage_blobs = "0.16" +azure_storage_queues = "0.16" anyhow = "1" chrono = { version = "0.4", default-features = false, features = ["serde"] } diff --git a/azure-kusto-ingest/examples/ingest_from_blob.rs b/azure-kusto-ingest/examples/ingest_from_blob.rs index 2948932..ceae4b9 100644 --- a/azure-kusto-ingest/examples/ingest_from_blob.rs +++ b/azure-kusto-ingest/examples/ingest_from_blob.rs @@ -18,14 +18,12 @@ use azure_kusto_ingest::queued_ingest::QueuedIngestClient; #[tokio::main] async fn main() -> Result<()> { let cluster_ingest_uri = env::var("KUSTO_INGEST_URI").expect("Must define KUSTO_INGEST_URI"); - let user_mi_object_id = env::var("KUSTO_USER_MI_OBJECT_ID").expect("Must define KUSTO_USER_MI_OBJECT_ID"); + let user_mi_object_id = + env::var("KUSTO_USER_MI_OBJECT_ID").expect("Must define KUSTO_USER_MI_OBJECT_ID"); // Create a Kusto client with managed identity authentication via the user assigned identity let kusto_client = KustoClient::new( - ConnectionString::with_managed_identity_auth( - cluster_ingest_uri, - Some(user_mi_object_id), - ), + ConnectionString::with_managed_identity_auth(cluster_ingest_uri, Some(user_mi_object_id)), KustoClientOptions::default(), )?; diff --git a/azure-kusto-ingest/src/resource_manager/resource_uri.rs b/azure-kusto-ingest/src/resource_manager/resource_uri.rs index 8fb2ef6..41f2287 100644 --- a/azure-kusto-ingest/src/resource_manager/resource_uri.rs +++ b/azure-kusto-ingest/src/resource_manager/resource_uri.rs @@ -65,10 +65,12 @@ pub(crate) trait ClientFromResourceUri { impl ClientFromResourceUri for QueueClient { fn create_client(resource_uri: ResourceUri, client_options: ClientOptions) -> Self { - QueueServiceClientBuilder::with_location(azure_storage::CloudLocation::Custom { - uri: resource_uri.service_uri, - credentials: resource_uri.sas_token, - }) + QueueServiceClientBuilder::with_location( + azure_storage::CloudLocation::Custom { + uri: resource_uri.service_uri, + }, + resource_uri.sas_token, + ) .client_options(client_options) .build() .queue_client(resource_uri.object_name) @@ -77,10 +79,12 @@ impl ClientFromResourceUri for QueueClient { impl ClientFromResourceUri for ContainerClient { fn create_client(resource_uri: ResourceUri, client_options: ClientOptions) -> Self { - ClientBuilder::with_location(azure_storage::CloudLocation::Custom { - uri: resource_uri.service_uri, - credentials: resource_uri.sas_token, - }) + ClientBuilder::with_location( + azure_storage::CloudLocation::Custom { + uri: resource_uri.service_uri, + }, + resource_uri.sas_token, + ) .client_options(client_options) .container_client(resource_uri.object_name) } @@ -88,6 +92,8 @@ impl ClientFromResourceUri for ContainerClient { #[cfg(test)] mod tests { + use azure_storage::StorageCredentialsInner; + use super::*; use std::convert::TryFrom; @@ -102,12 +108,15 @@ mod tests { ); assert_eq!(resource_uri.object_name, "containerobjectname"); + let storage_credential_inner = std::sync::Arc::into_inner(resource_uri.sas_token.0) + .unwrap() + .into_inner(); assert!(matches!( - resource_uri.sas_token, - StorageCredentials::SASToken(_) + storage_credential_inner, + StorageCredentialsInner::SASToken(_) )); - if let StorageCredentials::SASToken(sas_vec) = resource_uri.sas_token { + if let StorageCredentialsInner::SASToken(sas_vec) = storage_credential_inner { assert_eq!(sas_vec.len(), 1); assert_eq!(sas_vec[0].0, "sas"); assert_eq!(sas_vec[0].1, "token"); @@ -154,7 +163,7 @@ mod tests { let resource_uri = ResourceUri { service_uri: "https://mystorageaccount.queue.core.windows.net".to_string(), object_name: "queuename".to_string(), - sas_token: StorageCredentials::SASToken(vec![("sas".to_string(), "token".to_string())]), + sas_token: StorageCredentials::sas_token("sas=token").unwrap(), }; let client_options = ClientOptions::default(); @@ -168,7 +177,7 @@ mod tests { let resource_uri = ResourceUri { service_uri: "https://mystorageaccount.blob.core.windows.net".to_string(), object_name: "containername".to_string(), - sas_token: StorageCredentials::SASToken(vec![("sas".to_string(), "token".to_string())]), + sas_token: StorageCredentials::sas_token("sas=token").unwrap(), }; let client_options = ClientOptions::default(); From 6145aadb7b005ec027a8a63d4be541050fab1900 Mon Sep 17 00:00:00 2001 From: Krishan Date: Tue, 7 Nov 2023 12:23:18 +0000 Subject: [PATCH 39/44] Azure SDK deps to 0.17 (#6) --- azure-kusto-data/Cargo.toml | 4 ++-- azure-kusto-ingest/Cargo.toml | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/azure-kusto-data/Cargo.toml b/azure-kusto-data/Cargo.toml index 6bed2bc..42a5b6d 100644 --- a/azure-kusto-data/Cargo.toml +++ b/azure-kusto-data/Cargo.toml @@ -15,11 +15,11 @@ categories = ["api-bindings"] [dependencies] arrow-array = { version = "42", optional = true } arrow-schema = { version = "42", optional = true } -azure_core = { version = "0.16", features = [ +azure_core = { version = "0.17", features = [ "enable_reqwest", "enable_reqwest_gzip", ] } -azure_identity = "0.16" +azure_identity = "0.17" async-trait = "0.1.64" async-convert = "1.0.0" bytes = "1.4" diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index c695da9..7ed7692 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -8,10 +8,10 @@ edition = "2021" [dependencies] azure-kusto-data = {path = "../azure-kusto-data"} # Azure SDK for Rust crates versions must be kept in sync -azure_core = "0.16" -azure_storage = "0.16" -azure_storage_blobs = "0.16" -azure_storage_queues = "0.16" +azure_core = "0.17" +azure_storage = "0.17" +azure_storage_blobs = "0.17" +azure_storage_queues = "0.17" anyhow = "1" chrono = { version = "0.4", default-features = false, features = ["serde"] } From f0dddceca75a9530333fb26d52d7bc35833d5f67 Mon Sep 17 00:00:00 2001 From: Krishan Date: Thu, 18 Jan 2024 20:01:17 +0000 Subject: [PATCH 40/44] Remove use of anyhow and replace with thiserror, other markups (#7) * Use thiserror * use scopes over explicit use of drop * update syntax * remove std::fmt::Debug * qualify thiserror --- azure-kusto-ingest/Cargo.toml | 2 +- .../examples/ingest_from_blob.rs | 9 ++- azure-kusto-ingest/src/error.rs | 20 +++++ azure-kusto-ingest/src/lib.rs | 1 + azure-kusto-ingest/src/queued_ingest.rs | 18 +---- azure-kusto-ingest/src/resource_manager.rs | 71 ++++++++++++++++- .../resource_manager/authorization_context.rs | 78 ++++++++++++------- .../ingest_client_resources.rs | 74 ++++++++++++------ .../src/resource_manager/resource_uri.rs | 48 ++++++++---- .../src/resource_manager/utils.rs | 9 +-- 10 files changed, 238 insertions(+), 92 deletions(-) create mode 100644 azure-kusto-ingest/src/error.rs diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index 7ed7692..b389b9d 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -13,11 +13,11 @@ azure_storage = "0.17" azure_storage_blobs = "0.17" azure_storage_queues = "0.17" -anyhow = "1" chrono = { version = "0.4", default-features = false, features = ["serde"] } rand = "0.8" serde = { version = "1", features = ["serde_derive"] } serde_json = "1" +thiserror = "1" tokio = { version = "1", features = ["full"] } tracing = { version = "0.1", default-features = false, features = ["std"] } url = "2" diff --git a/azure-kusto-ingest/examples/ingest_from_blob.rs b/azure-kusto-ingest/examples/ingest_from_blob.rs index ceae4b9..d69f126 100644 --- a/azure-kusto-ingest/examples/ingest_from_blob.rs +++ b/azure-kusto-ingest/examples/ingest_from_blob.rs @@ -1,6 +1,5 @@ use std::env; -use anyhow::Result; use azure_kusto_data::prelude::{ConnectionString, KustoClient, KustoClientOptions}; use azure_kusto_ingest::data_format::DataFormat; use azure_kusto_ingest::descriptors::{BlobAuth, BlobDescriptor}; @@ -16,7 +15,7 @@ use azure_kusto_ingest::queued_ingest::QueuedIngestClient; /// - Permissions for Kusto to access storage /// https://learn.microsoft.com/en-us/azure/data-explorer/ingest-data-managed-identity #[tokio::main] -async fn main() -> Result<()> { +async fn main() -> Result<(), Box> { let cluster_ingest_uri = env::var("KUSTO_INGEST_URI").expect("Must define KUSTO_INGEST_URI"); let user_mi_object_id = env::var("KUSTO_USER_MI_OBJECT_ID").expect("Must define KUSTO_USER_MI_OBJECT_ID"); @@ -54,7 +53,9 @@ async fn main() -> Result<()> { let blob_descriptor = BlobDescriptor::new(blob_uri, blob_size, None) .with_blob_auth(BlobAuth::SystemAssignedManagedIdentity); - queued_ingest_client + let _ = queued_ingest_client .ingest_from_blob(blob_descriptor, ingestion_properties) - .await + .await?; + + Ok(()) } diff --git a/azure-kusto-ingest/src/error.rs b/azure-kusto-ingest/src/error.rs new file mode 100644 index 0000000..cfa9462 --- /dev/null +++ b/azure-kusto-ingest/src/error.rs @@ -0,0 +1,20 @@ +//! Defines [Error] for representing failures in various operations. + +/// Error type for kusto ingestion operations. +#[derive(thiserror::Error, Debug)] +pub enum Error { + /// Error raised when failing to obtain ingestion resources. + #[error("Error obtaining ingestion resources: {0}")] + ResourceManagerError(#[from] super::resource_manager::ResourceManagerError), + + /// Error relating to (de-)serialization of JSON data + #[error("Error in JSON serialization/deserialization: {0}")] + JsonError(#[from] serde_json::Error), + + /// Error occurring within core azure crates + #[error("Error in azure-core: {0}")] + AzureError(#[from] azure_core::error::Error), +} + +/// Result type for kusto ingest operations. +pub type Result = std::result::Result; diff --git a/azure-kusto-ingest/src/lib.rs b/azure-kusto-ingest/src/lib.rs index eecc220..27eb0cb 100644 --- a/azure-kusto-ingest/src/lib.rs +++ b/azure-kusto-ingest/src/lib.rs @@ -1,6 +1,7 @@ pub mod client_options; pub mod data_format; pub mod descriptors; +pub mod error; pub(crate) mod ingestion_blob_info; pub mod ingestion_properties; pub mod queued_ingest; diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index 13e9fff..a732fee 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -1,11 +1,8 @@ use std::sync::Arc; -use anyhow::Result; +use crate::error::Result; use azure_core::base64; use azure_kusto_data::prelude::KustoClient; -use rand::rngs::StdRng; -use rand::seq::SliceRandom; -use rand::SeedableRng; use tracing::debug; use crate::client_options::QueuedIngestClientOptions; @@ -47,8 +44,8 @@ impl QueuedIngestClient { blob_descriptor: BlobDescriptor, ingestion_properties: IngestionProperties, ) -> Result<()> { - let ingestion_queues = self.resource_manager.ingestion_queues().await?; - debug!("ingestion queues: {:#?}", ingestion_queues); + let queue_client = self.resource_manager.ingestion_queue().await?; + debug!("ingestion queues: {:#?}", queue_client); let auth_context = self.resource_manager.authorization_context().await?; debug!("auth_context: {:#?}\n", auth_context); @@ -57,14 +54,7 @@ impl QueuedIngestClient { QueuedIngestionMessage::new(&blob_descriptor, &ingestion_properties, auth_context); debug!("message: {:#?}\n", message); - // Pick a random queue from the queue clients returned by the resource manager - let mut rng: StdRng = SeedableRng::from_entropy(); - let queue_client = ingestion_queues - .choose(&mut rng) - .ok_or(anyhow::anyhow!("Failed to pick a random queue"))?; - debug!("randomly seeded queue_client: {:#?}\n", queue_client); - - let message = serde_json::to_string(&message).unwrap(); + let message = serde_json::to_string(&message)?; debug!("message as string: {}\n", message); // Base64 encode the ingestion message diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index a91c041..24c2f08 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -6,7 +6,6 @@ pub mod ingest_client_resources; pub mod resource_uri; pub mod utils; -use anyhow::Result; use azure_kusto_data::prelude::KustoClient; use azure_storage_queues::QueueClient; @@ -18,8 +17,24 @@ use self::{ ingest_client_resources::IngestClientResources, }; +use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng}; + pub const RESOURCE_REFRESH_PERIOD: Duration = Duration::from_secs(60 * 60); +#[derive(Debug, thiserror::Error)] +pub enum ResourceManagerError { + #[error("Failed to obtain ingestion resources: {0}")] + IngestClientResourcesError(#[from] ingest_client_resources::IngestionResourceError), + + #[error("Failed to obtain authorization token: {0}")] + AuthorizationContextError(#[from] authorization_context::KustoIdentityTokenError), + + #[error("Failed to select a resource - no resources found")] + NoResourcesFound, +} + +type Result = std::result::Result; + /// ResourceManager is a struct that keeps track of all the resources required for ingestion using the queued flavour pub struct ResourceManager { ingest_client_resources: Arc, @@ -39,12 +54,62 @@ impl ResourceManager { } /// Returns the latest [QueueClient]s ready for posting ingestion messages to - pub async fn ingestion_queues(&self) -> Result> { + async fn ingestion_queues(&self) -> Result> { Ok(self.ingest_client_resources.get().await?.ingestion_queues) } + /// Returns a [QueueClient] to ingest to. + /// This is a random selection from the list of ingestion queues + pub async fn ingestion_queue(&self) -> Result { + let ingestion_queues = self.ingestion_queues().await?; + let selected_queue = select_random_resource(ingestion_queues)?; + Ok(selected_queue.clone()) + } + /// Returns the latest [KustoIdentityToken] to be added as an authorization context to ingestion messages pub async fn authorization_context(&self) -> Result { - self.authorization_context.get().await + self.authorization_context + .get() + .await + .map_err(ResourceManagerError::AuthorizationContextError) + } +} +/// Selects a random resource from the given list of resources +fn select_random_resource(resources: Vec) -> Result { + let mut rng: StdRng = SeedableRng::from_entropy(); + resources + .choose(&mut rng) + .ok_or(ResourceManagerError::NoResourcesFound) + .cloned() +} + +#[cfg(test)] +mod select_random_resource_tests { + use super::*; + + #[test] + fn single_resource() { + const VALUE: i32 = 1; + let resources = vec![VALUE]; + let selected_resource = select_random_resource(resources).unwrap(); + assert!(selected_resource == VALUE) + } + + #[test] + fn multiple_resources() { + let resources = vec![1, 2, 3, 4, 5]; + let selected_resource = select_random_resource(resources.clone()).unwrap(); + assert!(resources.contains(&selected_resource)); + } + + #[test] + fn no_resources() { + let resources: Vec = vec![]; + let selected_resource = select_random_resource(resources); + assert!(selected_resource.is_err()); + assert!(matches!( + selected_resource.unwrap_err(), + ResourceManagerError::NoResourcesFound + )) } } diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs index d8709d6..b27892e 100644 --- a/azure-kusto-ingest/src/resource_manager/authorization_context.rs +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -1,7 +1,7 @@ use std::sync::Arc; -use anyhow::Result; use azure_kusto_data::prelude::KustoClient; +use serde_json::Value; use tokio::sync::RwLock; use super::cache::{Cached, ThreadSafeCachedValue}; @@ -10,6 +10,30 @@ use super::RESOURCE_REFRESH_PERIOD; pub(crate) type KustoIdentityToken = String; +const AUTHORIZATION_CONTEXT: &str = "AuthorizationContext"; + +#[derive(thiserror::Error, Debug)] +pub enum KustoIdentityTokenError { + #[error("Kusto expected 1 table in results, found {0}")] + ExpectedOneTable(usize), + + #[error("Kusto expected 1 row in table, found {0}")] + ExpectedOneRow(usize), + + #[error("Column {0} not found in table")] + ColumnNotFound(String), + + #[error("Invalid JSON response from Kusto: {0:?}")] + InvalidJSONResponse(Value), + + #[error("Token is empty")] + EmptyToken, + + #[error(transparent)] + KustoError(#[from] azure_kusto_data::error::Error), +} + +type Result = std::result::Result; /// Logic to obtain a Kusto identity token from the management endpoint. This auth token is a temporary token #[derive(Debug, Clone)] pub(crate) struct AuthorizationContext { @@ -38,38 +62,36 @@ impl AuthorizationContext { let table = match &results.tables[..] { [a] => a, _ => { - return Err(anyhow::anyhow!( - "Kusto Expected 1 table in results, found {}", - results.tables.len() + return Err(KustoIdentityTokenError::ExpectedOneTable( + results.tables.len(), )) } }; // Check that a column in this table actually exists called `AuthorizationContext` - let index = get_column_index(table, "AuthorizationContext")?; + let index = get_column_index(table, AUTHORIZATION_CONTEXT).ok_or( + KustoIdentityTokenError::ColumnNotFound(AUTHORIZATION_CONTEXT.into()), + )?; // Check that there is only 1 row in the table, and that the value in the first row at the given index is not empty let token = match &table.rows[..] { - [row] => row.get(index).ok_or(anyhow::anyhow!( - "Kusto response did not contain a value in the first row at position {}", - index - ))?, - _ => { - return Err(anyhow::anyhow!( - "Kusto expected 1 row in results, found {}", - table.rows.len() - )) - } + [row] => row + .get(index) + .ok_or(KustoIdentityTokenError::ColumnNotFound( + AUTHORIZATION_CONTEXT.into(), + ))?, + _ => return Err(KustoIdentityTokenError::ExpectedOneRow(table.rows.len())), }; // Convert the JSON string into a Rust string - let token = token.as_str().ok_or(anyhow::anyhow!( - "Kusto response did not contain a string value: {:?}", - token - ))?; + let token = token + .as_str() + .ok_or(KustoIdentityTokenError::InvalidJSONResponse( + token.to_owned(), + ))?; if token.chars().all(char::is_whitespace) { - return Err(anyhow::anyhow!("Kusto identity token is empty")); + return Err(KustoIdentityTokenError::EmptyToken); } Ok(token.to_string()) @@ -77,15 +99,17 @@ impl AuthorizationContext { /// Fetches the latest Kusto identity token, either retrieving from cache if valid, or by executing a KQL query pub(crate) async fn get(&self) -> Result { - // Attempt to get the token from the cache - let token_cache = self.token_cache.read().await; - if !token_cache.is_expired() { - if let Some(token) = token_cache.get() { - return Ok(token.clone()); + // first, try to get the resources from the cache by obtaining a read lock + { + let token_cache = self.token_cache.read().await; + if !token_cache.is_expired() { + if let Some(token) = token_cache.get() { + return Ok(token.clone()); + } } } - // Drop the read lock and get a write lock to refresh the token - drop(token_cache); + + // obtain a write lock to refresh the kusto response let mut token_cache = self.token_cache.write().await; // Again attempt to return from cache, check is done in case another thread diff --git a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs index d1a85d2..c59e4aa 100644 --- a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs +++ b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs @@ -5,16 +5,44 @@ use crate::client_options::QueuedIngestClientOptions; use super::{ cache::{Cached, ThreadSafeCachedValue}, resource_uri::{ClientFromResourceUri, ResourceUri}, - utils::get_column_index, - RESOURCE_REFRESH_PERIOD, + utils, RESOURCE_REFRESH_PERIOD, }; -use anyhow::Result; use azure_core::ClientOptions; use azure_kusto_data::{models::TableV1, prelude::KustoClient}; use azure_storage_blobs::prelude::ContainerClient; use azure_storage_queues::QueueClient; +use serde_json::Value; use tokio::sync::RwLock; +#[derive(Debug, thiserror::Error)] +pub enum IngestionResourceError { + #[error("{column_name} column is missing in the table")] + ColumnNotFoundError { column_name: String }, + + #[error("Response returned from Kusto could not be parsed as a string: {0}")] + ParseAsStringError(Value), + + #[error("No {0} resources found in the table")] + NoResourcesFound(String), + + #[error(transparent)] + KustoError(#[from] azure_kusto_data::error::Error), + + #[error(transparent)] + ResourceUriError(#[from] super::resource_uri::ResourceUriError), + + #[error("Kusto expected a table containing ingestion resource results, found no tables")] + NoTablesFound, +} + +type Result = std::result::Result; + +fn get_column_index(table: &TableV1, column_name: &str) -> Result { + utils::get_column_index(table, column_name).ok_or(IngestionResourceError::ColumnNotFoundError { + column_name: column_name.to_string(), + }) +} + /// Helper to get a resource URI from a table, erroring if there are no resources of the given name fn get_resource_by_name(table: &TableV1, resource_name: String) -> Result> { let storage_root_index = get_column_index(table, "StorageRoot")?; @@ -25,18 +53,15 @@ fn get_resource_by_name(table: &TableV1, resource_name: String) -> Result for InnerIngestClientResources { - type Error = anyhow::Error; + type Error = IngestionResourceError; /// Attempts to create a new InnerIngestClientResources from the given [TableV1] and [QueuedIngestClientOptions] - fn try_from((table, client_options): (&TableV1, &QueuedIngestClientOptions)) -> Result { + fn try_from( + (table, client_options): (&TableV1, &QueuedIngestClientOptions), + ) -> std::result::Result { let secured_ready_for_aggregation_queues = get_resource_by_name(table, "SecuredReadyForAggregationQueue".to_string())?; let temp_storage = get_resource_by_name(table, "TempStorage".to_string())?; @@ -104,24 +131,27 @@ impl IngestClientResources { .execute_command("NetDefaultDB", ".get ingestion resources", None) .await?; - let new_resources = results.tables.first().ok_or(anyhow::anyhow!( - "Kusto expected a table containing ingestion resource results, found no tables", - ))?; + let new_resources = results + .tables + .first() + .ok_or(IngestionResourceError::NoTablesFound)?; InnerIngestClientResources::try_from((new_resources, &self.client_options)) } /// Gets the latest resources either from cache, or fetching from Kusto and updating the cached resources pub async fn get(&self) -> Result { - let resources = self.resources.read().await; - if !resources.is_expired() { - if let Some(inner_value) = resources.get() { - return Ok(inner_value.clone()); + // first, try to get the resources from the cache by obtaining a read lock + { + let resources = self.resources.read().await; + if !resources.is_expired() { + if let Some(inner_value) = resources.get() { + return Ok(inner_value.clone()); + } } } - // otherwise, drop the read lock and get a write lock to refresh the kusto response - drop(resources); + // obtain a write lock to refresh the kusto response let mut resources = self.resources.write().await; // check again in case another thread refreshed while we were waiting on the write lock diff --git a/azure-kusto-ingest/src/resource_manager/resource_uri.rs b/azure-kusto-ingest/src/resource_manager/resource_uri.rs index 41f2287..f41d1ff 100644 --- a/azure-kusto-ingest/src/resource_manager/resource_uri.rs +++ b/azure-kusto-ingest/src/resource_manager/resource_uri.rs @@ -4,7 +4,23 @@ use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient}; use azure_storage_queues::{QueueClient, QueueServiceClientBuilder}; use url::Url; -use anyhow::Result; +#[derive(Debug, thiserror::Error)] +pub enum ResourceUriError { + #[error("URI scheme must be 'https', was '{0}'")] + InvalidScheme(String), + + #[error("Object name is missing in the URI")] + MissingObjectName, + + #[error("SAS token is missing in the URI as a query parameter")] + MissingSasToken, + + #[error(transparent)] + ParseError(#[from] url::ParseError), + + #[error(transparent)] + AzureError(#[from] azure_core::Error), +} /// Parsing logic of resource URIs as returned by the Kusto management endpoint #[derive(Debug, Clone)] @@ -15,18 +31,14 @@ pub(crate) struct ResourceUri { } impl TryFrom<&str> for ResourceUri { - type Error = anyhow::Error; + type Error = ResourceUriError; - fn try_from(uri: &str) -> Result { + fn try_from(uri: &str) -> Result { let parsed_uri = Url::parse(uri)?; let scheme = match parsed_uri.scheme() { "https" => "https".to_string(), - other_scheme => { - return Err(anyhow::anyhow!( - "URI scheme must be 'https', was '{other_scheme}'" - )) - } + other_scheme => return Err(ResourceUriError::InvalidScheme(other_scheme.to_string())), }; let service_uri = scheme @@ -36,17 +48,13 @@ impl TryFrom<&str> for ResourceUri { .expect("Url::parse should always return a host for a URI"); let object_name = match parsed_uri.path().trim_start().trim_start_matches('/') { - "" => return Err(anyhow::anyhow!("Object name is missing in the URI")), + "" => return Err(ResourceUriError::MissingObjectName), name => name.to_string(), }; let sas_token = match parsed_uri.query() { Some(query) => query.to_string(), - None => { - return Err(anyhow::anyhow!( - "SAS token is missing in the URI as a query parameter" - )) - } + None => return Err(ResourceUriError::MissingSasToken), }; let sas_token = StorageCredentials::sas_token(sas_token)?; @@ -138,6 +146,10 @@ mod tests { println!("{:#?}", resource_uri); assert!(resource_uri.is_err()); + assert!(matches!( + resource_uri.unwrap_err(), + ResourceUriError::ParseError(_) + )); } #[test] @@ -147,6 +159,10 @@ mod tests { println!("{:#?}", resource_uri); assert!(resource_uri.is_err()); + assert!(matches!( + resource_uri.unwrap_err(), + ResourceUriError::MissingObjectName + )); } #[test] @@ -156,6 +172,10 @@ mod tests { println!("{:#?}", resource_uri); assert!(resource_uri.is_err()); + assert!(matches!( + resource_uri.unwrap_err(), + ResourceUriError::MissingSasToken + )); } #[test] diff --git a/azure-kusto-ingest/src/resource_manager/utils.rs b/azure-kusto-ingest/src/resource_manager/utils.rs index 62a7167..ebf5616 100644 --- a/azure-kusto-ingest/src/resource_manager/utils.rs +++ b/azure-kusto-ingest/src/resource_manager/utils.rs @@ -1,15 +1,10 @@ -use anyhow::Result; use azure_kusto_data::models::TableV1; /// Helper to get a column index from a table -// TODO: this could be moved upstream into Kusto Data - would likely result in a change to the API of this function to return an Option -pub fn get_column_index(table: &TableV1, column_name: &str) -> Result { +// TODO: this could be moved upstream into Kusto Data +pub fn get_column_index(table: &TableV1, column_name: &str) -> Option { table .columns .iter() .position(|c| c.column_name == column_name) - .ok_or(anyhow::anyhow!( - "{} column is missing in the table", - column_name - )) } From df610985d02a789b7a1b772aec401c9924fe50e1 Mon Sep 17 00:00:00 2001 From: Krishan Date: Thu, 18 Jan 2024 20:30:32 +0000 Subject: [PATCH 41/44] Update to Azure 0.19 (#9) * Ingest: Azure SDK changes for 0.19 * remove default features on data dependency in ingest * Format Rust code using rustfmt --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- azure-kusto-ingest/Cargo.toml | 12 ++++----- .../src/resource_manager/resource_uri.rs | 26 +++++++++++++++---- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index b389b9d..1f75d07 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -6,12 +6,12 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -azure-kusto-data = {path = "../azure-kusto-data"} +azure-kusto-data = { path = "../azure-kusto-data", default-features = false } # Azure SDK for Rust crates versions must be kept in sync -azure_core = "0.17" -azure_storage = "0.17" -azure_storage_blobs = "0.17" -azure_storage_queues = "0.17" +azure_core = "0.19" +azure_storage = "0.19" +azure_storage_blobs = "0.19" +azure_storage_queues = "0.19" chrono = { version = "0.4", default-features = false, features = ["serde"] } rand = "0.8" @@ -21,4 +21,4 @@ thiserror = "1" tokio = { version = "1", features = ["full"] } tracing = { version = "0.1", default-features = false, features = ["std"] } url = "2" -uuid = {version = "1", features = ["v4", "serde"]} +uuid = { version = "1", features = ["v4", "serde"] } diff --git a/azure-kusto-ingest/src/resource_manager/resource_uri.rs b/azure-kusto-ingest/src/resource_manager/resource_uri.rs index f41d1ff..80c5c1c 100644 --- a/azure-kusto-ingest/src/resource_manager/resource_uri.rs +++ b/azure-kusto-ingest/src/resource_manager/resource_uri.rs @@ -15,6 +15,9 @@ pub enum ResourceUriError { #[error("SAS token is missing in the URI as a query parameter")] MissingSasToken, + #[error("Account name is missing in the URI")] + MissingAccountName, + #[error(transparent)] ParseError(#[from] url::ParseError), @@ -27,6 +30,7 @@ pub enum ResourceUriError { pub(crate) struct ResourceUri { pub(crate) service_uri: String, pub(crate) object_name: String, + pub(crate) account_name: String, pub(crate) sas_token: StorageCredentials, } @@ -41,11 +45,18 @@ impl TryFrom<&str> for ResourceUri { other_scheme => return Err(ResourceUriError::InvalidScheme(other_scheme.to_string())), }; - let service_uri = scheme - + "://" - + parsed_uri - .host_str() - .expect("Url::parse should always return a host for a URI"); + let host_string = parsed_uri + .host_str() + .expect("Url::parse should always return a host for a URI"); + + let service_uri = scheme + "://" + host_string; + + let host_string_components = host_string.split_terminator('.').collect::>(); + if host_string_components.len() < 2 { + return Err(ResourceUriError::MissingAccountName); + } + + let account_name = host_string_components[0].to_string(); let object_name = match parsed_uri.path().trim_start().trim_start_matches('/') { "" => return Err(ResourceUriError::MissingObjectName), @@ -61,6 +72,7 @@ impl TryFrom<&str> for ResourceUri { Ok(Self { service_uri, object_name, + account_name, sas_token, }) } @@ -76,6 +88,7 @@ impl ClientFromResourceUri for QueueClient { QueueServiceClientBuilder::with_location( azure_storage::CloudLocation::Custom { uri: resource_uri.service_uri, + account: resource_uri.account_name, }, resource_uri.sas_token, ) @@ -90,6 +103,7 @@ impl ClientFromResourceUri for ContainerClient { ClientBuilder::with_location( azure_storage::CloudLocation::Custom { uri: resource_uri.service_uri, + account: resource_uri.account_name, }, resource_uri.sas_token, ) @@ -183,6 +197,7 @@ mod tests { let resource_uri = ResourceUri { service_uri: "https://mystorageaccount.queue.core.windows.net".to_string(), object_name: "queuename".to_string(), + account_name: "mystorageaccount".to_string(), sas_token: StorageCredentials::sas_token("sas=token").unwrap(), }; @@ -197,6 +212,7 @@ mod tests { let resource_uri = ResourceUri { service_uri: "https://mystorageaccount.blob.core.windows.net".to_string(), object_name: "containername".to_string(), + account_name: "mystorageaccount".to_string(), sas_token: StorageCredentials::sas_token("sas=token").unwrap(), }; From 4b28b5cf8bd455ced818146d64d4fc82aad02c7c Mon Sep 17 00:00:00 2001 From: Krishan Date: Thu, 18 Jan 2024 20:50:12 +0000 Subject: [PATCH 42/44] Use async-lock rather than tokio for RwLock (#10) --- azure-kusto-ingest/Cargo.toml | 5 ++++- .../src/resource_manager/authorization_context.rs | 2 +- azure-kusto-ingest/src/resource_manager/cache.rs | 2 +- .../src/resource_manager/ingest_client_resources.rs | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index 1f75d07..cdaeaf8 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -13,12 +13,15 @@ azure_storage = "0.19" azure_storage_blobs = "0.19" azure_storage_queues = "0.19" +async-lock = "3" chrono = { version = "0.4", default-features = false, features = ["serde"] } rand = "0.8" serde = { version = "1", features = ["serde_derive"] } serde_json = "1" thiserror = "1" -tokio = { version = "1", features = ["full"] } tracing = { version = "0.1", default-features = false, features = ["std"] } url = "2" uuid = { version = "1", features = ["v4", "serde"] } + +[dev-dependencies] +tokio = { version = "1", features = ["macros"] } diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs index b27892e..325ee20 100644 --- a/azure-kusto-ingest/src/resource_manager/authorization_context.rs +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -1,8 +1,8 @@ use std::sync::Arc; +use async_lock::RwLock; use azure_kusto_data::prelude::KustoClient; use serde_json::Value; -use tokio::sync::RwLock; use super::cache::{Cached, ThreadSafeCachedValue}; use super::utils::get_column_index; diff --git a/azure-kusto-ingest/src/resource_manager/cache.rs b/azure-kusto-ingest/src/resource_manager/cache.rs index 2170c31..ade13eb 100644 --- a/azure-kusto-ingest/src/resource_manager/cache.rs +++ b/azure-kusto-ingest/src/resource_manager/cache.rs @@ -3,7 +3,7 @@ use std::{ time::{Duration, Instant}, }; -use tokio::sync::RwLock; +use async_lock::RwLock; /// Wrapper around a value that allows for storing when the value was last updated, /// as well as the period after which it should be refreshed (i.e. expired) diff --git a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs index c59e4aa..7059bcd 100644 --- a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs +++ b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs @@ -7,12 +7,12 @@ use super::{ resource_uri::{ClientFromResourceUri, ResourceUri}, utils, RESOURCE_REFRESH_PERIOD, }; +use async_lock::RwLock; use azure_core::ClientOptions; use azure_kusto_data::{models::TableV1, prelude::KustoClient}; use azure_storage_blobs::prelude::ContainerClient; use azure_storage_queues::QueueClient; use serde_json::Value; -use tokio::sync::RwLock; #[derive(Debug, thiserror::Error)] pub enum IngestionResourceError { From ab17f7e27933b29ae8c4c9235d9fc02ccbeb1468 Mon Sep 17 00:00:00 2001 From: Krishan Date: Thu, 18 Jan 2024 21:04:35 +0000 Subject: [PATCH 43/44] Remove debug logs (#11) --- azure-kusto-ingest/Cargo.toml | 1 - azure-kusto-ingest/src/queued_ingest.rs | 9 +-------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index cdaeaf8..37173ff 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -19,7 +19,6 @@ rand = "0.8" serde = { version = "1", features = ["serde_derive"] } serde_json = "1" thiserror = "1" -tracing = { version = "0.1", default-features = false, features = ["std"] } url = "2" uuid = { version = "1", features = ["v4", "serde"] } diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index a732fee..63b2068 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -3,7 +3,6 @@ use std::sync::Arc; use crate::error::Result; use azure_core::base64; use azure_kusto_data::prelude::KustoClient; -use tracing::debug; use crate::client_options::QueuedIngestClientOptions; use crate::descriptors::BlobDescriptor; @@ -45,24 +44,18 @@ impl QueuedIngestClient { ingestion_properties: IngestionProperties, ) -> Result<()> { let queue_client = self.resource_manager.ingestion_queue().await?; - debug!("ingestion queues: {:#?}", queue_client); let auth_context = self.resource_manager.authorization_context().await?; - debug!("auth_context: {:#?}\n", auth_context); let message = QueuedIngestionMessage::new(&blob_descriptor, &ingestion_properties, auth_context); - debug!("message: {:#?}\n", message); let message = serde_json::to_string(&message)?; - debug!("message as string: {}\n", message); // Base64 encode the ingestion message let message = base64::encode(&message); - debug!("message as base64 encoded string: {}\n", message); - let resp = queue_client.put_message(message).await?; - debug!("resp: {:#?}\n", resp); + let _resp = queue_client.put_message(message).await?; Ok(()) } From b8ee0197dfd787eb570c9cd7be29f27a0767b46f Mon Sep 17 00:00:00 2001 From: Krishan Date: Fri, 26 Jan 2024 18:34:07 +0000 Subject: [PATCH 44/44] Simplified ingest client markups (#12) * resource_uri changes * markups in client_options * resource_manager markups * remove dependency on chrono - use time * cache changes * add missing dev dependency feature * add some basic tests for caching implementation --- azure-kusto-ingest/Cargo.toml | 4 +- azure-kusto-ingest/src/client_options.rs | 28 ++--- azure-kusto-ingest/src/ingestion_blob_info.rs | 46 +++++++- azure-kusto-ingest/src/queued_ingest.rs | 2 +- azure-kusto-ingest/src/resource_manager.rs | 50 ++------ .../resource_manager/authorization_context.rs | 38 +------ .../src/resource_manager/cache.rs | 107 +++++++++++++++++- .../ingest_client_resources.rs | 44 ++----- .../src/resource_manager/resource_uri.rs | 88 ++++++++++---- 9 files changed, 256 insertions(+), 151 deletions(-) diff --git a/azure-kusto-ingest/Cargo.toml b/azure-kusto-ingest/Cargo.toml index 37173ff..e322d71 100644 --- a/azure-kusto-ingest/Cargo.toml +++ b/azure-kusto-ingest/Cargo.toml @@ -14,13 +14,13 @@ azure_storage_blobs = "0.19" azure_storage_queues = "0.19" async-lock = "3" -chrono = { version = "0.4", default-features = false, features = ["serde"] } rand = "0.8" serde = { version = "1", features = ["serde_derive"] } serde_json = "1" thiserror = "1" +time = { version = "0.3", features = ["serde-human-readable", "macros"] } url = "2" uuid = { version = "1", features = ["v4", "serde"] } [dev-dependencies] -tokio = { version = "1", features = ["macros"] } +tokio = { version = "1", features = ["macros", "rt-multi-thread"] } diff --git a/azure-kusto-ingest/src/client_options.rs b/azure-kusto-ingest/src/client_options.rs index c7b6bd8..b8677eb 100644 --- a/azure-kusto-ingest/src/client_options.rs +++ b/azure-kusto-ingest/src/client_options.rs @@ -3,16 +3,16 @@ use azure_core::ClientOptions; /// Allows configurability of ClientOptions for the storage clients used within [QueuedIngestClient](crate::queued_ingest::QueuedIngestClient) #[derive(Clone, Default)] pub struct QueuedIngestClientOptions { - pub queue_service: ClientOptions, - pub blob_service: ClientOptions, + pub queue_service_options: ClientOptions, + pub blob_service_options: ClientOptions, } impl From for QueuedIngestClientOptions { /// Creates a `QueuedIngestClientOptions` struct where the same [ClientOptions] are used for all services fn from(client_options: ClientOptions) -> Self { Self { - queue_service: client_options.clone(), - blob_service: client_options, + queue_service_options: client_options.clone(), + blob_service_options: client_options, } } } @@ -20,32 +20,32 @@ impl From for QueuedIngestClientOptions { /// Builder for [QueuedIngestClientOptions], call `build()` to create the [QueuedIngestClientOptions] #[derive(Clone, Default)] pub struct QueuedIngestClientOptionsBuilder { - queue_service: ClientOptions, - blob_service: ClientOptions, + queue_service_options: ClientOptions, + blob_service_options: ClientOptions, } impl QueuedIngestClientOptionsBuilder { pub fn new() -> Self { Self { - queue_service: ClientOptions::default(), - blob_service: ClientOptions::default(), + queue_service_options: ClientOptions::default(), + blob_service_options: ClientOptions::default(), } } - pub fn with_queue_service(mut self, queue_service: ClientOptions) -> Self { - self.queue_service = queue_service; + pub fn with_queue_service_options(mut self, queue_service_options: ClientOptions) -> Self { + self.queue_service_options = queue_service_options; self } - pub fn with_blob_service(mut self, blob_service: ClientOptions) -> Self { - self.blob_service = blob_service; + pub fn with_blob_service_options(mut self, blob_service_options: ClientOptions) -> Self { + self.blob_service_options = blob_service_options; self } pub fn build(self) -> QueuedIngestClientOptions { QueuedIngestClientOptions { - queue_service: self.queue_service, - blob_service: self.blob_service, + queue_service_options: self.queue_service_options, + blob_service_options: self.blob_service_options, } } } diff --git a/azure-kusto-ingest/src/ingestion_blob_info.rs b/azure-kusto-ingest/src/ingestion_blob_info.rs index dc63d1b..ef554fa 100644 --- a/azure-kusto-ingest/src/ingestion_blob_info.rs +++ b/azure-kusto-ingest/src/ingestion_blob_info.rs @@ -1,4 +1,3 @@ -use chrono::{DateTime, Utc}; use serde::Serialize; use uuid::Uuid; @@ -8,6 +7,18 @@ use crate::{ resource_manager::authorization_context::KustoIdentityToken, }; +use time::{ + format_description::well_known::{iso8601, Iso8601}, + OffsetDateTime, +}; +/// The [DEFAULT](iso8601::Config::DEFAULT) ISO8601 format that the time crate serializes to uses a 6 digit year, +/// Here we create our own serializer function that uses a 4 digit year which is exposed as `kusto_ingest_iso8601_format` +const CONFIG: iso8601::EncodedConfig = iso8601::Config::DEFAULT + .set_year_is_six_digits(false) + .encode(); +const FORMAT: Iso8601 = Iso8601::; +time::serde::format_description!(kusto_ingest_iso8601_format, OffsetDateTime, FORMAT); + /// Message to be serialized as JSON and sent to the ingestion queue /// /// Basing the ingestion message on @@ -37,7 +48,9 @@ pub(crate) struct QueuedIngestionMessage { /// If set to `true`, any server side aggregation will be skipped - thus overriding the batching policy. Default is `false`. #[serde(skip_serializing_if = "Option::is_none")] flush_immediately: Option, - source_message_creation_time: DateTime, + #[serde(with = "kusto_ingest_iso8601_format")] + source_message_creation_time: OffsetDateTime, + // source_message_creation_time: DateTime, // Extra properties added to the ingestion command additional_properties: AdditionalProperties, } @@ -61,7 +74,7 @@ impl QueuedIngestionMessage { table_name: ingestion_properties.table_name.clone(), retain_blob_on_success: ingestion_properties.retain_blob_on_success, flush_immediately: ingestion_properties.flush_immediately, - source_message_creation_time: Utc::now(), + source_message_creation_time: OffsetDateTime::now_utc(), additional_properties, } } @@ -77,3 +90,30 @@ struct AdditionalProperties { #[serde(rename = "format")] data_format: DataFormat, } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn time_custom_iso8601_serialization() { + #[derive(Serialize, Debug)] + struct TestTimeSerialize { + #[serde(with = "kusto_ingest_iso8601_format")] + customised_time_format: time::OffsetDateTime, + } + + let test_message = TestTimeSerialize { + customised_time_format: time::OffsetDateTime::from_unix_timestamp_nanos( + 1_234_567_890_123_456_789, + ) + .unwrap(), + }; + + let serialized_message = serde_json::to_string(&test_message).unwrap(); + assert_eq!( + serialized_message, + "{\"customised_time_format\":\"2009-02-13T23:31:30.123456789Z\"}" + ); + } +} diff --git a/azure-kusto-ingest/src/queued_ingest.rs b/azure-kusto-ingest/src/queued_ingest.rs index 63b2068..125cadb 100644 --- a/azure-kusto-ingest/src/queued_ingest.rs +++ b/azure-kusto-ingest/src/queued_ingest.rs @@ -43,7 +43,7 @@ impl QueuedIngestClient { blob_descriptor: BlobDescriptor, ingestion_properties: IngestionProperties, ) -> Result<()> { - let queue_client = self.resource_manager.ingestion_queue().await?; + let queue_client = self.resource_manager.random_ingestion_queue().await?; let auth_context = self.resource_manager.authorization_context().await?; diff --git a/azure-kusto-ingest/src/resource_manager.rs b/azure-kusto-ingest/src/resource_manager.rs index 24c2f08..396b339 100644 --- a/azure-kusto-ingest/src/resource_manager.rs +++ b/azure-kusto-ingest/src/resource_manager.rs @@ -17,7 +17,7 @@ use self::{ ingest_client_resources::IngestClientResources, }; -use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng}; +use rand::{seq::SliceRandom, thread_rng}; pub const RESOURCE_REFRESH_PERIOD: Duration = Duration::from_secs(60 * 60); @@ -60,9 +60,14 @@ impl ResourceManager { /// Returns a [QueueClient] to ingest to. /// This is a random selection from the list of ingestion queues - pub async fn ingestion_queue(&self) -> Result { + pub async fn random_ingestion_queue(&self) -> Result { let ingestion_queues = self.ingestion_queues().await?; - let selected_queue = select_random_resource(ingestion_queues)?; + + let mut rng = thread_rng(); + let selected_queue = ingestion_queues + .choose(&mut rng) + .ok_or(ResourceManagerError::NoResourcesFound)?; + Ok(selected_queue.clone()) } @@ -74,42 +79,3 @@ impl ResourceManager { .map_err(ResourceManagerError::AuthorizationContextError) } } -/// Selects a random resource from the given list of resources -fn select_random_resource(resources: Vec) -> Result { - let mut rng: StdRng = SeedableRng::from_entropy(); - resources - .choose(&mut rng) - .ok_or(ResourceManagerError::NoResourcesFound) - .cloned() -} - -#[cfg(test)] -mod select_random_resource_tests { - use super::*; - - #[test] - fn single_resource() { - const VALUE: i32 = 1; - let resources = vec![VALUE]; - let selected_resource = select_random_resource(resources).unwrap(); - assert!(selected_resource == VALUE) - } - - #[test] - fn multiple_resources() { - let resources = vec![1, 2, 3, 4, 5]; - let selected_resource = select_random_resource(resources.clone()).unwrap(); - assert!(resources.contains(&selected_resource)); - } - - #[test] - fn no_resources() { - let resources: Vec = vec![]; - let selected_resource = select_random_resource(resources); - assert!(selected_resource.is_err()); - assert!(matches!( - selected_resource.unwrap_err(), - ResourceManagerError::NoResourcesFound - )) - } -} diff --git a/azure-kusto-ingest/src/resource_manager/authorization_context.rs b/azure-kusto-ingest/src/resource_manager/authorization_context.rs index 325ee20..91eeca6 100644 --- a/azure-kusto-ingest/src/resource_manager/authorization_context.rs +++ b/azure-kusto-ingest/src/resource_manager/authorization_context.rs @@ -1,10 +1,7 @@ -use std::sync::Arc; - -use async_lock::RwLock; use azure_kusto_data::prelude::KustoClient; use serde_json::Value; -use super::cache::{Cached, ThreadSafeCachedValue}; +use super::cache::ThreadSafeCachedValue; use super::utils::get_column_index; use super::RESOURCE_REFRESH_PERIOD; @@ -40,14 +37,14 @@ pub(crate) struct AuthorizationContext { /// A client against a Kusto ingestion cluster client: KustoClient, /// Cache of the Kusto identity token - token_cache: ThreadSafeCachedValue>, + token_cache: ThreadSafeCachedValue, } impl AuthorizationContext { pub fn new(client: KustoClient) -> Self { Self { client, - token_cache: Arc::new(RwLock::new(Cached::new(None, RESOURCE_REFRESH_PERIOD))), + token_cache: ThreadSafeCachedValue::new(RESOURCE_REFRESH_PERIOD), } } @@ -99,31 +96,8 @@ impl AuthorizationContext { /// Fetches the latest Kusto identity token, either retrieving from cache if valid, or by executing a KQL query pub(crate) async fn get(&self) -> Result { - // first, try to get the resources from the cache by obtaining a read lock - { - let token_cache = self.token_cache.read().await; - if !token_cache.is_expired() { - if let Some(token) = token_cache.get() { - return Ok(token.clone()); - } - } - } - - // obtain a write lock to refresh the kusto response - let mut token_cache = self.token_cache.write().await; - - // Again attempt to return from cache, check is done in case another thread - // refreshed the token while we were waiting on the write lock - if !token_cache.is_expired() { - if let Some(token) = token_cache.get() { - return Ok(token.clone()); - } - } - - // Fetch new token from Kusto, update the cache, and return the token - let token = self.query_kusto_identity_token().await?; - token_cache.update(Some(token.clone())); - - Ok(token) + self.token_cache + .get(self.query_kusto_identity_token()) + .await } } diff --git a/azure-kusto-ingest/src/resource_manager/cache.rs b/azure-kusto-ingest/src/resource_manager/cache.rs index ade13eb..0b024e1 100644 --- a/azure-kusto-ingest/src/resource_manager/cache.rs +++ b/azure-kusto-ingest/src/resource_manager/cache.rs @@ -1,4 +1,6 @@ use std::{ + error::Error, + future::Future, sync::Arc, time::{Duration, Instant}, }; @@ -37,10 +39,57 @@ impl Cached { } } -pub type ThreadSafeCachedValue = Arc>>; +#[derive(Debug, Clone)] +pub struct ThreadSafeCachedValue +where + T: Clone, +{ + cache: Arc>>>, +} + +impl ThreadSafeCachedValue { + pub fn new(refresh_period: Duration) -> Self { + Self { + cache: Arc::new(RwLock::new(Cached::new(None, refresh_period))), + } + } + + /// Fetches the latest value, either retrieving from cache if valid, or by executing the callback + pub async fn get(&self, callback: F) -> Result + where + F: Future>, + { + // First, try to get a value from the cache by obtaining a read lock + { + let cache = self.cache.read().await; + if !cache.is_expired() { + if let Some(cached_value) = cache.get() { + return Ok(cached_value.clone()); + } + } + } + + // Obtain a write lock to refresh the cached value + let mut cache = self.cache.write().await; + + // Again attempt to return from cache, check is done in case another thread + // refreshed the cached value while we were waiting on the write lock and its now valid + if !cache.is_expired() { + if let Some(cached_value) = cache.get() { + return Ok(cached_value.clone()); + } + } + + // Fetch new value by executing the callback, update the cache, and return the value + let fetched_value = callback.await?; + cache.update(Some(fetched_value.clone())); + + Ok(fetched_value) + } +} #[cfg(test)] -mod tests { +mod cached_tests { use super::*; use std::time::Duration; @@ -78,3 +127,57 @@ mod tests { assert_eq!(cached_string.get(), new_value); } } + +#[cfg(test)] +mod thread_safe_cached_value_tests { + use super::*; + use std::{fmt::Error, sync::Mutex}; + + #[derive(Debug)] + struct MockToken { + get_token_call_count: Mutex, + } + + impl MockToken { + fn new() -> Self { + Self { + get_token_call_count: Mutex::new(0), + } + } + + async fn get_new_token(&self) -> Result { + // Include an incrementing counter in the token to track how many times the token has been refreshed + let mut call_count = self.get_token_call_count.lock().unwrap(); + *call_count += 1; + Ok(call_count.clone()) + } + } + + #[tokio::test] + async fn returns_same_value_if_unexpired() -> Result<(), Error> { + let cache = ThreadSafeCachedValue::new(Duration::from_secs(300)); + let mock_token = MockToken::new(); + + let token1 = cache.get(mock_token.get_new_token()).await?; + let token2 = cache.get(mock_token.get_new_token()).await?; + + assert_eq!(token1, 1); + assert_eq!(token2, 1); + Ok(()) + } + + #[tokio::test] + async fn returns_new_value_if_expired() -> Result<(), Error> { + let cache = ThreadSafeCachedValue::new(Duration::from_millis(1)); + let mock_token = MockToken::new(); + + let token1 = cache.get(mock_token.get_new_token()).await?; + // Sleep to ensure the token expires + tokio::time::sleep(Duration::from_secs(1)).await; + let token2 = cache.get(mock_token.get_new_token()).await?; + + assert_eq!(token1, 1); + assert_eq!(token2, 2); + Ok(()) + } +} diff --git a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs index 7059bcd..0043d28 100644 --- a/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs +++ b/azure-kusto-ingest/src/resource_manager/ingest_client_resources.rs @@ -1,13 +1,11 @@ -use std::sync::Arc; - use crate::client_options::QueuedIngestClientOptions; use super::{ - cache::{Cached, ThreadSafeCachedValue}, + cache::ThreadSafeCachedValue, resource_uri::{ClientFromResourceUri, ResourceUri}, utils, RESOURCE_REFRESH_PERIOD, }; -use async_lock::RwLock; + use azure_core::ClientOptions; use azure_kusto_data::{models::TableV1, prelude::KustoClient}; use azure_storage_blobs::prelude::ContainerClient; @@ -99,19 +97,22 @@ impl TryFrom<(&TableV1, &QueuedIngestClientOptions)> for InnerIngestClientResour Ok(Self { ingestion_queues: create_clients_vec( &secured_ready_for_aggregation_queues, - &client_options.queue_service, + &client_options.queue_service_options, ), temp_storage_containers: create_clients_vec( &temp_storage, - &client_options.blob_service, + &client_options.blob_service_options, ), }) } } pub struct IngestClientResources { + /// A client against a Kusto ingestion cluster client: KustoClient, - resources: ThreadSafeCachedValue>, + /// Cache of the ingest client resources + resources_cache: ThreadSafeCachedValue, + /// Options to customise the storage clients client_options: QueuedIngestClientOptions, } @@ -119,7 +120,7 @@ impl IngestClientResources { pub fn new(client: KustoClient, client_options: QueuedIngestClientOptions) -> Self { Self { client, - resources: Arc::new(RwLock::new(Cached::new(None, RESOURCE_REFRESH_PERIOD))), + resources_cache: ThreadSafeCachedValue::new(RESOURCE_REFRESH_PERIOD), client_options, } } @@ -141,29 +142,8 @@ impl IngestClientResources { /// Gets the latest resources either from cache, or fetching from Kusto and updating the cached resources pub async fn get(&self) -> Result { - // first, try to get the resources from the cache by obtaining a read lock - { - let resources = self.resources.read().await; - if !resources.is_expired() { - if let Some(inner_value) = resources.get() { - return Ok(inner_value.clone()); - } - } - } - - // obtain a write lock to refresh the kusto response - let mut resources = self.resources.write().await; - - // check again in case another thread refreshed while we were waiting on the write lock - if !resources.is_expired() { - if let Some(inner_value) = resources.get() { - return Ok(inner_value.clone()); - } - } - - let new_resources = self.query_ingestion_resources().await?; - resources.update(Some(new_resources.clone())); - - Ok(new_resources) + self.resources_cache + .get(self.query_ingestion_resources()) + .await } } diff --git a/azure-kusto-ingest/src/resource_manager/resource_uri.rs b/azure-kusto-ingest/src/resource_manager/resource_uri.rs index 80c5c1c..615f930 100644 --- a/azure-kusto-ingest/src/resource_manager/resource_uri.rs +++ b/azure-kusto-ingest/src/resource_manager/resource_uri.rs @@ -9,6 +9,9 @@ pub enum ResourceUriError { #[error("URI scheme must be 'https', was '{0}'")] InvalidScheme(String), + #[error("URI host must be a domain")] + InvalidHost, + #[error("Object name is missing in the URI")] MissingObjectName, @@ -40,39 +43,49 @@ impl TryFrom<&str> for ResourceUri { fn try_from(uri: &str) -> Result { let parsed_uri = Url::parse(uri)?; - let scheme = match parsed_uri.scheme() { - "https" => "https".to_string(), + match parsed_uri.scheme() { + "https" => {} other_scheme => return Err(ResourceUriError::InvalidScheme(other_scheme.to_string())), }; - let host_string = parsed_uri - .host_str() - .expect("Url::parse should always return a host for a URI"); - - let service_uri = scheme + "://" + host_string; - - let host_string_components = host_string.split_terminator('.').collect::>(); - if host_string_components.len() < 2 { - return Err(ResourceUriError::MissingAccountName); - } - - let account_name = host_string_components[0].to_string(); - - let object_name = match parsed_uri.path().trim_start().trim_start_matches('/') { - "" => return Err(ResourceUriError::MissingObjectName), - name => name.to_string(), + let host_string = match parsed_uri.host() { + Some(url::Host::Domain(host_string)) => host_string, + _ => return Err(ResourceUriError::InvalidHost), }; - let sas_token = match parsed_uri.query() { - Some(query) => query.to_string(), - None => return Err(ResourceUriError::MissingSasToken), + let service_uri = String::from("https://") + host_string; + + // WIBNI: better parsing that this conforms to a storage resource URI, + // perhaps then ResourceUri could take a type like ResourceUri or ResourceUri + let (account_name, _service_endpoint) = host_string + .split_once('.') + .ok_or(ResourceUriError::MissingAccountName)?; + + let object_name = match parsed_uri.path_segments() { + Some(mut path_segments) => { + let object_name = match path_segments.next() { + Some(object_name) if !object_name.is_empty() => object_name, + _ => return Err(ResourceUriError::MissingObjectName), + }; + // Ensure there is only one path segment (i.e. the object name) + if path_segments.next().is_some() { + return Err(ResourceUriError::MissingObjectName); + }; + object_name + } + None => return Err(ResourceUriError::MissingObjectName), }; + + let sas_token = parsed_uri + .query() + .ok_or(ResourceUriError::MissingSasToken)?; + let sas_token = StorageCredentials::sas_token(sas_token)?; Ok(Self { service_uri, - object_name, - account_name, + object_name: object_name.to_string(), + account_name: account_name.to_string(), sas_token, }) } @@ -151,6 +164,10 @@ mod tests { let resource_uri = ResourceUri::try_from(uri); assert!(resource_uri.is_err()); + assert!(matches!( + resource_uri.unwrap_err(), + ResourceUriError::InvalidScheme(_) + )); } #[test] @@ -166,6 +183,31 @@ mod tests { )); } + #[test] + fn invalid_host_ipv4() { + let uri = "https://127.0.0.1/containerobjectname?sas=token"; + let resource_uri = ResourceUri::try_from(uri); + + assert!(resource_uri.is_err()); + assert!(matches!( + resource_uri.unwrap_err(), + ResourceUriError::InvalidHost + )); + } + + #[test] + fn invalid_host_ipv6() { + let uri = "https://[3FFE:FFFF:0::CD30]/containerobjectname?sas=token"; + let resource_uri = ResourceUri::try_from(uri); + println!("{:#?}", resource_uri); + + assert!(resource_uri.is_err()); + assert!(matches!( + resource_uri.unwrap_err(), + ResourceUriError::InvalidHost + )); + } + #[test] fn missing_object_name() { let uri = "https://storageaccountname.blob.core.windows.com/?sas=token";