Skip to content

Commit

Permalink
added __sub__ for OrientationHash and documented python code
Browse files Browse the repository at this point in the history
  • Loading branch information
GrayHat12 committed Jun 29, 2024
1 parent 82285c0 commit 8f5ec94
Show file tree
Hide file tree
Showing 5 changed files with 210 additions and 27 deletions.
7 changes: 4 additions & 3 deletions python/pymhash/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from ._pymhash import *
from .lib import *

from . import lib
from . import extras
from .lib import PymHash

# Re-use the docstring of the `_pymhash` module as the package docstring.
__doc__ = _pymhash.__doc__ # type: ignore
# Public API: every name exported by `_pymhash` plus the helper modules and PymHash.
__all__ = [*_pymhash.__all__, 'lib', 'extras', 'PymHash'] # type: ignore
34 changes: 30 additions & 4 deletions python/pymhash/__init__.pyi
Original file line number Diff line number Diff line change
@@ -1,20 +1,46 @@
"""
PymHash Hashing
Base Hash Implementation (In Rust)
"""

from typing import List
from .lib import *

class OrientationHash:
    """Class OrientationHash represents an orientation of image"""

    def __init__(self, hash_value: List[List[bool]]) -> None:
        """
        :param hash_value: boolean matrix
        """
        ...

    def __str__(self) -> str: ...

    def __hash__(self) -> int: ...

    def __sub__(self, value: "OrientationHash") -> float:
        """
        Subtract another OrientationHash from this one

        :param value: OrientationHash to compare against
        :return: float
        """
        ...

    # Accepts any object, matching the signature of ``object.__eq__``.
    def __eq__(self, value: object) -> bool: ...

    @classmethod
    def from_str(cls, hash_str: str) -> "OrientationHash":
        """
        Convert Hash string to OrientationHash object

        :param hash_str: Hash String
        :return: OrientationHash
        """
        ...

    def to_str(self) -> str:
        """
        Convert OrientationHash object to str

        :return: str
        """
        ...

    def hash_size(self) -> int: ...

    def unique_hash(self) -> int: ...

class ImageHash:
"""Class Representing a single ImageHash containing 4 orientations of the image"""

VERSION: int = ...
"""Hash version"""

def __init__(self) -> None: ...
def add_hash(self, hash: OrientationHash) -> None: ...
Expand Down
49 changes: 40 additions & 9 deletions python/pymhash/extras.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,64 @@
from typing import List
"""
PymHash extras
Extra functionality for Image Comparisons
"""

from typing import Any, Generator, List
from pathlib import Path
from .lib import PymHash, ImageFileMetadata
from .lib import PymHash, ImageFileMetadata, PathLike

# tqdm is an optional dependency: use it for progress bars when available,
# otherwise fall back to a pass-through with a compatible call shape.
try:
    from tqdm import tqdm # type: ignore
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """
        Fallback used when tqdm is not installed

        Extra positional/keyword arguments (e.g. ``desc=``) are accepted and ignored
        so call sites written for the real tqdm keep working.

        :param iterable: iterable to wrap
        :return: the same iterable, unchanged
        """
        return iterable

# NOTE(review): '.web', '.hei' and '.av' look like shortened forms of
# '.webp', '.heic'/'.heif' and '.avif' — confirm the intended suffixes before
# changing them; they are kept verbatim here. Duplicate entries were removed.
IMAGE_EXTENSIONS = [
    '.bmp', '.dib', '.jpeg', '.jpg', '.jp2', '.png', '.pbm', '.pgm', '.ppm',
    '.sr', '.ras', '.tiff', '.tif', '.exr', '.jxr', '.pfm', '.pds', '.viff',
    '.xbm', '.xpm', '.dds', '.eis', '.mng', '.web', '.hei', '.av',
]
"""Common image extensions that can be parsed by cv2"""

def drilldown(targeted_directory: PathLike, extensions: List[str] = IMAGE_EXTENSIONS) -> Generator[str, Any, None]:
    """
    Generator Function

    Yields all valid image paths inside target directory and its subdirectories

    :param targeted_directory: Path of a root folder
    :param extensions: Acceptable image extensions (compared case-insensitively)
    :return: Generator over all valid image paths inside the target and its subdirectories
    :raises ValueError: if the target directory does not exist; because this is a
        generator, the check runs when iteration starts, not at call time
    """
    folder_path: Path = Path(targeted_directory)
    if not folder_path.exists():
        raise ValueError(f"Folder path {targeted_directory} does not exist.")

    # Normalise once: suffixes are lower-cased before the lookup, so the
    # accepted extensions must be lower-cased too or they could never match.
    accepted_suffixes = {extension.lower() for extension in extensions}

    for file_path in folder_path.glob('**/*'):
        if file_path.is_file() and file_path.suffix.lower() in accepted_suffixes:
            yield str(file_path)

def get_duplicates(
folder_path: str,
valid_image_extensions: List[str] = IMAGE_EXTENSIONS,
hash_size: int = 8,
highfreq_factor: int = 4
) -> List[List[PymHash[ImageFileMetadata]]]:
"""
Find Duplicate Image Groups
Return all duplicate image groups inside target directory and its subdirectories
def get_duplicates(folder_path: str):
image_paths = drilldown(folder_path)
:param folder_path: Path of a root folder
:param valid_image_extensions: List of valid image extensions to consider
:param hash_size: hash size
:param highfreq_factor: factor to amplify higher frequencies with
:return: Groups of all duplicate PymHash images
"""
image_paths = drilldown(folder_path, extensions=valid_image_extensions)

# List of similarity groups
similarity_groups: List[List[PymHash[ImageFileMetadata]]] = []

for image_path in tqdm(image_paths):
metadata = PymHash.from_image(image_path)
metadata = PymHash.from_image(image_path, hash_size=hash_size, highfreq_factor=highfreq_factor)

inserted = False

Expand Down
143 changes: 132 additions & 11 deletions python/pymhash/lib.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
"""
Core PymHash Module: Hash your images
Easily hash and compare images
"""

import os
import cv2
import exifread
Expand All @@ -13,25 +19,57 @@

@dataclass
class Metadata:
    """Base record holding the metadata common to every parsed image"""

    width: int
    """Image width"""

    height: int
    """Image height"""

    channels: int
    """Number of channels in the image"""

    hash: ImageHash
    """ImageHash computed for the image"""

@dataclass
class ImageBufferMetadata(Metadata):
    """Metadata record for an image that was read from a binary stream"""

    exiftags: dict
    """EXIF tags extracted from the image"""

@dataclass
class ImageFileMetadata(ImageBufferMetadata):
    """Metadata record for an image that was read from a file path"""

    size: int
    """File size as reported by `os.path.getsize()`"""

    extension: str
    """Extension of the image file"""

    filename: str
    """Name of the image file"""

    filepath: str
    """Path of the image file"""

# Type variable for the metadata payload carried by PymHash; bounded to
# Metadata, so only Metadata or one of its subclasses is accepted.
T = TypeVar("T", bound=Metadata)

def get_image_hash(image: cv2.typing.MatLike, hash_size: int, highfreq_factor: int) -> ImageHash: # type: ignore
"""
Get image hash
Calculate image hash
:param image: Matrix Like cv2 image
:param hash_size: hash size
:param highfreq_factor: factor to amplify higher frequencies with
:return: ImageHash
"""

img_size = hash_size * highfreq_factor

sample = cv2.resize(image, (img_size, img_size))
Expand All @@ -56,6 +94,14 @@ def get_image_hash(image: cv2.typing.MatLike, hash_size: int, highfreq_factor: i
return image_hash

def _get_exiftags(buffer: BinaryIO):
"""
Get exif tags
Get exiftags from image
:param buffer: Image buffer
:return: dictionary containing exiftags
"""
exif_tags = {}
try:
for tag, value in exifread.process_file(buffer).items():
Expand All @@ -66,6 +112,16 @@ def _get_exiftags(buffer: BinaryIO):
return exif_tags

def _metadata_from_image_buffer(buffer: BinaryIO, hash_size: int, highfreq_factor: int):
"""
Metadata from image buffer
Get ImageBufferMetadata from image buffer
:param buffer: Image buffer
:param hash_size: hash size
:param highfreq_factor: factor to amplify higher frequencies with
:return: ImageBufferMetadata
"""
file_bytes = np.asarray(bytearray(buffer.read()), dtype=np.uint8)
image = cv2.imdecode(file_bytes, cv2.IMREAD_COLOR)
buffer.seek(0)
Expand All @@ -78,6 +134,16 @@ def _metadata_from_image_buffer(buffer: BinaryIO, hash_size: int, highfreq_facto
)

def _metadata_from_image_path(path: Union[str, Path], hash_size: int, highfreq_factor: int):
"""
Metadata from image filepath
Get ImageFileMetadata from image path
:param path: Image path
:param hash_size: hash size
:param highfreq_factor: factor to amplify higher frequencies with
:return: ImageFileMetadata
"""
if not os.path.exists(path) or not os.path.isfile(path):
raise ValueError(f"No file at {path=}")
filename, extension = os.path.splitext(path)
Expand All @@ -97,23 +163,63 @@ def _metadata_from_image_path(path: Union[str, Path], hash_size: int, highfreq_f
)

class PymHash(Generic[T]):
"""Class representing a parsed Image"""

def __init__(self, metadata: T) -> None:
"""
:param metadata: metadata that is parsed
"""
self.metadata = metadata

@overload
@classmethod
def from_image(cls, image: PathLike, hash_size: int = ..., highfreq_factor: int = ...) -> 'PymHash[ImageFileMetadata]':
    """
    Instantiate PymHash using an image path

    :param image: Image path
    :param hash_size: hash size
    :param highfreq_factor: factor to amplify higher frequencies with
    :return: PymHash[ImageFileMetadata]
    """
    ...

@overload
@classmethod
def from_image(cls, image: BinaryIO, hash_size: int = ..., highfreq_factor: int = ...) -> 'PymHash[ImageBufferMetadata]':
    """
    Instantiate PymHash using an image buffer

    :param image: Image buffer
    :param hash_size: hash size
    :param highfreq_factor: factor to amplify higher frequencies with
    :return: PymHash[ImageBufferMetadata]
    """
    ...

@overload
@classmethod
def from_image(cls, image: cv2.typing.MatLike, hash_size: int = ..., highfreq_factor: int = ...) -> 'PymHash[Metadata]':
    """
    Instantiate PymHash using an image cv2 matrix

    :param image: Image matrix
    :param hash_size: hash size
    :param highfreq_factor: factor to amplify higher frequencies with
    :return: PymHash[Metadata]
    """
    ...

@classmethod
def from_image(cls, image: Union[PathLike, BinaryIO, cv2.typing.MatLike], hash_size: int = 8, highfreq_factor: int = 4):
"""
Instantiate PymHash using an image path/buffer/matrix
:param image: Image path/buffer/matrix
:param hash_size: hash size
:param highfreq_factor: factor to amplify higher frequencies with
:return: PymHash
"""
if isinstance(image, (str, Path)):
return cls(_metadata_from_image_path(image, hash_size=hash_size, highfreq_factor=highfreq_factor))
elif isinstance(cls, BinaryIO):
Expand All @@ -129,26 +235,41 @@ def from_image(cls, image: Union[PathLike, BinaryIO, cv2.typing.MatLike], hash_s
raise ValueError("Invalid image")

def __eq__(self, other: 'PymHash[Metadata]'):
"""
Compare PymHash objects
Two PymHash objects are equal if their respective ImageHash(es) satisfy equality
:param other: PymHash
:return: bool
"""
if not isinstance(other, PymHash):
return False
return self.metadata.hash == other.metadata.hash

def similar(self, other: 'PymHash[Metadata]', threshold: float = 0.01):
"""
Compare PymHash objects
Two PymHash objects are similar if the difference between their respective ImageHash(es) is less than the provided threshold
:param other: PymHash
:param threshold: float
:return: bool
"""
return (self.metadata.hash - other.metadata.hash) <= threshold

def to_dict(self):
    """
    Convert PymHash to dict

    Every dataclass field of ``self.metadata`` becomes a key; ImageHash values
    are replaced by their ``to_str()`` form, all other values are copied as-is.

    :return: dict keyed by metadata field name
    """
    def _plain(value):
        # ImageHash objects are stored in their string form.
        return value.to_str() if isinstance(value, ImageHash) else value

    return {
        entry.name: _plain(getattr(self.metadata, entry.name))
        for entry in fields(self.metadata)
    }

@staticmethod
def asdict(data):
if isinstance(data, PymHash):
return data.to_dict()
else:
return data
return dictionary
4 changes: 4 additions & 0 deletions src/hash/orientation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ impl OrientationHash {
return self.to_str();
}

// Python `-` operator: clones both operands, calls `sub` on them and returns
// the unwrapped f32 wrapped in `Ok`.
// NOTE(review): `unwrap()` panics if `sub` does not produce a value; since this
// method already returns `PyResult`, consider converting that failure into a
// `PyErr` instead — the error type of `sub` is not visible here, confirm first.
fn __sub__(&self, other: &Self) -> PyResult<f32> {
return Ok(self.clone().sub(other.clone()).unwrap());
}

fn __hash__(&self) -> u64 {
let mut hasher = DefaultHasher::new();
self.hash(&mut hasher);
Expand Down

0 comments on commit 8f5ec94

Please sign in to comment.