diff --git a/.env.sample b/.env.sample
index 10323f6b..5875f8a4 100644
--- a/.env.sample
+++ b/.env.sample
@@ -13,6 +13,11 @@
 
 # OPENAI_API_KEY=
 
+############### Search ##############
+
+# MEILI_ADDR=
+# MEILI_MASTER_KEY=
+
 ############## Auth ##############
 
 # Authentik for auth
diff --git a/README.md b/README.md
index 1bdfb960..9b17b074 100644
--- a/README.md
+++ b/README.md
@@ -13,8 +13,8 @@ A self-hostable bookmark-everything app with a touch of AI for the data hoarders
 - 🔖 Chrome plugin for quick bookmarking.
 - 📱 iOS shortcut for bookmarking content from the phone. A minimal mobile app might come later.
 - 💾 Self-hostable first.
+- 🔎 Full text search of all the content stored.
 - [Planned] Archiving the content for offline reading.
-- [Planned] Full text search of all the content stored.
 - [Planned] Store raw images.
 
 **⚠️ This app is under heavy development and it's far from stable.**
@@ -37,6 +37,8 @@ The app is configured with env variables.
 | DATABASE_URL | Not set | The path for the sqlite database. |
 | REDIS_HOST | localhost | The address of redis used by background jobs |
 | REDIS_POST | 6379 | The port of redis used by background jobs |
+| MEILI_ADDR | Not set | The address of meilisearch. If not set, Search will be disabled. |
+| MEILI_MASTER_KEY | Not set | The master key configured for meili. Not needed in development. |
 
 ## Security Considerations
 
@@ -62,6 +64,7 @@ To mitigate those risks, you can do one of the following:
 - [Puppeteer](https://pptr.dev/) for crawling the bookmarks.
 - [OpenAI](https://openai.com/) because AI is so hot right now.
 - [BullMQ](https://bullmq.io) for scheduling the background jobs.
+- [Meilisearch](https://meilisearch.com) for the full content search.
 
 ## Why did I build it?
 
diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml
index 1619a2c9..ebc599dd 100644
--- a/docker/docker-compose.dev.yml
+++ b/docker/docker-compose.dev.yml
@@ -10,6 +10,7 @@ services:
       - 3000:3000
     environment:
       REDIS_HOST: redis
+      MEILI_ADDR: http://meilisearch:7700
       DATABASE_URL: "/data/db.db"
     command:
       - pnpm
@@ -22,6 +23,10 @@ services:
     image: redis:7.2-alpine
     volumes:
       - redis:/data
+  meilisearch:
+    image: getmeili/meilisearch:v1.6
+    volumes:
+      - meilisearch:/meili_data
   workers:
     build:
       dockerfile: Dockerfile.dev
@@ -31,6 +36,7 @@ services:
     working_dir: /app
     environment:
       REDIS_HOST: redis
+      MEILI_ADDR: http://meilisearch:7700
       DATABASE_URL: "/data/db.db"
       # OPENAI_API_KEY: ...
     command:
@@ -55,4 +61,5 @@ services:
 
 volumes:
   redis:
+  meilisearch:
   data:
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 4f7a43f9..b290ffa6 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -8,17 +8,23 @@ services:
       - 3000:3000
     environment:
       REDIS_HOST: redis
+      MEILI_ADDR: http://meilisearch:7700
       DATABASE_URL: "/data/db.db"
   redis:
     image: redis:7.2-alpine
     volumes:
       - redis:/data
+  meilisearch:
+    image: getmeili/meilisearch:v1.6
+    volumes:
+      - meilisearch:/meili_data
   workers:
     image: ghcr.io/mohamedbassem/hoarder-workers:latest
     volumes:
       - data:/data
     environment:
       REDIS_HOST: redis
+      MEILI_ADDR: http://meilisearch:7700
       DATABASE_URL: "/data/db.db"
       # OPENAI_API_KEY: ...
     depends_on:
@@ -27,4 +33,5 @@ services:
 
 volumes:
   redis:
+  meilisearch:
   data:
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 6ca7b89d..1dee4c4d 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -29,6 +29,12 @@ const serverConfig = {
     browserExecutablePath: process.env.BROWSER_EXECUTABLE_PATH, // If not set, the system's browser will be used
     browserUserDataDir: process.env.BROWSER_USER_DATA_DIR,
   },
+  meilisearch: process.env.MEILI_ADDR
+    ? {
+        address: process.env.MEILI_ADDR || "http://127.0.0.1:7700",
+        key: process.env.MEILI_MASTER_KEY || "",
+      }
+    : undefined,
   logLevel: process.env.LOG_LEVEL || "debug",
   demoMode: (process.env.DEMO_MODE ?? "false") == "true",
 };
diff --git a/packages/shared/package.json b/packages/shared/package.json
index 9f6b5498..0b3a8078 100644
--- a/packages/shared/package.json
+++ b/packages/shared/package.json
@@ -4,6 +4,7 @@
   "version": "0.1.0",
   "private": true,
   "dependencies": {
+    "meilisearch": "^0.37.0",
     "winston": "^3.11.0",
     "zod": "^3.22.4"
   },
diff --git a/packages/shared/queues.ts b/packages/shared/queues.ts
index 0155b1e7..a2cbeceb 100644
--- a/packages/shared/queues.ts
+++ b/packages/shared/queues.ts
@@ -27,3 +27,25 @@ export type ZOpenAIRequest = z.infer<typeof zOpenAIRequestSchema>;
 export const OpenAIQueue = new Queue("openai_queue", {
   connection: queueConnectionDetails,
 });
+
+// Search Indexing Worker
+export const zSearchIndexingRequestSchema = z.object({
+  bookmarkId: z.string(),
+  type: z.enum(["index", "delete"]),
+});
+export type ZSearchIndexingRequest = z.infer<
+  typeof zSearchIndexingRequestSchema
+>;
+export const SearchIndexingQueue = new Queue<ZSearchIndexingRequest, void>(
+  "searching_indexing",
+  {
+    connection: queueConnectionDetails,
+    defaultJobOptions: {
+      attempts: 5,
+      backoff: {
+        type: "exponential",
+        delay: 1000,
+      },
+    },
+  },
+);
diff --git a/packages/shared/search.ts b/packages/shared/search.ts
new file mode 100644
index 00000000..3bdf1ad1
--- /dev/null
+++ b/packages/shared/search.ts
@@ -0,0 +1,50 @@
+import { MeiliSearch, Index } from "meilisearch";
+import serverConfig from "./config";
+import { z } from "zod";
+
+export const zBookmarkIdxSchema = z.object({
+  id: z.string(),
+  userId: z.string(),
+  url: z.string().nullish(),
+  title: z.string().nullish(),
+  description: z.string().nullish(),
+  content: z.string().nullish(),
+  tags: z.array(z.string()).default([]),
+});
+
+export type ZBookmarkIdx = z.infer<typeof zBookmarkIdxSchema>;
+
+let searchClient: MeiliSearch | undefined;
+
+if (serverConfig.meilisearch) {
+  searchClient = new MeiliSearch({
+    host: serverConfig.meilisearch.address,
+    apiKey: serverConfig.meilisearch.key,
+  });
+}
+
+const BOOKMARKS_IDX_NAME = "bookmarks";
+
+let idxClient: Index<ZBookmarkIdx> | undefined;
+
+export async function getSearchIdxClient(): Promise<Index<ZBookmarkIdx> | null> {
+  if (idxClient) {
+    return idxClient;
+  }
+  if (!searchClient) {
+    return null;
+  }
+
+  const indicies = await searchClient.getIndexes();
+  let idxFound = indicies.results.find((i) => i.uid == BOOKMARKS_IDX_NAME);
+  if (!idxFound) {
+    const idx = await searchClient.createIndex(BOOKMARKS_IDX_NAME, {
+      primaryKey: "id",
+    });
+    await searchClient.waitForTask(idx.taskUid);
+    idxFound = await searchClient.getIndex(BOOKMARKS_IDX_NAME);
+    const taskId = await idxFound.updateFilterableAttributes(["id", "userId"]);
+    await searchClient.waitForTask(taskId.taskUid);
+  }
+  return idxFound;
+}
diff --git a/packages/web/app/dashboard/components/Sidebar.tsx b/packages/web/app/dashboard/components/Sidebar.tsx
index 7eea6b6d..010ee103 100644
--- a/packages/web/app/dashboard/components/Sidebar.tsx
+++ b/packages/web/app/dashboard/components/Sidebar.tsx
@@ -1,4 +1,12 @@
-import { Archive, Star, Tag, Home, PackageOpen, Settings } from "lucide-react";
+import {
+  Archive,
+  Star,
+  Tag,
+  Home,
+  PackageOpen,
+  Settings,
+  Search,
+} from "lucide-react";
 import { redirect } from "next/navigation";
 import SidebarItem from "./SidebarItem";
 import { getServerAuthSession } from "@/server/auth";
@@ -6,6 +14,7 @@ import Link from "next/link";
 import SidebarProfileOptions from "./SidebarProfileOptions";
 import { Separator } from "@/components/ui/separator";
 import AllLists from "./AllLists";
+import serverConfig from "@hoarder/shared/config";
 
 export default async function Sidebar() {
   const session = await getServerAuthSession();
@@ -34,6 +43,13 @@
           name="Favourites"
           path="/dashboard/bookmarks/favourites"
         />
+        {serverConfig.meilisearch && (
+          <SidebarItem
+            logo={<Search />}
+            name="Search"
+            path="/dashboard/search"
+          />
+        )}
         <SidebarItem
           logo={<Archive />}
           name="Archive"
diff --git a/packages/web/app/dashboard/search/page.tsx b/packages/web/app/dashboard/search/page.tsx
new file mode 100644
index 00000000..1c26608e
--- /dev/null
+++ b/packages/web/app/dashboard/search/page.tsx
@@ -0,0 +1,93 @@
+"use client";
+
+import { api } from "@/lib/trpc";
+import { usePathname, useRouter, useSearchParams } from "next/navigation";
+import BookmarksGrid from "../bookmarks/components/BookmarksGrid";
+import { Input } from "@/components/ui/input";
+import Loading from "../bookmarks/loading";
+import { keepPreviousData } from "@tanstack/react-query";
+import { Search } from "lucide-react";
+import { ActionButton } from "@/components/ui/action-button";
+import { Suspense, useRef } from "react";
+
+function SearchComp() {
+  const router = useRouter();
+  const pathname = usePathname();
+  const searchParams = useSearchParams();
+  const searchQuery = searchParams.get("q") || "";
+
+  const { data, isPending, isPlaceholderData, error } =
+    api.bookmarks.searchBookmarks.useQuery(
+      {
+        text: searchQuery,
+      },
+      {
+        placeholderData: keepPreviousData,
+      },
+    );
+
+  if (error) {
+    throw error;
+  }
+
+  const inputRef: React.MutableRefObject<HTMLInputElement | null> =
+    useRef(null);
+
+  let timeoutId: NodeJS.Timeout | undefined;
+
+  // Debounce user input
+  const doSearch = () => {
+    if (!inputRef.current) {
+      return;
+    }
+    router.replace(`${pathname}?q=${inputRef.current.value}`);
+  };
+
+  const onInputChange = () => {
+    if (timeoutId) {
+      clearTimeout(timeoutId);
+    }
+    timeoutId = setTimeout(() => {
+      doSearch();
+    }, 200);
+  };
+
+  return (
+    <div className="flex flex-col gap-3">
+      <div className="flex gap-2">
+        <Input
+          ref={inputRef}
+          placeholder="Search"
+          defaultValue={searchQuery}
+          onChange={onInputChange}
+        />
+        <ActionButton
+          loading={isPending || isPlaceholderData}
+          onClick={doSearch}
+        >
+          <Search /> Search
+        </ActionButton>
+      </div>
+      {data ? (
+        <BookmarksGrid
+          query={{ ids: data.bookmarks.map((b) => b.id) }}
+          bookmarks={data.bookmarks}
+        />
+      ) : (
+        <Loading />
+      )}
+    </div>
+  );
+}
+
+export default function SearchPage() {
+  return (
+    <Suspense>
+      <SearchComp />
+    </Suspense>
+  );
+}
diff --git a/packages/web/package.json b/packages/web/package.json
index 7687704f..b25fc2e9 100644
--- a/packages/web/package.json
+++ b/packages/web/package.json
@@ -41,6 +41,7 @@
     "drizzle-orm": "^0.29.4",
     "install": "^0.13.0",
     "lucide-react": "^0.322.0",
+    "meilisearch": "^0.37.0",
     "next": "14.1.0",
     "next-auth": "^4.24.5",
     "prettier": "^3.2.5",
diff --git a/packages/web/server/api/routers/bookmarks.ts b/packages/web/server/api/routers/bookmarks.ts
index 8b59f1ef..73818508 100644
--- a/packages/web/server/api/routers/bookmarks.ts
+++ b/packages/web/server/api/routers/bookmarks.ts
@@ -1,5 +1,6 @@
 import { z } from "zod";
 import { Context, authedProcedure, router } from "../trpc";
+import { getSearchIdxClient } from "@hoarder/shared/search";
 import {
   ZBookmark,
   ZBookmarkContent,
@@ -17,7 +18,11 @@
   bookmarks,
   tagsOnBookmarks,
 } from "@hoarder/db/schema";
-import { LinkCrawlerQueue, OpenAIQueue } from "@hoarder/shared/queues";
+import {
+  LinkCrawlerQueue,
+  OpenAIQueue,
+  SearchIndexingQueue,
+} from "@hoarder/shared/queues";
 import { TRPCError, experimental_trpcMiddleware } from "@trpc/server";
 import { and, desc, eq, inArray } from "drizzle-orm";
 import { ZBookmarkTags } from "@/lib/types/api/tags";
@@ -172,6 +177,10 @@
           break;
         }
       }
+      SearchIndexingQueue.add("search_indexing", {
+        bookmarkId: bookmark.id,
+        type: "index",
+      });
 
       return bookmark;
     }),
@@ -224,6 +233,10 @@
           message: "Bookmark not found",
         });
       }
+      SearchIndexingQueue.add("search_indexing", {
+        bookmarkId: input.bookmarkId,
+        type: "index",
+      });
     }),
 
   deleteBookmark: authedProcedure
@@ -238,6 +251,10 @@
             eq(bookmarks.id, input.bookmarkId),
           ),
         );
+      SearchIndexingQueue.add("search_indexing", {
+        bookmarkId: input.bookmarkId,
+        type: "delete",
+      });
     }),
   recrawlBookmark: authedProcedure
     .input(z.object({ bookmarkId: z.string() }))
@@ -280,6 +297,49 @@
     return toZodSchema(bookmark);
   }),
 
+  searchBookmarks: authedProcedure
+    .input(
+      z.object({
+        text: z.string(),
+      }),
+    )
+    .output(zGetBookmarksResponseSchema)
+    .query(async ({ input, ctx }) => {
+      const client = await getSearchIdxClient();
+      if (!client) {
+        throw new TRPCError({
+          code: "INTERNAL_SERVER_ERROR",
+          message: "Search functionality is not configured",
+        });
+      }
+      const resp = await client.search(input.text, {
+        filter: [`userId = '${ctx.user.id}'`],
+      });
+
+      if (resp.hits.length == 0) {
+        return { bookmarks: [] };
+      }
+      const results = await ctx.db.query.bookmarks.findMany({
+        where: and(
+          eq(bookmarks.userId, ctx.user.id),
+          inArray(
+            bookmarks.id,
+            resp.hits.map((h) => h.id),
+          ),
+        ),
+        with: {
+          tagsOnBookmarks: {
+            with: {
+              tag: true,
+            },
+          },
+          link: true,
+          text: true,
+        },
+      });
+
+      return { bookmarks: results.map(toZodSchema) };
+    }),
   getBookmarks: authedProcedure
     .input(zGetBookmarksRequestSchema)
     .output(zGetBookmarksResponseSchema)
diff --git a/packages/workers/crawler.ts b/packages/workers/crawler.ts
index bfb46218..7be014a7 100644
--- a/packages/workers/crawler.ts
+++ b/packages/workers/crawler.ts
@@ -2,6 +2,7 @@ import logger from "@hoarder/shared/logger";
 import {
   LinkCrawlerQueue,
   OpenAIQueue,
+  SearchIndexingQueue,
   ZCrawlLinkRequest,
   queueConnectionDetails,
   zCrawlLinkRequestSchema,
@@ -30,6 +31,7 @@ import assert from "assert";
 import serverConfig from "@hoarder/shared/config";
 import { bookmarkLinks } from "@hoarder/db/schema";
 import { eq } from "drizzle-orm";
+import { SearchIndexingWorker } from "./search";
 
 const metascraperParser = metascraper([
   metascraperReadability(),
@@ -172,4 +174,10 @@ async function runCrawler(job: Job) {
   OpenAIQueue.add("openai", {
     bookmarkId,
   });
+
+  // Update the search index
+  SearchIndexingQueue.add("search_indexing", {
+    bookmarkId,
+    type: "index",
+  });
 }
diff --git a/packages/workers/index.ts b/packages/workers/index.ts
index 67be7af2..295eeaef 100644
--- a/packages/workers/index.ts
+++ b/packages/workers/index.ts
@@ -1,14 +1,16 @@
 import "dotenv/config";
 import { CrawlerWorker } from "./crawler";
 import { OpenAiWorker } from "./openai";
+import { SearchIndexingWorker } from "./search";
 
 async function main() {
-  const [crawler, openai] = [
+  const [crawler, openai, search] = [
     await CrawlerWorker.build(),
     await OpenAiWorker.build(),
+    await SearchIndexingWorker.build(),
   ];
 
-  await Promise.all([crawler.run(), openai.run()]);
+  await Promise.all([crawler.run(), openai.run(), search.run()]);
 }
 
 main();
diff --git a/packages/workers/openai.ts b/packages/workers/openai.ts
index 8f85c4ec..cc456616 100644
--- a/packages/workers/openai.ts
+++ b/packages/workers/openai.ts
@@ -3,6 +3,7 @@ import logger from "@hoarder/shared/logger";
 import serverConfig from "@hoarder/shared/config";
 import {
   OpenAIQueue,
+  SearchIndexingQueue,
   ZOpenAIRequest,
   queueConnectionDetails,
   zOpenAIRequestSchema,
@@ -159,13 +160,16 @@ async function connectTags(bookmarkId: string, tagIds: string[]) {
   if (tagIds.length == 0) {
     return;
   }
-  await db.insert(tagsOnBookmarks).values(
-    tagIds.map((tagId) => ({
-      tagId,
-      bookmarkId,
-      attachedBy: "ai" as const,
-    })),
-  );
+  await db
+    .insert(tagsOnBookmarks)
+    .values(
+      tagIds.map((tagId) => ({
+        tagId,
+        bookmarkId,
+        attachedBy: "ai" as const,
+      })),
+    )
+    .onConflictDoNothing();
 }
 
 async function runOpenAI(job: Job) {
@@ -203,4 +207,10 @@
 
   const tagIds = await createTags(tags, bookmark.userId);
   await connectTags(bookmarkId, tagIds);
+
+  // Update the search index
+  SearchIndexingQueue.add("search_indexing", {
+    bookmarkId,
+    type: "index",
+  });
 }
diff --git a/packages/workers/search.ts b/packages/workers/search.ts
new file mode 100644
index 00000000..a628b2ed
--- /dev/null
+++ b/packages/workers/search.ts
@@ -0,0 +1,115 @@
+import { db } from "@hoarder/db";
+import logger from "@hoarder/shared/logger";
+import { getSearchIdxClient } from "@hoarder/shared/search";
+import {
+  SearchIndexingQueue,
+  ZSearchIndexingRequest,
+  queueConnectionDetails,
+  zSearchIndexingRequestSchema,
+} from "@hoarder/shared/queues";
+import { Job } from "bullmq";
+import { Worker } from "bullmq";
+import { bookmarks } from "@hoarder/db/schema";
+import { eq } from "drizzle-orm";
+
+export class SearchIndexingWorker {
+  static async build() {
+    logger.info("Starting search indexing worker ...");
+    const worker = new Worker<ZSearchIndexingRequest, void>(
+      SearchIndexingQueue.name,
+      runSearchIndexing,
+      {
+        connection: queueConnectionDetails,
+        autorun: false,
+      },
+    );
+
+    worker.on("completed", (job) => {
+      const jobId = job?.id || "unknown";
+      logger.info(`[search][${jobId}] Completed successfully`);
+    });
+
+    worker.on("failed", (job, error) => {
+      const jobId = job?.id || "unknown";
+      logger.error(`[search][${jobId}] openai job failed: ${error}`);
+    });
+
+    return worker;
+  }
+}
+
+async function runIndex(
+  searchClient: NonNullable<Awaited<ReturnType<typeof getSearchIdxClient>>>,
+  bookmarkId: string,
+) {
+  const bookmark = await db.query.bookmarks.findFirst({
+    where: eq(bookmarks.id, bookmarkId),
+    with: {
+      link: true,
+      text: true,
+      tagsOnBookmarks: {
+        with: {
+          tag: true,
+        },
+      },
+    },
+  });
+
+  if (!bookmark) {
+    throw new Error(`Bookmark ${bookmarkId} not found`);
+  }
+
+  searchClient.addDocuments([
+    {
+      id: bookmark.id,
+      userId: bookmark.userId,
+      ...(bookmark.link
+        ? {
+            url: bookmark.link.url,
+            title: bookmark.link.title,
+            description: bookmark.link.description,
+          }
+        : undefined),
+      ...(bookmark.text ? { content: bookmark.text.text } : undefined),
+      tags: bookmark.tagsOnBookmarks.map((t) => t.tag.name),
+    },
+  ]);
+}
+
+async function runDelete(
+  searchClient: NonNullable<Awaited<ReturnType<typeof getSearchIdxClient>>>,
+  bookmarkId: string,
+) {
+  await searchClient.deleteDocument(bookmarkId);
+}
+
+async function runSearchIndexing(job: Job) {
+  const jobId = job.id || "unknown";
+
+  const request = zSearchIndexingRequestSchema.safeParse(job.data);
+  if (!request.success) {
+    throw new Error(
+      `[search][${jobId}] Got malformed job request: ${request.error.toString()}`,
+    );
+  }
+
+  const searchClient = await getSearchIdxClient();
+  if (!searchClient) {
+    logger.debug(
+      `[search][${jobId}] Search is not configured, nothing to do now`,
+    );
+    return;
+  }
+
+  const bookmarkId = request.data.bookmarkId;
+  switch (request.data.type) {
+    case "index": {
+      await runIndex(searchClient, bookmarkId);
+      break;
+    }
+    case "delete": {
+      await runDelete(searchClient, bookmarkId);
+      break;
+    }
+  }
+}
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index bfec96d4..5b2ab5fa 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -173,6 +173,9 @@ importers:
 
   packages/shared:
     dependencies:
+      meilisearch:
+        specifier: ^0.37.0
+        version: 0.37.0
       winston:
         specifier: ^3.11.0
         version: 3.11.0
@@ -272,6 +275,9 @@ importers:
       lucide-react:
        specifier: ^0.322.0
        version: 0.322.0(react@18.2.0)
+      meilisearch:
+        specifier: ^0.37.0
+        version: 0.37.0
       next:
        specifier: 14.1.0
        version: 14.1.0(react-dom@18.2.0)(react@18.2.0)
@@ -4036,6 +4042,14 @@
       luxon: 3.4.4
     dev: false
 
+  /cross-fetch@3.1.8:
+    resolution: {integrity: sha512-cvA+JwZoU0Xq+h6WkMvAUqPEYy92Obet6UdKLfW60qn99ftItKjB5T+BkyWOFWe2pUyfQ+IJHmpOTznqk1M6Kg==}
+    dependencies:
+      node-fetch: 2.7.0
+    transitivePeerDependencies:
+      - encoding
+    dev: false
+
   /cross-fetch@4.0.0:
     resolution: {integrity: sha512-e4a5N8lVvuLgAWgnCrLr2PP0YyDOTHa9H/Rj54dirp61qXnNq46m82bRhNqIA5VccJtWBvPTFRV3TtvHUKPB1g==}
     dependencies:
@@ -6633,6 +6647,14 @@
       '@types/mdast': 4.0.3
     dev: false
 
+  /meilisearch@0.37.0:
+    resolution: {integrity: sha512-LdbK6JmRghCawrmWKJSEQF0OiE82md+YqJGE/U2JcCD8ROwlhTx0KM6NX4rQt0u0VpV0QZVG9umYiu3CSSIJAQ==}
+    dependencies:
+      cross-fetch: 3.1.8
+    transitivePeerDependencies:
+      - encoding
+    dev: false
+
   /memoize-one@6.0.0:
     resolution: {integrity: sha512-rkpe71W0N0c0Xz6QD0eJETuWAJGnJ9afsl1srmwPrI+yBCkge5EycXXbYRyvL29zZVUWQCY7InPRCv3GDXuZNw==}
     dev: false