Link processing #173

Draft · wants to merge 7 commits into main
64 changes: 62 additions & 2 deletions .gitignore
@@ -1,7 +1,7 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.

# dependencies
/node_modules
node_modules
/.pnp
.pnp.js
.yarn/install-state.gz
@@ -40,4 +40,64 @@ next-env.d.ts
amplify_outputs*
amplifyconfiguration*

/cdk.out
# Optional eslint cache
.eslintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.env.test

# parcel-bundler cache (https://parceljs.org/)
.cache

# Next.js build output
.next

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test

# End of https://www.toptal.com/developers/gitignore/api/node

cdk.out

*-outputs.json
10 changes: 9 additions & 1 deletion amplify/backend.ts
@@ -6,6 +6,7 @@ import { TableNotifications } from './constructs/table-notifications';
import * as sns from 'aws-cdk-lib/aws-sns';
import * as snsSubscriptions from 'aws-cdk-lib/aws-sns-subscriptions';
import * as iam from 'aws-cdk-lib/aws-iam';
import { ProcessLinks } from './extract/resource';

/**
* @see https://docs.amplify.aws/react/build-a-backend/ to add storage, functions, and more
@@ -72,4 +73,11 @@ new TableNotifications(links, 'TableNotifications', {
message: 'A new link with the url "<$.dynamodb.NewImage.url.S>" has been added to https://cdk.dev - check it out now'
})

export default backend;

// === Scraper ===
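// The pipeline lives in its own stack. ProcessLinks (amplify/extract/resource.ts)
// subscribes to the stream of the generated LinkSuggestion table; dataResources
// is assumed to come from the elided part of this file (backend.data.resources).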

const scraper = backend.createStack('scraper');
new ProcessLinks(scraper, 'ProcessLinks', {
  table: dataResources.tables['LinkSuggestion'],
});
53 changes: 53 additions & 0 deletions amplify/extract/converter/index.ts
@@ -0,0 +1,53 @@
import { S3Client, GetObjectCommand, PutObjectCommand } from "@aws-sdk/client-s3";
import TurndownService from "turndown";
import { Readable } from "stream";
import { Handler } from "aws-lambda";
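// Converter step of the pipeline: read `${id}.html` from the bucket, turn the
// HTML into Markdown with Turndown, and write it back as `${id}.md`.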

const s3 = new S3Client({ region: process.env.AWS_REGION });
const turndownService = new TurndownService();

const streamToString = (stream: Readable): Promise<string> =>
new Promise((resolve, reject) => {
    const chunks: Uint8Array[] = [];
stream.on("data", (chunk) => chunks.push(chunk));
stream.on("error", reject);
stream.on("end", () => resolve(Buffer.concat(chunks).toString("utf-8")));
});

export const handler: Handler = async (event: { id: string }) => {
const id = event.id;
const bucketName = process.env.BUCKET_NAME;

try {
// Get HTML file from S3
const getObjectParams = {
Bucket: bucketName,
Key: `${id}.html`,
};
const data = await s3.send(new GetObjectCommand(getObjectParams));
const htmlContent = await streamToString(data.Body as Readable);

// Convert HTML to Markdown
const markdownContent = turndownService.turndown(htmlContent);

// Save Markdown to S3
const putObjectParams = {
Bucket: bucketName,
Key: `${id}.md`,
Body: markdownContent,
ContentType: "text/markdown",
};
await s3.send(new PutObjectCommand(putObjectParams));

return {
statusCode: 200,
body: JSON.stringify({ message: "Markdown file saved to S3", id: id }),
};
} catch (error) {
console.error("Error:", error);
return {
statusCode: 500,
body: JSON.stringify({ message: "An error occurred", id: id }),
};
}
};
65 changes: 65 additions & 0 deletions amplify/extract/resource.ts
@@ -0,0 +1,65 @@
import * as dynamodb from 'aws-cdk-lib/aws-dynamodb';
import { RemovalPolicy } from 'aws-cdk-lib';
import * as pipes from 'aws-cdk-lib/aws-pipes';
import * as iam from 'aws-cdk-lib/aws-iam';
import * as logs from 'aws-cdk-lib/aws-logs';
import * as sfn from 'aws-cdk-lib/aws-stepfunctions';
import * as s3 from 'aws-cdk-lib/aws-s3';
import { Construct } from 'constructs';
import { ExtractContentStateMachine } from './sfn';


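/**
 * Connects the table's DynamoDB stream to the state machine through an
 * EventBridge Pipe: each new stream record starts an execution
 * (fire-and-forget, reading from LATEST), with pipe logs retained for a week.
 */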
class DdbToSfnPipe extends Construct {
constructor(scope: Construct, id: string, table: dynamodb.ITable, stateMachine: sfn.IStateMachine) {
super(scope, id);

const pipeRole = new iam.Role(this, 'PipeRole', {
assumedBy: new iam.ServicePrincipal('pipes.amazonaws.com'),
});

table.grantStreamRead(pipeRole);
stateMachine.grantStartExecution(pipeRole);

const logGroup = new logs.LogGroup(this, 'PipeLogGroup', {
retention: logs.RetentionDays.ONE_WEEK,
});

new pipes.CfnPipe(this, 'DdbToSfnPipe', {
roleArn: pipeRole.roleArn,
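      // tableStreamArn is asserted non-null: the pipe needs a DynamoDB stream,
      // which is assumed to be enabled on the table passed in.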
source: table.tableStreamArn!,
sourceParameters: {
dynamoDbStreamParameters: {
startingPosition: 'LATEST',
},
},
target: stateMachine.stateMachineArn,
targetParameters: {
stepFunctionStateMachineParameters: {
invocationType: 'FIRE_AND_FORGET',
},
},
logConfiguration: {
cloudwatchLogsLogDestination: {
logGroupArn: logGroup.logGroupArn,
},
level: 'INFO',
},
});
}
}

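/**
 * Top-level construct for link processing: an S3 bucket for the scraped
 * artifacts, the content-extraction state machine, and the pipe feeding it
 * from the table's stream.
 */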
export class ProcessLinks extends Construct {
constructor(scope: Construct, id: string, props: { table: dynamodb.ITable }) {
super(scope, id);

const { table } = props;

const bucket = new s3.Bucket(this, 'AssetsBucket', {
removalPolicy: RemovalPolicy.DESTROY,
autoDeleteObjects: true,
});

const stateMachine = new ExtractContentStateMachine(this, 'ExtractContentStateMachine', bucket);
new DdbToSfnPipe(this, 'DdbToSfnPipe', table, stateMachine.stateMachine);
}
}
1 change: 1 addition & 0 deletions amplify/extract/scraper/.dockerignore
@@ -0,0 +1 @@
node_modules
13 changes: 13 additions & 0 deletions amplify/extract/scraper/Dockerfile
@@ -0,0 +1,13 @@
FROM --platform=linux/amd64 public.ecr.aws/lambda/nodejs:18 AS build-image

WORKDIR ${LAMBDA_TASK_ROOT}

COPY . .

# @sparticuz/chromium takes care of installing the browser
ENV PUPPETEER_SKIP_DOWNLOAD=true

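# Install dependencies and compile the TypeScript handler inside the image so
# that native dependencies are built for the Lambda environment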
RUN npm install
RUN npx tsc

CMD [ "index.handler" ]
1 change: 1 addition & 0 deletions amplify/extract/scraper/README.md
@@ -0,0 +1 @@
Inspired by https://github.com/thejoeosborne/puppeteer-extra-lambda
153 changes: 153 additions & 0 deletions amplify/extract/scraper/index.ts
@@ -0,0 +1,153 @@
import { Context, Handler } from "aws-lambda";
import { Browser, Page, PuppeteerLaunchOptions } from "puppeteer";
import { PuppeteerExtra } from "puppeteer-extra";
import { S3Client, PutObjectCommand } from "@aws-sdk/client-s3";

interface Link {
id: string;
url: string;
comment: string;
}

const s3 = new S3Client({ region: process.env.AWS_REGION });

export const handler: Handler = async (
event: Link,
context: Context,
): Promise<any> => {
let browser: Browser | null = null;
try {
console.log("event:", event);
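    // Loaded with require() at invocation time: puppeteer-extra wraps puppeteer,
    // and the stealth plugin reduces headless-automation fingerprinting.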
const puppeteer: PuppeteerExtra = require("puppeteer-extra");
const stealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(stealthPlugin());
const chromium = require("@sparticuz/chromium");

const browserPath = await chromium.executablePath();

    console.log({ path: browserPath });

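    // context.functionName is only set when the handler runs inside Lambda, so
    // this toggles between Lambda-hardened Chromium flags and a local headed browser.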
const launchOptions: PuppeteerLaunchOptions = context.functionName
? {
headless: true,
executablePath: browserPath,
args: [
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-dev-shm-usage",
"--disable-gpu",
"--single-process",
"--incognito",
"--disable-client-side-phishing-detection",
"--disable-software-rasterizer",
],
}
: {
headless: false,
executablePath: browserPath,
};

    console.log('launch', { launchOptions });
    browser = await puppeteer.launch(launchOptions);
    console.log('launched');
    const page: Page = await browser.newPage();
    console.log('page');

    // Set viewport to a common desktop resolution
    await page.setViewport({ width: 1920, height: 1080 });
    console.log('viewport set');

    await page.goto(event.url);
    console.log('page loaded');
    // Give client-side rendering a few seconds to settle before capturing
    await new Promise((resolve) => setTimeout(resolve, 5000));
    console.log('page content');

    // Take viewport screenshot
    const viewportScreenshot = await page.screenshot({ encoding: 'binary' });
    console.log('viewport screenshot taken');

    // Take full page screenshot
    const fullPageScreenshot = await page.screenshot({ encoding: 'binary', fullPage: true });
    console.log('full page screenshot taken');

const content = await page.content();

// Extract Open Graph image
const ogImageUrl = await page.evaluate(() => {
const metaTag = document.querySelector('meta[property="og:image"]');
return metaTag ? metaTag.getAttribute('content') : null;
});

if (ogImageUrl) {
const ogImageResponse = await fetch(ogImageUrl);
const ogImageBuffer = Buffer.from(await ogImageResponse.arrayBuffer());

      // Save Open Graph image to S3 (stored under a .png key with image/png
      // content type even when the source is JPEG or WebP; bytes are copied as-is)
const ogImageParams = {
Bucket: process.env.BUCKET_NAME,
Key: `${event.id}_og_image.png`,
Body: ogImageBuffer,
ContentType: 'image/png'
};
await s3.send(new PutObjectCommand(ogImageParams));
console.log('Open Graph image saved to S3');
}

// Save HTML to S3
const htmlParams = {
Bucket: process.env.BUCKET_NAME,
Key: `${event.id}.html`,
Body: content,
ContentType: 'text/html'
};
await s3.send(new PutObjectCommand(htmlParams));

// Save viewport screenshot to S3
const viewportScreenshotParams = {
Bucket: process.env.BUCKET_NAME,
Key: `${event.id}_viewport.png`,
Body: viewportScreenshot,
ContentType: 'image/png'
};
await s3.send(new PutObjectCommand(viewportScreenshotParams));

// Save full page screenshot to S3
const fullPageScreenshotParams = {
Bucket: process.env.BUCKET_NAME,
Key: `${event.id}_fullpage.png`,
Body: fullPageScreenshot,
ContentType: 'image/png'
};
await s3.send(new PutObjectCommand(fullPageScreenshotParams));

return {
statusCode: 200,
body: JSON.stringify({ message: "HTML and screenshots saved to S3", id: event.id }),
};
} catch (e: any) {
    console.error("Error in Lambda Handler:", e);
return {
statusCode: 500,
body: JSON.stringify({ error: e.message }),
};
} finally {
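    // Close the browser with a 10-second cap so a hung Chromium cannot keep
    // the function alive until the Lambda timeout.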
if (browser) {
try {
await Promise.race([
browser.close(),
new Promise((_, reject) => setTimeout(() => reject(new Error("Browser close timeout")), 10000))
]);
} catch (closeError) {
console.log("Error closing browser:", closeError);
}
}
    // Clean up temporary files left behind by Chromium. Remove the contents
    // of /tmp rather than /tmp itself: /tmp is a mount point inside Lambda,
    // so deleting the directory itself would fail.
    const fs = require('fs').promises;
    const path = require('path');
    try {
      const entries: string[] = await fs.readdir('/tmp');
      await Promise.all(
        entries.map((entry: string) =>
          fs.rm(path.join('/tmp', entry), { recursive: true, force: true })
        )
      );
    } catch (cleanupError) {
      console.log("Error cleaning up temporary files:", cleanupError);
    }
}
};