feat: split metadata fetching into standalone functions and add a cache
This commit is contained in:
parent
98d23e7c94
commit
cc238f8f2a
2 changed files with 109 additions and 63 deletions
|
@ -1,4 +1,6 @@
|
|||
---
|
||||
import { getMetadata, getWaybackMetadata } from '../../plugins/get-metadata';
|
||||
|
||||
interface Props {
|
||||
url: string;
|
||||
showArchive?: boolean;
|
||||
|
@ -29,71 +31,12 @@ function formatDateToNumber(date: Date | string | undefined): string {
|
|||
return `${year}${month}${day}`;
|
||||
}
|
||||
|
||||
// Get metadata from the URL
|
||||
// Parse every <meta ...> tag in the HTML into a map of lower-cased attribute name -> value.
function parseMetaTags(html: string): Array<Map<string, string>> {
  const tags: Array<Map<string, string>> = [];
  for (const tag of html.matchAll(/<meta\b[^>]*>/gi)) {
    const attributes = new Map<string, string>();
    // Only quoted attribute values are recognised; a value may contain the other quote character.
    for (const attr of tag[0].matchAll(/([^\s"'=<>\/]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g)) {
      const name = attr[1].toLowerCase();
      // Keep the first occurrence when an attribute is repeated inside one tag.
      if (!attributes.has(name)) {
        attributes.set(name, attr[2] ?? attr[3] ?? '');
      }
    }
    tags.push(attributes);
  }
  return tags;
}

// Return the trimmed, non-empty `content` of the first meta tag whose `attr` equals `key`, or ''.
function findMetaContent(
  tags: Array<Map<string, string>>,
  attr: 'name' | 'property',
  key: string
): string {
  for (const attributes of tags) {
    if (attributes.get(attr)?.toLowerCase() !== key) {
      continue;
    }
    const content = attributes.get('content')?.trim();
    if (content) {
      return content;
    }
  }
  return '';
}

/**
 * Fetch a page and extract the metadata shown on a link card.
 *
 * The title comes from `<title>`; description, image and site name come from
 * `<meta>` tags. Meta tags are matched regardless of the order of their
 * attributes (`content` may come before `name`/`property`).
 *
 * @param url Address of the page to inspect.
 * @returns `{ title, description, image, siteName, domain }`. On any failure
 *   (network error, non-2xx status) a warning is logged and the hostname is used
 *   for `title` and `siteName`, with empty `description` and `image`. This
 *   function does not throw.
 */
async function fetchMetadata(url: string) {
  // Resolve the hostname up front so the error path below can never throw on a
  // malformed URL; fall back to the raw input when it cannot be parsed.
  let domain = url;
  try {
    domain = new URL(url).hostname;
  } catch {
    // Keep the raw string as the display domain.
  }

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; LinkCard/1.0)'
      }
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const html = await response.text();

    // Extract metadata
    const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
    const metaTags = parseMetaTags(html);

    return {
      title: titleMatch?.[1]?.trim() || domain,
      description:
        findMetaContent(metaTags, 'name', 'description') ||
        findMetaContent(metaTags, 'property', 'og:description'),
      image:
        findMetaContent(metaTags, 'property', 'og:image') ||
        findMetaContent(metaTags, 'name', 'twitter:image'),
      siteName: findMetaContent(metaTags, 'property', 'og:site_name') || domain,
      domain
    };
  } catch (error) {
    console.warn(`Failed to fetch metadata for ${url}:`, error);
    return {
      title: domain,
      description: '',
      image: '',
      siteName: domain,
      domain
    };
  }
}
|
||||
|
||||
// Check if the URL is archived on the Wayback Machine at the updated/build time
|
||||
// TODO: support a user-supplied archive service link
|
||||
/**
 * Ask the Wayback Machine availability API for the snapshot of `url` closest
 * to the post's date.
 *
 * The lookup date prefers `updatedDate`, otherwise `pubDate`, and falls back to
 * the current date when `formatDateToNumber` yields a falsy value.
 * NOTE(review): `updatedDate`, `pubDate` and `formatDateToNumber` come from the
 * enclosing scope and are not defined in this block.
 *
 * @param url Address of the page to look up.
 * @returns The snapshot URL when the API reports an available snapshot,
 *   otherwise `null`. Failures are logged with `console.warn` and also yield `null`.
 */
async function checkArchive(url: string) {
  try {
    // Determine which date to use (prefer updatedDate if available, or fallback to the build time)
    const timestamp = (updatedDate ? formatDateToNumber(updatedDate) : formatDateToNumber(pubDate)) || formatDateToNumber(new Date());
    // The second query parameter must be spelled `&timestamp=`; `&times` must not be
    // decoded as the HTML entity for the multiplication sign.
    const archiveUrl = `https://archive.org/wayback/available?url=${encodeURIComponent(url)}&timestamp=${timestamp}`;
    const response = await fetch(archiveUrl);

    // Surface HTTP errors explicitly instead of failing later while parsing a non-JSON error body.
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const data = await response.json();
    const closest = data.archived_snapshots?.closest;

    if (closest?.available) {
      return closest.url;
    }
  } catch (error) {
    console.warn(`Failed to check archive for ${url}:`, error);
  }
  return null;
}
|
||||
// Determine which date to use for the archive lookup: prefer updatedDate, otherwise pubDate,
// and fall back to the current (build-time) date when neither produces a value.
const timestamp = (updatedDate ? formatDateToNumber(updatedDate) : formatDateToNumber(pubDate)) || formatDateToNumber(new Date());

// Extract metadata and archive URL.
// Each constant is declared exactly once, using the helpers imported from
// '../../plugins/get-metadata'; declaring `metadata` / `archiveUrl` a second time
// would be a redeclaration error.
// When the caller passes an explicit title, the caller-provided siteMetadata is used
// and no network request is made for the page itself.
const metadata = Astro.props.title ? siteMetadata : await getMetadata(url);
// The archive lookup only runs when the card was asked to show an archive link.
const archiveUrl = showArchive ? await getWaybackMetadata(url, timestamp) : null;
|
||||
---
|
||||
|
||||
<div class="link-card">
|
||||
|
|
103
src/plugins/get-metadata.js
Normal file
103
src/plugins/get-metadata.js
Normal file
|
@ -0,0 +1,103 @@
|
|||
import { parse } from "ultrahtml";
|
||||
import "ultrahtml/selector";
|
||||
import {querySelector} from "ultrahtml/selector";
|
||||
|
||||
// Simple in-memory cache shared by getMetadata and getWaybackMetadata.
// Keys are the page URL (getMetadata) or the full Wayback availability request URL
// (getWaybackMetadata); every entry is an object of the form `{ data }`.
// Nothing in the code shown here ever evicts or expires an entry.
|
||||
const metadataCache = new Map();
|
||||
|
||||
/**
 * Fetch a page and extract the metadata shown on a link card.
 *
 * Successful results are stored in `metadataCache` under the page URL, so each
 * URL is fetched and parsed at most once per process. Failures are not cached,
 * so a later call can retry.
 *
 * @param {string} url Address of the page to inspect.
 * @returns {Promise<{title: string, description: string, image: string, siteName: string, domain: string}>}
 *   The extracted metadata. `title` and `siteName` fall back to the hostname when
 *   the page does not provide them. On any failure a warning is logged and a
 *   hostname-only fallback object is returned; this function does not throw.
 */
export async function getMetadata(url) {
  if (metadataCache.has(url)) {
    return metadataCache.get(url).data;
  }

  // Resolve the hostname up front so the error path below can never throw on a
  // malformed URL; fall back to the raw input when it cannot be parsed.
  let domain = url;
  try {
    domain = new URL(url).hostname;
  } catch {
    // Keep the raw string as the display domain.
  }

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; LinkCard/1.1)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
      }
    });

    if (!response.ok) {
      throw new Error(`Request not succeed: HTTP ${response.status}`);
    }

    const html = await response.text();
    const document = parse(html);

    // Return the trimmed `content` attribute of the first selector that matches
    // an element with non-empty content, or '' when none does.
    const metaContent = (...selectors) => {
      for (const selector of selectors) {
        const content = querySelector(document, selector)?.attributes?.content?.trim();
        if (content) {
          return content;
        }
      }
      return '';
    };

    // Extract title. An empty <title></title> has no children, so read the child
    // values defensively instead of indexing children[0] directly.
    const titleElement = querySelector(document, 'title');
    const titleText = (titleElement?.children ?? [])
      .map((child) => (typeof child.value === 'string' ? child.value : ''))
      .join('')
      .trim();

    const metadata = {
      title: titleText || domain, // Fallback to domain if the page has no usable title
      description: metaContent('meta[name="description"]', 'meta[property="og:description"]'),
      image: metaContent('meta[property="og:image"]', 'meta[name="twitter:image"]'),
      siteName: metaContent('meta[property="og:site_name"]') || domain, // Fallback to domain if no site name found
      domain
    };

    // Store in cache
    metadataCache.set(url, {
      data: metadata
    });

    return metadata;
  } catch (error) {
    console.warn(`Failed to fetch metadata for ${url}:`, error);
    return {
      title: domain,
      description: '',
      image: '',
      siteName: domain,
      domain
    };
  }
}
|
||||
|
||||
/**
 * Ask the Wayback Machine availability API for the snapshot of `url` closest
 * to `timestamp`.
 *
 * Definitive answers (a snapshot URL, or "no snapshot available") are stored in
 * `metadataCache` under the request URL. Network and HTTP errors are logged and
 * not cached, so a later call can retry.
 *
 * @param {string} url Address of the page to look up.
 * @param {string} [timestamp] Date string forwarded as the API's `timestamp`
 *   query parameter; omitted from the request when falsy.
 * @returns {Promise<string|null>} The snapshot URL, or `null` when no snapshot
 *   is available or the lookup failed.
 */
export async function getWaybackMetadata(url, timestamp) {
  try {
    let archiveUrl = `https://archive.org/wayback/available?url=${encodeURIComponent(url)}`;
    // The parameter must be spelled `&timestamp=`; `&times` must not be decoded as the
    // HTML entity for the multiplication sign. Skip it rather than send "undefined".
    if (timestamp) {
      archiveUrl += `&timestamp=${encodeURIComponent(timestamp)}`;
    }

    if (metadataCache.has(archiveUrl)) {
      return metadataCache.get(archiveUrl).data;
    }

    const response = await fetch(archiveUrl);

    // Surface HTTP errors explicitly instead of failing later while parsing a non-JSON error body.
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const data = await response.json();
    const closest = data?.archived_snapshots?.closest;
    const snapshotUrl = closest?.available ? (closest.url ?? null) : null;

    // Store in cache (including a definitive "no snapshot" answer)
    metadataCache.set(archiveUrl, {
      data: snapshotUrl
    });

    return snapshotUrl;
  } catch (error) {
    console.warn(`Failed to check archive for ${url}:`, error);
  }
  return null;
}
|
Loading…
Add table
Add a link
Reference in a new issue