feat: split metadata fetching into a standalone module and add caching

2025-07-23 18:37:34 +08:00 · 2025-07-23 18:37:34 +08:00 · cc238f8f2a
commit cc238f8f2a
parent 98d23e7c94
2 changed files with 109 additions and 63 deletions
--- a/src/components/shortcodes/LinkCard.astro
+++ b/src/components/shortcodes/LinkCard.astro
@ -1,4 +1,6 @@
 ---
+import { getMetadata, getWaybackMetadata } from '../../plugins/get-metadata';
+
 interface Props {
    url: string;
    showArchive?: boolean;
@ -29,71 +31,12 @@ function formatDateToNumber(date: Date | string | undefined): string {
    return `${year}${month}${day}`;
 }

-// Get metadata from the URL
-async function fetchMetadata(url: string) {
-    try {
-        const response = await fetch(url, {
-            headers: {
-                'User-Agent': 'Mozilla/5.0 (compatible; LinkCard/1.0)'
-            }
-        });
-
-        if (!response.ok) {
-            throw new Error(`HTTP ${response.status}`);
-        }
-
-        const html = await response.text();
-
-        // 提取元数据
-        const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
-        const descriptionMatch = html.match(/<meta[^>]+name=["']description["'][^>]+content=["']([^"']+)["']/i) ||
-            html.match(/<meta[^>]+property=["']og:description["'][^>]+content=["']([^"']+)["']/i);
-        const imageMatch = html.match(/<meta[^>]+property=["']og:image["'][^>]+content=["']([^"']+)["']/i) ||
-            html.match(/<meta[^>]+name=["']twitter:image["'][^>]+content=["']([^"']+)["']/i);
-        const siteNameMatch = html.match(/<meta[^>]+property=["']og:site_name["'][^>]+content=["']([^"']+)["']/i);
-
-        return {
-            title: titleMatch?.[1]?.trim() || new URL(url).hostname,
-            description: descriptionMatch?.[1]?.trim() || '',
-            image: imageMatch?.[1]?.trim() || '',
-            siteName: siteNameMatch?.[1]?.trim() || new URL(url).hostname,
-            domain: new URL(url).hostname
-        };
-    } catch (error) {
-        console.warn(`Failed to fetch metadata for ${url}:`, error);
-        const domain = new URL(url).hostname;
-        return {
-            title: domain,
-            description: '',
-            image: '',
-            siteName: domain,
-            domain
-        };
-    }
-}
-
-// Check if the URL is archived on the Wayback Machine at the updated/build time
-// TODO: bringing user's own archive service link
-async function checkArchive(url: string) {
-    try {
-        // Determine which date to use (prefer updatedDate if available, or fallback to the build time)
-        const timestamp = (updatedDate ? formatDateToNumber(updatedDate) : formatDateToNumber(pubDate)) || formatDateToNumber(new Date());
-        const archiveUrl = `https://archive.org/wayback/available?url=${encodeURIComponent(url)}&timestamp=${timestamp}`;
-        const response = await fetch(archiveUrl);
-        const data = await response.json();
-
-        if (data.archived_snapshots?.closest?.available) {
-            return data.archived_snapshots.closest.url;
-        }
-    } catch (error) {
-        console.warn(`Failed to check archive for ${url}:`, error);
-    }
-    return null;
-}
+// Determine which date to use (prefer updatedDate if available, or fallback to the build time)
+const timestamp = (updatedDate ? formatDateToNumber(updatedDate) : formatDateToNumber(pubDate)) || formatDateToNumber(new Date());

 // extract metadata and archive URL
-const metadata = Astro.props.title ? siteMetadata : await fetchMetadata(url);
-const archiveUrl = showArchive ? await checkArchive(url) : null;
+const metadata = Astro.props.title ? siteMetadata : await getMetadata(url);
+const archiveUrl = showArchive ? await getWaybackMetadata(url, timestamp) : null;
 ---

 <div class="link-card">
--- a/src/plugins/get-metadata.js
+++ b/src/plugins/get-metadata.js
@ -0,0 +1,103 @@
+import { parse } from "ultrahtml";
+import "ultrahtml/selector";
+import {querySelector} from "ultrahtml/selector";
+
+// Simple in-memory cache
+const metadataCache = new Map();
+
+export async function getMetadata(url) {
+    if (metadataCache.has(url)) {
+        const cached = metadataCache.get(url);
+        return cached.data;
+    }
+
+    try {
+        const response = await fetch(url, {
+            headers: {
+                'User-Agent': 'Mozilla/5.0 (compatible; LinkCard/1.1)',
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
+            }
+        });
+
+        if (!response.ok) {
+            throw new Error(`Request not succeed: HTTP ${response.status}`);
+        }
+
+        const html = await response.text();
+
+        const document = parse(html);
+
+        const metadata = {
+            title: '',
+            description: '',
+            image: '',
+            siteName: '',
+            domain: new URL(url).hostname
+        };
+
+        // Extract title
+        const titleElement = querySelector(document,'title');
+
+        if (titleElement) {
+            metadata.title = titleElement.children[0].value.trim();
+        }
+        // Extract other metadata
+        const descriptionElement = querySelector(document, 'meta[name="description"]');
+        if (descriptionElement) {
+            metadata.description = descriptionElement.attributes.content || '';
+        }
+        const imageElement = querySelector(document,'meta[property="og:image"]') || querySelector(document,'meta[name="twitter:image"]');
+        if (imageElement) {
+            metadata.image = imageElement.attributes.content || '';
+        }
+        const siteNameElement = querySelector(document,'meta[property="og:site_name"]')
+        if (siteNameElement) {
+            metadata.siteName = siteNameElement.attributes.content || '';
+        } else {
+            metadata.siteName = metadata.domain; // Fallback to domain if no site name found
+        }
+
+        // Store in cache
+        metadataCache.set(url, {
+            data: metadata
+        });
+
+        return metadata;
+    } catch (error) {
+        console.warn(`Failed to fetch metadata for ${url}:`, error);
+        const domain = new URL(url).hostname;
+        return {
+            title: domain,
+            description: '',
+            image: '',
+            siteName: domain,
+            domain
+        };
+    }
+}
+
+export async function getWaybackMetadata(url, timestamp){
+    try {
+        const archiveUrl = `https://archive.org/wayback/available?url=${encodeURIComponent(url)}&timestamp=${timestamp}`;
+
+        if (metadataCache.has(archiveUrl)) {
+            const cached = metadataCache.get(archiveUrl);
+            return cached.data;
+        }
+
+        const response = await fetch(archiveUrl);
+        const data = await response.json();
+
+        if (data.archived_snapshots?.closest?.available) {
+            // Store in cache
+            metadataCache.set(archiveUrl, {
+                data: data.archived_snapshots.closest.url
+            });
+
+            return data.archived_snapshots.closest.url;
+        }
+    } catch (error) {
+        console.warn(`Failed to check archive for ${url}:`, error);
+    }
+    return null;
+}