Jelajahi Sumber

Fix: Resolve another fetch-rss error

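The fetch-rss function is still returning 500 errors. This change drops the deno_dom DOMParser and parses RSS/Atom items with regular expressions instead, and falls back to the current time when an item's date cannot be parsed.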
gpt-engineer-app[bot] 5 bulan lalu
induk
melakukan
f887c02725
1 file diubah dengan 83 penambahan dan 41 penghapusan
  1. 83 41
      supabase/functions/fetch-rss/index.ts

+ 83 - 41
supabase/functions/fetch-rss/index.ts

@@ -1,7 +1,6 @@
 
 import { serve } from "https://deno.land/std@0.168.0/http/server.ts"
 import { createClient } from 'https://esm.sh/@supabase/supabase-js@2'
-import { DOMParser } from 'https://deno.land/x/deno_dom@v0.1.38/deno-dom-wasm.ts'
 
 interface RSSItem {
   title: string;
@@ -24,6 +23,26 @@ const corsHeaders = {
   'Access-Control-Allow-Headers': 'authorization, x-client-info, apikey, content-type',
 }
 
+// Returns the tag-stripped, trimmed inner text of the first <tagName ...>...</tagName> element found in `xml`, or '' when there is no such element or its body is empty.
+function extractTextContent(xml: string, tagName: string): string {
+  const regex = new RegExp(`<${tagName}[^>]*>([\\s\\S]*?)<\\/${tagName}>`, 'i'); // case-insensitive, lazy body; NOTE(review): tagName is interpolated unescaped and nothing forces a delimiter right after the name
+  const match = xml.match(regex); // no 'g' flag, so only the first occurrence is considered
+  if (match && match[1]) {
+    // Unwrap a CDATA section (this regex has no 'g' flag, so only the first one is unwrapped), then drop anything shaped like a tag; entities such as &amp; are not decoded
+    let content = match[1].replace(/<!\[CDATA\[(.*?)\]\]>/s, '$1');
+    content = content.replace(/<[^>]*>/g, '').trim(); // this also strips embedded HTML tags such as <img ...> from the returned text
+    return content;
+  }
+  return '';
+}
+
+// Returns the quoted value of `attributeName` from the first <tagName ...> opening tag in `xml` that carries it, or '' when no such tag/attribute pair is found.
+function extractAttribute(xml: string, tagName: string, attributeName: string): string {
+  const regex = new RegExp(`<${tagName}[^>]*${attributeName}=["']([^"']*)["'][^>]*>`, 'i'); // value may be single- or double-quoted and cannot contain either quote; NOTE(review): no boundary before the attribute name, so e.g. `data-href=` also satisfies `href`
+  const match = xml.match(regex); // no 'g' flag, so only the first matching tag is used
+  return match ? match[1] : '';
+}
+
 serve(async (req) => {
   if (req.method === 'OPTIONS') {
     return new Response('ok', { headers: corsHeaders })
@@ -46,52 +65,60 @@ serve(async (req) => {
     }
 
     const rssText = await response.text()
+    console.log(`RSS content length: ${rssText.length}`)
     
-    // Parse RSS using deno-dom DOMParser
-    const parser = new DOMParser()
-    const doc = parser.parseFromString(rssText, 'text/xml')
+    // Parse RSS using regex patterns
+    const items: RSSItem[] = [];
     
-    if (!doc) {
-      throw new Error('Failed to parse RSS XML')
-    }
-
-    // Extract RSS items
-    const items = Array.from(doc.querySelectorAll('item, entry')).map(item => {
-      const title = item.querySelector('title')?.textContent?.trim() || ''
-      const description = item.querySelector('description, summary')?.textContent?.trim() || ''
-      const link = item.querySelector('link')?.textContent?.trim() || 
-                  item.querySelector('link')?.getAttribute('href') || ''
-      const pubDate = item.querySelector('pubDate, published')?.textContent?.trim() || ''
-      const guid = item.querySelector('guid')?.textContent?.trim() || link
+    // Split by item or entry tags
+    const itemRegex = /<(item|entry)[^>]*>([\s\S]*?)<\/\1>/gi;
+    let itemMatch;
+    
+    while ((itemMatch = itemRegex.exec(rssText)) !== null) {
+      const itemXml = itemMatch[0];
       
-      // Try to extract image from content or enclosure
-      let image = ''
-      const enclosure = item.querySelector('enclosure[type^="image"]')
-      if (enclosure) {
-        image = enclosure.getAttribute('url') || ''
-      } else {
-        // Try to find image in content
-        const content = item.querySelector('content\\:encoded, content')?.textContent
-        if (content) {
-          const imgMatch = content.match(/<img[^>]+src="([^">]+)"/i)
-          if (imgMatch) {
-            image = imgMatch[1]
-          }
+      const title = extractTextContent(itemXml, 'title');
+      const description = extractTextContent(itemXml, 'description') || 
+                         extractTextContent(itemXml, 'summary') ||
+                         extractTextContent(itemXml, 'content');
+      
+      let link = extractTextContent(itemXml, 'link');
+      if (!link) {
+        // Try to get link from href attribute
+        link = extractAttribute(itemXml, 'link', 'href');
+      }
+      
+      const pubDate = extractTextContent(itemXml, 'pubDate') || 
+                     extractTextContent(itemXml, 'published') ||
+                     extractTextContent(itemXml, 'updated');
+      
+      const guid = extractTextContent(itemXml, 'guid') || 
+                  extractTextContent(itemXml, 'id') || 
+                  link;
+      
+      // Try to extract image from enclosure or content
+      let image = extractAttribute(itemXml, 'enclosure', 'url');
+      if (!image && description) {
+        const imgMatch = description.match(/src=["']([^"']*\.(jpg|jpeg|png|gif|webp))[^"']*/i);
+        if (imgMatch) {
+          image = imgMatch[1];
         }
       }
 
-      return {
-        title,
-        description,
-        link,
-        pubDate,
-        guid,
-        image,
-        content: description
+      if (title && guid) {
+        items.push({
+          title,
+          description: description || '',
+          link: link || '',
+          pubDate: pubDate || new Date().toISOString(),
+          guid,
+          image: image || '',
+          content: description || ''
+        });
       }
-    }).filter(item => item.title && item.guid)
+    }
 
-    console.log(`Found ${items.length} items`)
+    console.log(`Parsed ${items.length} items from RSS feed`)
 
     // Save articles to database
     const articlesToInsert = items.map(item => {
@@ -99,19 +126,34 @@ serve(async (req) => {
       const wordCount = (item.description || '').split(' ').length
       const readTime = Math.max(1, Math.ceil(wordCount / 200))
 
+      // Parse and validate date
+      let publishedAt: string;
+      try {
+        if (item.pubDate) {
+          publishedAt = new Date(item.pubDate).toISOString();
+        } else {
+          publishedAt = new Date().toISOString();
+        }
+      } catch (error) {
+        console.log(`Invalid date format: ${item.pubDate}, using current date`);
+        publishedAt = new Date().toISOString();
+      }
+
       return {
         feed_id: feedId,
         title: item.title,
         description: item.description,
         content: item.content || item.description,
-        url: item.link,
+        url: item.link || null,
         image_url: item.image || null,
-        published_at: item.pubDate ? new Date(item.pubDate).toISOString() : new Date().toISOString(),
+        published_at: publishedAt,
         guid: item.guid,
         read_time: readTime
       }
     })
 
+    console.log(`Preparing to insert ${articlesToInsert.length} articles`)
+
     // Insert articles (on conflict do nothing to avoid duplicates)
     const { error: insertError } = await supabaseClient
       .from('articles')