index.ts 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. import { serve } from "https://deno.land/std@0.168.0/http/server.ts"
  2. import { createClient } from 'https://esm.sh/@supabase/supabase-js@2'
  /**
   * A single entry parsed out of an RSS `<item>` or Atom `<entry>` element.
   */
  interface RSSItem {
    // --- always present ---

    /** Text of the entry's title element. */
    title: string;

    /** Text of the entry's description / summary element. */
    description: string;

    /** URL the entry points to. */
    link: string;

    /** Publication date as a string (format is whatever the producer supplied). */
    pubDate: string;

    // --- optional extras ---

    /** Identifier used to recognise the same entry across fetches. */
    guid?: string;

    /** Body text of the entry, when available. */
    content?: string;

    /** URL of an image associated with the entry, when available. */
    image?: string;
  }
  /**
   * A parsed feed: channel-level metadata plus the entries it contains.
   */
  interface RSSFeed {
    /** Title of the feed itself. */
    title: string;

    /** Description of the feed itself. */
    description: string;

    /** Entries belonging to the feed. */
    items: RSSItem[];
  }
  /**
   * CORS headers attached to every response of this function: any origin may
   * call it, and the listed request headers are permitted.
   */
  const corsHeaders = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Headers': [
      'authorization',
      'x-client-info',
      'apikey',
      'content-type',
    ].join(', '),
  };
  /**
   * Returns the plain-text content of the first `<tagName>…</tagName>` element
   * found in `xml`.
   *
   * CDATA wrappers are removed (all of them, not only the first), any markup
   * inside the element is stripped, and the result is trimmed. Entities such as
   * `&amp;` are left exactly as they appear in the source.
   *
   * @param xml     XML fragment to search.
   * @param tagName Element name to look for (matched case-insensitively).
   * @returns The element's text, or `''` when no such element with a closing
   *          tag exists or the element is empty.
   */
  function extractTextContent(xml: string, tagName: string): string {
    // Element names may contain characters that are special in a RegExp (e.g. ".").
    const name = tagName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

    // `(?:\s[^>]*)?` makes the name match exactly (so "title" does not match
    // "<titles>"), and `(?<!/)` rejects self-closing tags such as "<id/>",
    // which have no content and must not be paired with a later closing tag.
    const element = new RegExp(
      `<${name}(?:\\s[^>]*)?(?<!/)>([\\s\\S]*?)</${name}\\s*>`,
      'i',
    );

    const inner = element.exec(xml)?.[1];
    if (!inner) {
      return '';
    }

    return inner
      .replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, '$1') // unwrap every CDATA section
      .replace(/<[^>]*>/g, '') // drop markup embedded in the text
      .trim();
  }
  /**
   * Returns the value of `attributeName` on the first `<tagName …>` tag in `xml`
   * that carries that attribute.
   *
   * Both the tag name and the attribute name are matched exactly (so `url` does
   * not match `dataurl`, and `link` does not match `<linkage>`), and the value
   * runs up to the quote character that opened it, so a value may contain the
   * other kind of quote.
   *
   * @param xml           XML fragment to search.
   * @param tagName       Element name (matched case-insensitively).
   * @param attributeName Attribute name (matched case-insensitively).
   * @returns The raw attribute value, or `''` when it is not found.
   */
  function extractAttribute(xml: string, tagName: string, attributeName: string): string {
    const escapeForRegExp = (text: string): string =>
      text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

    const tag = escapeForRegExp(tagName);
    const attribute = escapeForRegExp(attributeName);

    // (?=[\s/>])  – the tag name must end here
    // [^>]*?\s    – stay inside this tag; the attribute must follow whitespace
    // (["'])…\1   – value ends at the same quote character that opened it
    const pattern = new RegExp(
      `<${tag}(?=[\\s/>])[^>]*?\\s${attribute}\\s*=\\s*(["'])([\\s\\S]*?)\\1`,
      'i',
    );

    return pattern.exec(xml)?.[2] ?? '';
  }
  /**
   * Extracts the 11-character YouTube video ID from a URL.
   *
   * Recognised forms:
   * - `youtube.com/watch?v=ID` (also when `v` is not the first query parameter)
   * - `youtu.be/ID`
   * - `youtube.com/embed/ID`, `/shorts/ID`, `/live/ID`, `/v/ID`
   * - the same paths on `youtube-nocookie.com`
   *
   * @param url URL to inspect; an empty string yields `null`.
   * @returns The video ID, or `null` when the URL is not a recognised YouTube
   *          video URL.
   */
  function getYouTubeVideoId(url: string): string | null {
    if (!url) return null;

    const patterns = [
      // ID directly after a known path prefix or on the short-link domain.
      /(?:youtube(?:-nocookie)?\.com\/(?:watch\?v=|embed\/|shorts\/|live\/|v\/)|youtu\.be\/)([a-zA-Z0-9_-]{11})/,
      // `v=` anywhere in the query string, e.g. `watch?list=…&v=ID`.
      /youtube\.com\/.*[?&]v=([a-zA-Z0-9_-]{11})/,
    ];

    for (const pattern of patterns) {
      const videoId = pattern.exec(url)?.[1];
      if (videoId) return videoId;
    }
    return null;
  }
  /**
   * Builds the URL of the "hqdefault" thumbnail for a YouTube video link.
   *
   * @param videoUrl Link that may point at a YouTube video.
   * @returns The thumbnail URL, or `null` when no video ID can be extracted
   *          from `videoUrl`.
   */
  function generateYouTubeThumbnail(videoUrl: string): string | null {
    const id = getYouTubeVideoId(videoUrl);
    if (!id) {
      return null;
    }
    return `https://img.youtube.com/vi/${id}/hqdefault.jpg`;
  }
  /** Builds a JSON response that always carries the CORS headers. */
  function jsonResponse(body: unknown, status = 200): Response {
    return new Response(JSON.stringify(body), {
      status,
      headers: { ...corsHeaders, 'Content-Type': 'application/json' },
    });
  }

  /**
   * Produces a human-readable message for anything that can be thrown.
   * Errors returned by the Supabase client may be plain objects with a
   * `message` field rather than `Error` instances, so both shapes are handled.
   */
  function errorMessage(error: unknown): string {
    if (error instanceof Error) return error.message;
    if (typeof error === 'object' && error !== null && 'message' in error) {
      const message = (error as { message: unknown }).message;
      if (typeof message === 'string') return message;
    }
    return String(error);
  }

  /** Decodes the five predefined XML entities (used for URLs taken from the feed). */
  function decodeXmlEntities(text: string): string {
    return text
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&apos;/g, "'")
      .replace(/&amp;/g, '&'); // last, so "&amp;lt;" becomes "&lt;" and not "<"
  }

  /**
   * Picks an image URL for one feed entry, or `''` when none is found.
   * Order: image enclosure → first image `src` in the entry → YouTube thumbnail.
   */
  function findItemImage(itemXml: string, description: string, link: string): string {
    // An <enclosure> is only an image when its MIME type says so; podcast feeds
    // use it for audio files. A missing type is accepted as before.
    const enclosureUrl = extractAttribute(itemXml, 'enclosure', 'url');
    const enclosureType = extractAttribute(itemXml, 'enclosure', 'type');
    if (enclosureUrl && (enclosureType === '' || /^image\//i.test(enclosureType))) {
      return decodeXmlEntities(enclosureUrl);
    }

    // `description` has already had its markup stripped, so it only still
    // contains `src=` when the feed entity-escaped its HTML. Fall back to the
    // raw entry XML to catch <img src="…"> inside CDATA or nested elements.
    const imageSrc = /src=["']([^"']*\.(jpg|jpeg|png|gif|webp))[^"']*/i;
    const found = description.match(imageSrc) ?? itemXml.match(imageSrc);
    if (found?.[1]) {
      return decodeXmlEntities(found[1]);
    }

    return generateYouTubeThumbnail(link) ?? '';
  }

  /**
   * Extracts every RSS `<item>` / Atom `<entry>` from the feed text.
   * Entries without a title or without any usable identifier are skipped.
   */
  function parseFeedItems(feedXml: string): RSSItem[] {
    const items: RSSItem[] = [];
    // (?=[\s>]) keeps e.g. <items> from being treated as an <item>.
    const itemPattern = /<(item|entry)(?=[\s>])[^>]*>([\s\S]*?)<\/\1>/gi;

    for (const found of feedXml.matchAll(itemPattern)) {
      const itemXml = found[0];

      // Empty string means "not present", so `||` (not `??`) is intended below.
      const title = extractTextContent(itemXml, 'title');
      const description =
        extractTextContent(itemXml, 'description') ||
        extractTextContent(itemXml, 'summary') ||
        extractTextContent(itemXml, 'content');
      // RSS puts the URL in the element text, Atom in the href attribute.
      const rawLink =
        extractTextContent(itemXml, 'link') || extractAttribute(itemXml, 'link', 'href');
      const pubDate =
        extractTextContent(itemXml, 'pubDate') ||
        extractTextContent(itemXml, 'published') ||
        extractTextContent(itemXml, 'updated');
      // The identifier deliberately uses the undecoded link so that it stays
      // identical to identifiers stored by earlier runs (deduplication key).
      const guid =
        extractTextContent(itemXml, 'guid') || extractTextContent(itemXml, 'id') || rawLink;

      if (!title || !guid) continue;

      const link = decodeXmlEntities(rawLink);
      items.push({
        title,
        description,
        link,
        pubDate: pubDate || new Date().toISOString(),
        guid,
        image: findItemImage(itemXml, description, link),
        content: description,
      });
    }

    return items;
  }

  /** Maps a parsed entry to the row shape written to the `articles` table. */
  function toArticleRow(item: RSSItem, feedId: string | number) {
    // Rough reading-time estimate at 200 words per minute, minimum one minute.
    const wordCount = item.description.trim().split(/\s+/).filter(Boolean).length;
    const readTime = Math.max(1, Math.ceil(wordCount / 200));

    // Fall back to "now" when the feed's date cannot be parsed.
    const parsedDate = new Date(item.pubDate);
    let publishedAt: string;
    if (Number.isNaN(parsedDate.getTime())) {
      console.log(`Invalid date format: ${item.pubDate}, using current date`);
      publishedAt = new Date().toISOString();
    } else {
      publishedAt = parsedDate.toISOString();
    }

    return {
      feed_id: feedId,
      title: item.title,
      description: item.description,
      content: item.content || item.description,
      url: item.link || null,
      image_url: item.image || null,
      published_at: publishedAt,
      guid: item.guid,
      read_time: readTime,
    };
  }

  /**
   * HTTP entry point: fetches the feed at `feedUrl`, stores its entries as
   * articles of feed `feedId`, and refreshes the feed's bookkeeping columns.
   *
   * Request body (JSON): `{ feedId, feedUrl }`.
   * Responses (JSON, with CORS headers):
   * - 200 `{ success: true, articlesProcessed }`
   * - 400 `{ success: false, error }` for a malformed request
   * - 500 `{ success: false, error }` for fetch or database failures
   */
  serve(async (req) => {
    // CORS preflight.
    if (req.method === 'OPTIONS') {
      return new Response('ok', { headers: corsHeaders });
    }

    try {
      const supabaseClient = createClient(
        Deno.env.get('SUPABASE_URL') ?? '',
        Deno.env.get('SUPABASE_SERVICE_ROLE_KEY') ?? '',
      );

      // --- Validate the request -------------------------------------------
      let payload: unknown;
      try {
        payload = await req.json();
      } catch {
        return jsonResponse({ error: 'Request body must be valid JSON', success: false }, 400);
      }
      const { feedId, feedUrl } = (payload ?? {}) as { feedId?: unknown; feedUrl?: unknown };

      if ((typeof feedId !== 'string' && typeof feedId !== 'number') || feedId === '') {
        return jsonResponse({ error: 'feedId is required', success: false }, 400);
      }

      let target: URL | null = null;
      if (typeof feedUrl === 'string') {
        try {
          target = new URL(feedUrl);
        } catch {
          target = null;
        }
      }
      // Only http(s) is fetched. NOTE(review): this does not stop requests to
      // internal hosts; add an allow/deny list if callers are untrusted.
      if (!target || (target.protocol !== 'http:' && target.protocol !== 'https:')) {
        return jsonResponse(
          { error: 'feedUrl must be a valid http(s) URL', success: false },
          400,
        );
      }

      console.log(`Fetching RSS for feed: ${feedId}, URL: ${feedUrl}`);

      // --- Fetch and parse the feed ---------------------------------------
      const response = await fetch(target.href);
      if (!response.ok) {
        // Release the unread body before bailing out.
        await response.body?.cancel();
        throw new Error(`Failed to fetch RSS: ${response.status} ${response.statusText}`.trim());
      }
      const rssText = await response.text();
      console.log(`RSS content length: ${rssText.length}`);

      const items = parseFeedItems(rssText);
      console.log(`Parsed ${items.length} items from RSS feed`);

      // --- Store the articles ---------------------------------------------
      const articlesToInsert = items.map((item) => toArticleRow(item, feedId));
      console.log(`Preparing to insert ${articlesToInsert.length} articles`);

      // Rows that already exist for (feed_id, guid) are left untouched.
      // Nothing is sent when the feed produced no entries.
      if (articlesToInsert.length > 0) {
        const { error: insertError } = await supabaseClient
          .from('articles')
          .upsert(articlesToInsert, {
            onConflict: 'feed_id,guid',
            ignoreDuplicates: true,
          });
        if (insertError) {
          console.error('Error inserting articles:', insertError);
          throw insertError;
        }
      }

      // --- Refresh feed bookkeeping ---------------------------------------
      const { count: currentArticleCount, error: countError } = await supabaseClient
        .from('articles')
        .select('*', { count: 'exact', head: true })
        .eq('feed_id', feedId);
      if (countError) {
        console.error('Error counting articles:', countError);
      }

      const feedUpdate: Record<string, unknown> = {
        last_fetched_at: new Date().toISOString(),
        status: 'active',
      };
      // Keep the stored count when counting failed instead of resetting it to 0.
      if (!countError && typeof currentArticleCount === 'number') {
        feedUpdate.article_count = currentArticleCount;
      }

      const { error: updateError } = await supabaseClient
        .from('feeds')
        .update(feedUpdate)
        .eq('id', feedId);
      if (updateError) {
        console.error('Error updating feed:', updateError);
        throw updateError;
      }

      console.log(`Successfully processed ${articlesToInsert.length} articles for feed ${feedId}`);
      return jsonResponse({
        success: true,
        articlesProcessed: articlesToInsert.length,
      });
    } catch (error: unknown) {
      console.error('Error in fetch-rss function:', error);
      return jsonResponse({ error: errorMessage(error), success: false }, 500);
    }
  });