index.ts

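// fetch-rss edge function: fetches an RSS/Atom feed over HTTP, parses its
// items with lightweight regex-based extraction, upserts the resulting rows
// into the `articles` table, and updates the parent row in `feeds`.
// Callable by authenticated users, internal calls, or cron jobs.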
import { serve } from "https://deno.land/std@0.168.0/http/server.ts"
import { createClient } from 'https://esm.sh/@supabase/supabase-js@2'
import { isValidExternalUrl, verifyAuth, isInternalCall, validateCronSecret } from '../_shared/security.ts'
interface RSSItem {
  title: string;
  description: string;
  link: string;
  pubDate: string;
  guid?: string;
  content?: string;
  image?: string;
}

interface RSSFeed {
  title: string;
  description: string;
  items: RSSItem[];
}

const corsHeaders = {
  'Access-Control-Allow-Origin': '*',
  'Access-Control-Allow-Headers': 'authorization, x-client-info, apikey, content-type, x-cron-secret',
}
// Helper function to extract text content from XML tags
function extractTextContent(xml: string, tagName: string): string {
  const regex = new RegExp(`<${tagName}[^>]*>([\\s\\S]*?)<\\/${tagName}>`, 'i');
  const match = xml.match(regex);
  if (match && match[1]) {
    // Remove CDATA wrapper if present and clean HTML tags
    let content = match[1].replace(/<!\[CDATA\[(.*?)\]\]>/s, '$1');
    content = content.replace(/<[^>]*>/g, '').trim();
    return content;
  }
  return '';
}

// Helper function to extract attribute from XML tags
function extractAttribute(xml: string, tagName: string, attributeName: string): string {
  const regex = new RegExp(`<${tagName}[^>]*${attributeName}=["']([^"']*)["'][^>]*>`, 'i');
  const match = xml.match(regex);
  return match ? match[1] : '';
}
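// Example usage of the extractors above (illustrative markup and URLs):
//   extractTextContent('<title><![CDATA[Hello <b>World</b>]]></title>', 'title')
//     // -> 'Hello World' (CDATA wrapper and inner HTML tags are stripped)
//   extractAttribute('<enclosure url="https://example.com/a.jpg" type="image/jpeg"/>', 'enclosure', 'url')
//     // -> 'https://example.com/a.jpg'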
// Helper function to extract YouTube video ID from URL
function getYouTubeVideoId(url: string): string | null {
  if (!url) return null;
  const patterns = [
    /(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/)([a-zA-Z0-9_-]{11})/,
    /youtube\.com\/.*[?&]v=([a-zA-Z0-9_-]{11})/
  ];
  for (const pattern of patterns) {
    const match = url.match(pattern);
    if (match) return match[1];
  }
  return null;
}

// Helper function to generate YouTube thumbnail URL
function generateYouTubeThumbnail(videoUrl: string): string | null {
  const videoId = getYouTubeVideoId(videoUrl);
  return videoId ? `https://img.youtube.com/vi/${videoId}/hqdefault.jpg` : null;
}
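// Example (illustrative video URL):
//   generateYouTubeThumbnail('https://youtu.be/dQw4w9WgXcQ')
//     // -> 'https://img.youtube.com/vi/dQw4w9WgXcQ/hqdefault.jpg'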
serve(async (req) => {
  if (req.method === 'OPTIONS') {
    return new Response('ok', { headers: corsHeaders })
  }

  try {
    // Authentication: Allow internal calls (cron jobs) OR authenticated users
    const isCronJob = validateCronSecret(req);
    const isInternal = isInternalCall(req);
    const auth = await verifyAuth(req);
    if (!isCronJob && !isInternal && !auth) {
      console.log('Unauthorized access attempt to fetch-rss');
      return new Response(
        JSON.stringify({ error: 'Unauthorized', success: false }),
        { status: 401, headers: { ...corsHeaders, 'Content-Type': 'application/json' } }
      )
    }

    const supabaseClient = createClient(
      Deno.env.get('SUPABASE_URL') ?? '',
      Deno.env.get('SUPABASE_SERVICE_ROLE_KEY') ?? ''
    )
    const { feedId, feedUrl } = await req.json()

    // Input validation
    if (!feedId || typeof feedId !== 'string') {
      return new Response(
        JSON.stringify({ error: 'Invalid feed ID', success: false }),
        { status: 400, headers: { ...corsHeaders, 'Content-Type': 'application/json' } }
      )
    }
    if (!feedUrl || typeof feedUrl !== 'string') {
      return new Response(
        JSON.stringify({ error: 'Invalid feed URL', success: false }),
        { status: 400, headers: { ...corsHeaders, 'Content-Type': 'application/json' } }
      )
    }

    // SSRF Protection: Validate URL
    const urlValidation = isValidExternalUrl(feedUrl);
    if (!urlValidation.valid) {
      console.log(`SSRF blocked: ${feedUrl} - ${urlValidation.error}`);
      return new Response(
        JSON.stringify({ error: urlValidation.error, success: false }),
        { status: 400, headers: { ...corsHeaders, 'Content-Type': 'application/json' } }
      )
    }

    // Verify the feed exists in the database (prevents arbitrary URL fetching)
    const { data: feedData, error: feedError } = await supabaseClient
      .from('feeds')
      .select('id, url')
      .eq('id', feedId)
      .single()

    if (feedError || !feedData) {
      console.log(`Feed not found: ${feedId}`);
      return new Response(
        JSON.stringify({ error: 'Feed not found', success: false }),
        { status: 404, headers: { ...corsHeaders, 'Content-Type': 'application/json' } }
      )
    }
    console.log(`Fetching RSS for feed: ${feedId}, URL: ${feedUrl}`)

    // Fetch RSS content with timeout
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), 30000); // 30s timeout
    let response;
    try {
      response = await fetch(feedUrl, {
        signal: controller.signal,
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; RSS Feed Reader/1.0)'
        }
      });
    } finally {
      clearTimeout(timeoutId);
    }

    if (!response.ok) {
      throw new Error(`Failed to fetch RSS: ${response.statusText}`)
    }

    const rssText = await response.text()

    // Limit content size to prevent memory exhaustion
    if (rssText.length > 5 * 1024 * 1024) { // 5MB limit
      throw new Error('RSS feed content too large');
    }

    console.log(`RSS content length: ${rssText.length}`)
    // Parse RSS using regex patterns
    const items: RSSItem[] = [];

    // Split by item or entry tags
    const itemRegex = /<(item|entry)[^>]*>([\s\S]*?)<\/\1>/gi;
    let itemMatch;
    while ((itemMatch = itemRegex.exec(rssText)) !== null) {
      const itemXml = itemMatch[0];

      const title = extractTextContent(itemXml, 'title');
      const description = extractTextContent(itemXml, 'description') ||
        extractTextContent(itemXml, 'summary') ||
        extractTextContent(itemXml, 'content');

      let link = extractTextContent(itemXml, 'link');
      if (!link) {
        // Try to get link from href attribute
        link = extractAttribute(itemXml, 'link', 'href');
      }

      const pubDate = extractTextContent(itemXml, 'pubDate') ||
        extractTextContent(itemXml, 'published') ||
        extractTextContent(itemXml, 'updated');

      const guid = extractTextContent(itemXml, 'guid') ||
        extractTextContent(itemXml, 'id') ||
        link;

      // Try to extract image from enclosure or content
      let image = extractAttribute(itemXml, 'enclosure', 'url');
      if (!image && description) {
        const imgMatch = description.match(/src=["']([^"']*\.(jpg|jpeg|png|gif|webp))[^"']*/i);
        if (imgMatch) {
          image = imgMatch[1];
        }
      }

      // Generate YouTube thumbnail if this is a YouTube link and no image found
      if (!image && link) {
        const youtubeImage = generateYouTubeThumbnail(link);
        if (youtubeImage) {
          image = youtubeImage;
        }
      }

      if (title && guid) {
        items.push({
          title,
          description: description || '',
          link: link || '',
          pubDate: pubDate || new Date().toISOString(),
          guid,
          image: image || '',
          content: description || ''
        });
      }
    }

    console.log(`Parsed ${items.length} items from RSS feed`)
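    // For reference, an item such as
    //   <item><title>Post</title><link>https://example.com/post</link></item>
    // (illustrative markup) yields { title: 'Post', link: 'https://example.com/post',
    // guid: 'https://example.com/post', description: '', image: '' }, since guid falls back to link.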
    // Save articles to database
    const now = new Date().toISOString();
    const articlesToInsert = items.map(item => {
      // Calculate read time (rough estimate: 200 words per minute)
      const wordCount = (item.description || '').split(' ').length
      const readTime = Math.max(1, Math.ceil(wordCount / 200))

      // Parse and validate date
      let publishedAt: string;
      try {
        if (item.pubDate) {
          publishedAt = new Date(item.pubDate).toISOString();
        } else {
          publishedAt = new Date().toISOString();
        }
      } catch (error) {
        console.log(`Invalid date format: ${item.pubDate}, using current date`);
        publishedAt = new Date().toISOString();
      }

      return {
        feed_id: feedId,
        title: item.title,
        description: item.description,
        content: item.content || item.description,
        url: item.link || null,
        image_url: item.image || null,
        published_at: publishedAt,
        guid: item.guid,
        read_time: readTime,
        last_seen_at: now
      }
    })

    console.log(`Preparing to insert ${articlesToInsert.length} articles`)

    // Insert articles - on conflict update last_seen_at
    const { error: insertError } = await supabaseClient
      .from('articles')
      .upsert(articlesToInsert, {
        onConflict: 'feed_id,guid'
      })

    if (insertError) {
      console.error('Error inserting articles:', insertError)
      throw insertError
    }
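    // Note: the onConflict target above assumes a unique constraint on
    // (feed_id, guid) in the articles table; on conflict the supplied columns,
    // including last_seen_at, are updated rather than duplicated.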
    // Get current article count for this feed
    const { count: currentArticleCount, error: countError } = await supabaseClient
      .from('articles')
      .select('*', { count: 'exact', head: true })
      .eq('feed_id', feedId)

    if (countError) {
      console.error('Error counting articles:', countError)
    }

    // Update feed's last_fetched_at and article_count
    const { error: updateError } = await supabaseClient
      .from('feeds')
      .update({
        last_fetched_at: new Date().toISOString(),
        status: 'active',
        article_count: currentArticleCount || 0
      })
      .eq('id', feedId)

    if (updateError) {
      console.error('Error updating feed:', updateError)
      throw updateError
    }

    console.log(`Successfully processed ${articlesToInsert.length} articles for feed ${feedId}`)

    return new Response(
      JSON.stringify({
        success: true,
        articlesProcessed: articlesToInsert.length
      }),
      {
        headers: { ...corsHeaders, 'Content-Type': 'application/json' }
      }
    )
  } catch (error: any) {
    console.error('Error in fetch-rss function:', error)
    return new Response(
      JSON.stringify({
        error: 'An error occurred while fetching the feed',
        success: false
      }),
      {
        status: 500,
        headers: { ...corsHeaders, 'Content-Type': 'application/json' }
      }
    )
  }
})
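// Example invocation (illustrative values; project ref, JWT, feed id, and URL
// are placeholders):
//
//   curl -X POST 'https://<project-ref>.supabase.co/functions/v1/fetch-rss' \
//     -H 'Authorization: Bearer <user-jwt>' \
//     -H 'Content-Type: application/json' \
//     -d '{"feedId": "<existing-feed-id>", "feedUrl": "https://example.com/feed.xml"}'
//
// Cron or internal callers are expected to authenticate differently, likely via
// the x-cron-secret header permitted in corsHeaders and checked by validateCronSecret.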