index.ts 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349
  1. import { serve } from "https://deno.land/std@0.168.0/http/server.ts"
  2. import { createClient } from 'https://esm.sh/@supabase/supabase-js@2'
  3. import { isValidExternalUrl, verifyAuth, isInternalCall, validateCronSecret } from '../_shared/security.ts'
  /** A single entry taken from an RSS `<item>` or Atom `<entry>` element. */
  interface RSSItem {
    /** Headline text of the entry. */
    title: string;
    /** Summary text of the entry. */
    description: string;
    /** URL the entry points at. */
    link: string;
    /** Publication timestamp, kept as a string. */
    pubDate: string;
    /** Identifier used to recognise an entry that has been seen before. */
    guid?: string;
    /** Body text of the entry. */
    content?: string;
    /** URL of an image representing the entry. */
    image?: string;
  }
  /** A feed as a whole: its own title and description plus the entries it carries. */
  interface RSSFeed {
    /** Title of the feed. */
    title: string;
    /** Description of the feed. */
    description: string;
    /** Entries belonging to the feed. */
    items: RSSItem[];
  }
  /** CORS headers that are attached to every response this function produces. */
  const corsHeaders = {
    'Access-Control-Allow-Origin': '*',
    // Request headers a browser is allowed to send, joined into one header value.
    'Access-Control-Allow-Headers': [
      'authorization',
      'x-client-info',
      'apikey',
      'content-type',
      'x-cron-secret',
    ].join(', '),
  }
  /**
   * Returns the plain-text content of the first `<tagName>…</tagName>` element
   * found in `xml`.
   *
   * CDATA wrappers are unwrapped and any markup inside the element is removed,
   * so the result never contains tags. Character entities are left untouched.
   *
   * @param xml - XML fragment to search.
   * @param tagName - Element name to look for; matched case-insensitively.
   * @returns The trimmed text, or `''` when no such element exists or it is empty.
   */
  function extractTextContent(xml: string, tagName: string): string {
    // Escape the name so characters such as `.` or `:` are always taken literally.
    const tag = tagName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // `(?:\s[^>]*)?` makes the name end at whitespace or `>`, so `id` does not
    // start matching at `<idx>`. `(?<!\/)` skips self-closing tags such as
    // `<link href="…"/>`, which have no content and would otherwise swallow
    // everything up to a later closing tag.
    const regex = new RegExp(
      `<${tag}(?:\\s[^>]*)?(?<!\\/)>([\\s\\S]*?)<\\/${tag}\\s*>`,
      'i',
    );
    const match = xml.match(regex);
    if (!match || !match[1]) {
      return '';
    }
    return match[1]
      // Unwrap every CDATA section, not only the first one.
      .replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, '$1')
      // Drop whatever markup remains.
      .replace(/<[^>]*>/g, '')
      .trim();
  }
  /**
   * Returns the value of `attributeName` on the first `<tagName …>` element in
   * `xml` that carries it.
   *
   * @param xml - XML fragment to search.
   * @param tagName - Element name to look for; matched case-insensitively.
   * @param attributeName - Attribute whose value is wanted; matched case-insensitively.
   * @returns The raw attribute value, or `''` when no such element/attribute exists.
   */
  function extractAttribute(xml: string, tagName: string, attributeName: string): string {
    // Both names are escaped so regex metacharacters in them are taken literally.
    const escapeForRegex = (text: string): string => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const tag = escapeForRegex(tagName);
    const attr = escapeForRegex(attributeName);
    // - The tag name must be followed by whitespace, so `link` does not match `<linkset>`.
    // - The attribute name must be preceded by whitespace, so `url` does not
    //   match the tail of `data-url`.
    // - The closing quote must be the same character as the opening one (`\1`),
    //   so a value such as "it's" is not cut short at the apostrophe.
    const regex = new RegExp(
      `<${tag}(?:\\s[^>]*?)?\\s${attr}\\s*=\\s*(["'])([^<]*?)\\1[^>]*>`,
      'i',
    );
    const match = xml.match(regex);
    return match ? match[2] : '';
  }
  /**
   * Pulls the 11-character video ID out of a YouTube URL.
   *
   * Recognises `youtube.com/watch?v=…`, `youtu.be/…`, `youtube.com/embed/…`, and
   * any other `youtube.com` URL that carries a `v=` query parameter.
   *
   * @param url - URL to inspect.
   * @returns The video ID, or `null` when `url` is empty or no pattern matches.
   */
  function getYouTubeVideoId(url: string): string | null {
    if (url) {
      const matchers: RegExp[] = [
        /(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/)([a-zA-Z0-9_-]{11})/,
        /youtube\.com\/.*[?&]v=([a-zA-Z0-9_-]{11})/,
      ];
      // The matchers are tried in order; the earliest one that hits decides the result.
      const hit = matchers.map((matcher) => matcher.exec(url)).find((result) => result !== null);
      if (hit) {
        return hit[1];
      }
    }
    return null;
  }
  /**
   * Builds the `hqdefault.jpg` thumbnail URL for a YouTube video link.
   *
   * @param videoUrl - URL that may point at a YouTube video.
   * @returns The thumbnail URL, or `null` when no video ID can be found in `videoUrl`.
   */
  function generateYouTubeThumbnail(videoUrl: string): string | null {
    const id = getYouTubeVideoId(videoUrl);
    if (!id) {
      return null;
    }
    return `https://img.youtube.com/vi/${id}/hqdefault.jpg`;
  }
  /** Largest feed body, in bytes, that will be downloaded before giving up. */
  const MAX_FEED_BYTES = 5 * 1024 * 1024;

  /** Milliseconds allowed for the whole feed download, headers and body included. */
  const FETCH_TIMEOUT_MS = 30000;

  /** A parsed feed entry whose `guid` is known to be present. */
  type ParsedItem = RSSItem & { guid: string };

  /** Builds a JSON response that carries the shared CORS headers. */
  function jsonResponse(body: unknown, status = 200): Response {
    return new Response(JSON.stringify(body), {
      status,
      headers: { ...corsHeaders, 'Content-Type': 'application/json' },
    });
  }

  /**
   * Downloads `url` and returns its body as text.
   *
   * The timeout stays armed until the body has been read completely, and the
   * size cap is enforced while streaming so an oversized body is never held in
   * memory in full.
   *
   * @throws Error when the response status is not OK or the body exceeds `MAX_FEED_BYTES`.
   */
  async function fetchFeedText(url: string): Promise<string> {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
    try {
      const response = await fetch(url, {
        signal: controller.signal,
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; RSS Feed Reader/1.0)'
        }
      });
      if (!response.ok) {
        // Release the unread body before bailing out.
        await response.body?.cancel();
        throw new Error(`Failed to fetch RSS: ${response.statusText}`);
      }
      // Cheap early exit when the server announces an oversized body.
      // A missing or unparsable header yields 0/NaN and never trips this check.
      if (Number(response.headers.get('content-length')) > MAX_FEED_BYTES) {
        await response.body?.cancel();
        throw new Error('RSS feed content too large');
      }
      if (!response.body) {
        return await response.text();
      }
      const reader = response.body.getReader();
      const decoder = new TextDecoder();
      let receivedBytes = 0;
      let text = '';
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        receivedBytes += value.byteLength;
        if (receivedBytes > MAX_FEED_BYTES) {
          await reader.cancel();
          throw new Error('RSS feed content too large');
        }
        // `stream: true` keeps multi-byte characters intact across chunk boundaries.
        text += decoder.decode(value, { stream: true });
      }
      return text + decoder.decode();
    } finally {
      clearTimeout(timeoutId);
    }
  }

  /**
   * Extracts entries from RSS (`<item>`) or Atom (`<entry>`) markup using regex
   * matching. Entries without a title or identifier are dropped, and only the
   * first entry for any given identifier is kept.
   */
  function parseFeedItems(xml: string): ParsedItem[] {
    const items: ParsedItem[] = [];
    const seenGuids = new Set<string>();
    // The element name must end at whitespace or `>` so e.g. `<items>` is not an item.
    const itemRegex = /<(item|entry)(?:\s[^>]*)?>([\s\S]*?)<\/\1>/gi;
    for (const itemMatch of xml.matchAll(itemRegex)) {
      const itemXml = itemMatch[0];
      const title = extractTextContent(itemXml, 'title');
      const description = extractTextContent(itemXml, 'description') ||
        extractTextContent(itemXml, 'summary') ||
        extractTextContent(itemXml, 'content');
      // RSS puts the link in the element text, Atom in an `href` attribute.
      const link = extractTextContent(itemXml, 'link') ||
        extractAttribute(itemXml, 'link', 'href');
      const pubDate = extractTextContent(itemXml, 'pubDate') ||
        extractTextContent(itemXml, 'published') ||
        extractTextContent(itemXml, 'updated');
      const guid = extractTextContent(itemXml, 'guid') ||
        extractTextContent(itemXml, 'id') ||
        link;

      // Image preference: enclosure, then an image URL in the description,
      // then a YouTube thumbnail derived from the link.
      let image = extractAttribute(itemXml, 'enclosure', 'url');
      if (!image && description) {
        // NOTE: `description` has already had its markup stripped by
        // extractTextContent, so this only fires when a `src="…"` survives as
        // plain text (for example in entity-escaped HTML).
        const imgMatch = description.match(/src=["']([^"']*\.(jpg|jpeg|png|gif|webp))[^"']*/i);
        if (imgMatch) {
          image = imgMatch[1];
        }
      }
      if (!image && link) {
        image = generateYouTubeThumbnail(link) ?? '';
      }

      if (!title || !guid || seenGuids.has(guid)) {
        continue;
      }
      seenGuids.add(guid);
      items.push({
        title,
        description,
        link,
        pubDate: pubDate || new Date().toISOString(),
        guid,
        image,
        content: description
      });
    }
    return items;
  }

  /** Converts a feed date string to ISO 8601, falling back to `fallback` when it cannot be parsed. */
  function toIsoTimestamp(raw: string, fallback: string): string {
    const parsed = new Date(raw);
    return Number.isNaN(parsed.getTime()) ? fallback : parsed.toISOString();
  }

  /**
   * HTTP entry point: refreshes one feed.
   *
   * Expects a JSON body `{ feedId, feedUrl }`. The caller must be a cron job, an
   * internal call, or an authenticated user. The feed is downloaded, parsed, new
   * entries are inserted into `articles`, entries that already exist get their
   * `last_seen_at` bumped, and the `feeds` row is updated.
   *
   * Responses: 200 with counts on success; 400 for bad input; 401 when
   * unauthorized; 404 when the feed row does not exist; 500 for anything else.
   */
  serve(async (req) => {
    // CORS preflight.
    if (req.method === 'OPTIONS') {
      return new Response('ok', { headers: corsHeaders })
    }

    try {
      // Authentication: allow internal calls (cron jobs) OR authenticated users.
      const isCronJob = validateCronSecret(req);
      const isInternal = isInternalCall(req);
      const auth = await verifyAuth(req);
      if (!isCronJob && !isInternal && !auth) {
        console.log('Unauthorized access attempt to fetch-rss');
        return jsonResponse({ error: 'Unauthorized', success: false }, 401);
      }

      const supabaseClient = createClient(
        Deno.env.get('SUPABASE_URL') ?? '',
        Deno.env.get('SUPABASE_SERVICE_ROLE_KEY') ?? ''
      )

      // A malformed or non-object body is a client error, not a server fault.
      let payload: unknown;
      try {
        payload = await req.json();
      } catch {
        return jsonResponse({ error: 'Invalid JSON body', success: false }, 400);
      }
      const fields: Record<string, unknown> =
        typeof payload === 'object' && payload !== null ? (payload as Record<string, unknown>) : {};
      const { feedId, feedUrl } = fields;

      // Input validation
      if (!feedId || typeof feedId !== 'string') {
        return jsonResponse({ error: 'Invalid feed ID', success: false }, 400);
      }
      if (!feedUrl || typeof feedUrl !== 'string') {
        return jsonResponse({ error: 'Invalid feed URL', success: false }, 400);
      }

      // SSRF protection: validate the requested URL.
      const urlValidation = isValidExternalUrl(feedUrl);
      if (!urlValidation.valid) {
        console.log(`SSRF blocked: ${feedUrl} - ${urlValidation.error}`);
        return jsonResponse({ error: urlValidation.error, success: false }, 400);
      }

      // Verify the feed exists in the database.
      const { data: feedData, error: feedError } = await supabaseClient
        .from('feeds')
        .select('id, url')
        .eq('id', feedId)
        .single()
      if (feedError || !feedData) {
        console.log(`Feed not found: ${feedId}`);
        return jsonResponse({ error: 'Feed not found', success: false }, 404);
      }

      // Fetch the URL stored for this feed, never the one supplied in the request:
      // otherwise any caller holding a valid feed ID could pull arbitrary content
      // into that feed.
      const storedUrl: string = typeof feedData.url === 'string' ? feedData.url : '';
      if (!storedUrl) {
        return jsonResponse({ error: 'Invalid feed URL', success: false }, 400);
      }
      if (storedUrl !== feedUrl) {
        console.log(`Requested URL differs from stored URL for feed ${feedId}; using stored URL`);
      }
      const storedUrlValidation = isValidExternalUrl(storedUrl);
      if (!storedUrlValidation.valid) {
        console.log(`SSRF blocked: ${storedUrl} - ${storedUrlValidation.error}`);
        return jsonResponse({ error: storedUrlValidation.error, success: false }, 400);
      }

      console.log(`Fetching RSS for feed: ${feedId}, URL: ${storedUrl}`)
      const rssText = await fetchFeedText(storedUrl);
      console.log(`RSS content length: ${rssText.length}`)

      const items = parseFeedItems(rssText);
      console.log(`Parsed ${items.length} items from RSS feed`)

      // Save articles to database
      const now = new Date().toISOString();

      // Load the GUIDs already stored for this feed so known articles are not rewritten.
      // NOTE(review): this select is unbounded; if the project caps rows per
      // request, very large feeds may need paging here — confirm.
      const { data: existingArticles, error: existingError } = await supabaseClient
        .from('articles')
        .select('guid')
        .eq('feed_id', feedId)
      if (existingError) {
        // Continuing would treat every parsed item as new and insert duplicates.
        console.error('Error loading existing articles:', existingError)
        throw existingError
      }
      const existingGuids = new Set(
        (existingArticles ?? []).map((a: { guid: string | null }) => a.guid)
      );
      const newItems = items.filter((item) => !existingGuids.has(item.guid));
      const existingItems = items.filter((item) => existingGuids.has(item.guid));
      console.log(`New articles: ${newItems.length}, existing: ${existingItems.length}`)

      // Only update last_seen_at for existing articles (no content rewrite = less I/O).
      if (existingItems.length > 0) {
        const { error: updateError } = await supabaseClient
          .from('articles')
          .update({ last_seen_at: now })
          .eq('feed_id', feedId)
          .in('guid', existingItems.map((item) => item.guid))
        if (updateError) {
          // Best effort: a stale last_seen_at does not invalidate the refresh.
          console.error('Error updating last_seen_at:', updateError)
        }
      }

      // Build rows for the new articles only.
      const articlesToInsert = newItems.map((item) => {
        // Count whitespace-separated words; ~200 words per minute, minimum one minute.
        const wordCount = item.description.trim().split(/\s+/).filter(Boolean).length;
        const readTime = Math.max(1, Math.ceil(wordCount / 200));
        return {
          feed_id: feedId,
          title: item.title,
          description: item.description,
          content: item.content || item.description,
          url: item.link || null,
          image_url: item.image || null,
          published_at: toIsoTimestamp(item.pubDate, now),
          guid: item.guid,
          read_time: readTime,
          last_seen_at: now
        };
      });
      console.log(`Preparing to insert ${articlesToInsert.length} new articles`)

      if (articlesToInsert.length > 0) {
        const { error: insertError } = await supabaseClient
          .from('articles')
          .insert(articlesToInsert)
        if (insertError) {
          console.error('Error inserting articles:', insertError)
          throw insertError
        }
      }

      // Get current article count for this feed
      const { count: currentArticleCount, error: countError } = await supabaseClient
        .from('articles')
        .select('*', { count: 'exact', head: true })
        .eq('feed_id', feedId)
      if (countError) {
        console.error('Error counting articles:', countError)
      }

      // Update the feed row. article_count is only written when the count query
      // succeeded, so a failed count does not reset the stored value to 0.
      const feedUpdate: Record<string, unknown> = {
        last_fetched_at: new Date().toISOString(),
        status: 'active'
      };
      if (!countError && currentArticleCount !== null) {
        feedUpdate.article_count = currentArticleCount;
      }
      const { error: feedUpdateError } = await supabaseClient
        .from('feeds')
        .update(feedUpdate)
        .eq('id', feedId)
      if (feedUpdateError) {
        console.error('Error updating feed:', feedUpdateError)
        throw feedUpdateError
      }

      const totalProcessed = newItems.length + existingItems.length
      console.log(`Feed ${feedId}: ${newItems.length} new articles inserted, ${existingItems.length} existing updated`)
      return jsonResponse({
        success: true,
        articlesProcessed: totalProcessed,
        newArticles: newItems.length,
        updatedArticles: existingItems.length
      });
    } catch (error: unknown) {
      // Details go to the log only; the client gets a generic message.
      console.error('Error in fetch-rss function:', error)
      return jsonResponse(
        { error: 'An error occurred while fetching the feed', success: false },
        500
      );
    }
  })