 *	byte arrays at the end of sockaddr_ll
 *	and packet_mreq.
 *	Johann Baudy	:	Added TX RING.
+ *	Chetan Loke	:	Implemented TPACKET_V3 block abstraction
+ *				layer.
+ *	Copyright (C) 2011, <lokec@ccs.neu.edu>
+ *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
@@ -161,9 +165,56 @@ struct packet_mreq_max {
 	unsigned char	mr_address[MAX_ADDR_LEN];
 };
 
-static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
+static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		int closing, int tx_ring);
 
+
+#define V3_ALIGNMENT	(8)
+
+#define BLK_HDR_LEN	(ALIGN(sizeof(struct block_desc), V3_ALIGNMENT))
+
+#define BLK_PLUS_PRIV(sz_of_priv) \
+	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
+
+/* kbdq - kernel block descriptor queue */
+struct kbdq_core {
+	struct pgv	*pkbdq;
+	unsigned int	feature_req_word;
+	unsigned int	hdrlen;
+	unsigned char	reset_pending_on_curr_blk;
+	unsigned char	delete_blk_timer;
+	unsigned short	kactive_blk_num;
+	unsigned short	blk_sizeof_priv;
+
+	/* last_kactive_blk_num:
+	 * trick to see if user-space has caught up
+	 * in order to avoid refreshing timer when every single pkt arrives.
+	 */
+	unsigned short	last_kactive_blk_num;
+
+	char		*pkblk_start;
+	char		*pkblk_end;
+	int		kblk_size;
+	unsigned int	knum_blocks;
+	uint64_t	knxt_seq_num;
+	char		*prev;
+	char		*nxt_offset;
+	struct sk_buff	*skb;
+
+	atomic_t	blk_fill_in_prog;
+
+	/* Default is set to 8ms */
+#define DEFAULT_PRB_RETIRE_TOV	(8)
+
+	unsigned short	retire_blk_tov;
+	unsigned short	version;
+	unsigned long	tov_in_jiffies;
+
+	/* timer to retire an outstanding block */
+	struct timer_list retire_blk_timer;
+};
+
+#define PGV_FROM_VMALLOC 1
 struct pgv {
 	char *buffer;
 };
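The three macros above fix the geometry of every V3 block: the block starts with the block descriptor rounded up to V3_ALIGNMENT (BLK_HDR_LEN), an optional user-requested private area follows, and the first packet lands at BLK_PLUS_PRIV(tp_sizeof_priv); prb_open_block() further down stores exactly these two offsets in the descriptor. A small standalone sketch of the same arithmetic, assuming a 48-byte descriptor and a 13-byte private area (both numbers are made up for illustration):

#include <stdint.h>
#include <stdio.h>

#define V3_ALIGNMENT	8
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((uint64_t)(a) - 1))	/* what ALIGN() does */

int main(void)
{
	uint64_t blk_hdr_len   = ALIGN_UP(48, V3_ALIGNMENT);	/* assumed sizeof(struct block_desc) */
	uint64_t sizeof_priv   = 13;				/* assumed tp_sizeof_priv */
	uint64_t first_pkt_off = blk_hdr_len + ALIGN_UP(sizeof_priv, V3_ALIGNMENT);

	/* prints: offset_to_priv=48 offset_to_first_pkt=64 */
	printf("offset_to_priv=%llu offset_to_first_pkt=%llu\n",
	       (unsigned long long)blk_hdr_len, (unsigned long long)first_pkt_off);
	return 0;
}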
@@ -179,12 +230,40 @@ struct packet_ring_buffer {
 	unsigned int		pg_vec_pages;
 	unsigned int		pg_vec_len;
 
+	struct kbdq_core	prb_bdqc;
 	atomic_t		pending;
 };
 
+#define BLOCK_STATUS(x)		((x)->hdr.bh1.block_status)
+#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
+#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
+#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
+#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
+#define BLOCK_O2PRIV(x)		((x)->offset_to_priv)
+#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))
+
 struct packet_sock;
 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
 
+static void *packet_previous_frame(struct packet_sock *po,
+		struct packet_ring_buffer *rb,
+		int status);
+static void packet_increment_head(struct packet_ring_buffer *buff);
+static int prb_curr_blk_in_use(struct kbdq_core *,
+			struct block_desc *);
+static void *prb_dispatch_next_block(struct kbdq_core *,
+			struct packet_sock *);
+static void prb_retire_current_block(struct kbdq_core *,
+		struct packet_sock *, unsigned int status);
+static int prb_queue_frozen(struct kbdq_core *);
+static void prb_open_block(struct kbdq_core *, struct block_desc *);
+static void prb_retire_rx_blk_timer_expired(unsigned long);
+static void _prb_refresh_rx_retire_blk_timer(struct kbdq_core *);
+static void prb_init_blk_timer(struct packet_sock *, struct kbdq_core *,
+		void (*func) (unsigned long));
+static void prb_fill_rxhash(struct kbdq_core *, struct tpacket3_hdr *);
+static void prb_clear_rxhash(struct kbdq_core *, struct tpacket3_hdr *);
+static void prb_fill_vlan_info(struct kbdq_core *, struct tpacket3_hdr *);
 static void packet_flush_mclist(struct sock *sk);
 
 struct packet_fanout;
@@ -193,6 +272,7 @@ struct packet_sock {
 	struct sock		sk;
 	struct packet_fanout	*fanout;
 	struct tpacket_stats	stats;
+	union  tpacket_stats_u	stats_u;
 	struct packet_ring_buffer	rx_ring;
 	struct packet_ring_buffer	tx_ring;
 	int			copy_thresh;
@@ -242,6 +322,15 @@ struct packet_skb_cb {
 
 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
 
+#define GET_PBDQC_FROM_RB(x)	((struct kbdq_core *)(&(x)->prb_bdqc))
+#define GET_PBLOCK_DESC(x, bid)	\
+	((struct block_desc *)((x)->pkbdq[(bid)].buffer))
+#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
+	((struct block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
+#define GET_NEXT_PRB_BLK_NUM(x) \
+	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
+	((x)->kactive_blk_num+1) : 0)
+
 static inline struct packet_sock *pkt_sk(struct sock *sk)
 {
 	return (struct packet_sock *)sk;
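GET_CURR_PBLOCK_DESC_FROM_CORE() and GET_NEXT_PRB_BLK_NUM() treat pkbdq[] as a circular array of block descriptors: the active index advances by one and wraps back to 0 after knum_blocks - 1. The same advance written out as a plain helper (illustrative only, not part of the patch):

/* What GET_NEXT_PRB_BLK_NUM() expands to, as a readable function. */
static unsigned short next_blk_num(unsigned short kactive_blk_num,
				   unsigned int knum_blocks)
{
	return (kactive_blk_num < knum_blocks - 1) ? kactive_blk_num + 1 : 0;
}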
@@ -325,8 +414,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 		h.h2->tp_status = status;
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		break;
+	case TPACKET_V3:
 	default:
-		pr_err("TPACKET version not supported\n");
+		WARN(1, "TPACKET version not supported.\n");
 		BUG();
 	}
 
@@ -351,8 +441,9 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
 	case TPACKET_V2:
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		return h.h2->tp_status;
+	case TPACKET_V3:
 	default:
-		pr_err("TPACKET version not supported\n");
+		WARN(1, "TPACKET version not supported.\n");
 		BUG();
 		return 0;
 	}
@@ -389,6 +480,665 @@ static inline void *packet_current_frame(struct packet_sock *po,
 	return packet_lookup_frame(po, rb, rb->head, status);
 }
 
+static void prb_del_retire_blk_timer(struct kbdq_core *pkc)
+{
+	del_timer_sync(&pkc->retire_blk_timer);
+}
+
+static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
+		int tx_ring,
+		struct sk_buff_head *rb_queue)
+{
+	struct kbdq_core *pkc;
+
+	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
+
+	spin_lock(&rb_queue->lock);
+	pkc->delete_blk_timer = 1;
+	spin_unlock(&rb_queue->lock);
+
+	prb_del_retire_blk_timer(pkc);
+}
+
+static void prb_init_blk_timer(struct packet_sock *po,
+		struct kbdq_core *pkc,
+		void (*func) (unsigned long))
+{
+	init_timer(&pkc->retire_blk_timer);
+	pkc->retire_blk_timer.data = (long)po;
+	pkc->retire_blk_timer.function = func;
+	pkc->retire_blk_timer.expires = jiffies;
+}
+
+static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
+{
+	struct kbdq_core *pkc;
+
+	if (tx_ring)
+		BUG();
+
+	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
+	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
+}
+
+static int prb_calc_retire_blk_tmo(struct packet_sock *po,
+				int blk_size_in_bytes)
+{
+	struct net_device *dev;
+	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
+
+	dev = dev_get_by_index(sock_net(&po->sk), po->ifindex);
+	if (unlikely(dev == NULL))
+		return DEFAULT_PRB_RETIRE_TOV;
+
+	if (dev->ethtool_ops && dev->ethtool_ops->get_settings) {
+		struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET, };
+
+		if (!dev->ethtool_ops->get_settings(dev, &ecmd)) {
+			switch (ecmd.speed) {
+			case SPEED_10000:
+				msec = 1;
+				div = 10000/1000;
+				break;
+			case SPEED_1000:
+				msec = 1;
+				div = 1000/1000;
+				break;
+			/*
+			 * If the link speed is so slow you don't really
+			 * need to worry about perf anyways
+			 */
+			case SPEED_100:
+			case SPEED_10:
+			default:
+				return DEFAULT_PRB_RETIRE_TOV;
+			}
+		}
+	}
+
+	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
+
+	if (div)
+		mbits /= div;
+
+	tmo = mbits * msec;
+
+	if (div)
+		return tmo+1;
+	return tmo;
+}
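prb_calc_retire_blk_tmo() sizes the retire timeout from how long a full-rate link needs to fill one block: block size in megabits divided by the link speed in Gbit/s, plus one millisecond of slack when a speed was reported, falling back to DEFAULT_PRB_RETIRE_TOV (8 ms) when the speed is unknown or slow. For a 1 MiB block on a 1 Gbps link that is 8 Mbit / 1 Gbps, roughly 8 ms, so the function returns 9. A compact restatement of the same arithmetic for the two supported speeds (illustrative only, not kernel code):

/* Same math as prb_calc_retire_blk_tmo() once ethtool reported 1G or 10G. */
static unsigned int retire_tmo_msec(unsigned int blk_size_in_bytes,
				    unsigned int speed_mbps)
{
	unsigned int mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
	unsigned int div   = speed_mbps / 1000;		/* 1 for 1G, 10 for 10G */

	return mbits / div + 1;		/* 1 MiB @ 1 Gbps -> 8 + 1 = 9 ms */
}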
+
+static void prb_init_ft_ops(struct kbdq_core *p1,
+			union tpacket_req_u *req_u)
+{
+	p1->feature_req_word = req_u->req3.tp_feature_req_word;
+}
+
+static void init_prb_bdqc(struct packet_sock *po,
+			struct packet_ring_buffer *rb,
+			struct pgv *pg_vec,
+			union tpacket_req_u *req_u, int tx_ring)
+{
+	struct kbdq_core *p1 = &rb->prb_bdqc;
+	struct block_desc *pbd;
+
+	memset(p1, 0x0, sizeof(*p1));
+
+	p1->knxt_seq_num = 1;
+	p1->pkbdq = pg_vec;
+	pbd = (struct block_desc *)pg_vec[0].buffer;
+	p1->pkblk_start	= (char *)pg_vec[0].buffer;
+	p1->kblk_size = req_u->req3.tp_block_size;
+	p1->knum_blocks	= req_u->req3.tp_block_nr;
+	p1->hdrlen = po->tp_hdrlen;
+	p1->version = po->tp_version;
+	p1->last_kactive_blk_num = 0;
+	po->stats_u.stats3.tp_freeze_q_cnt = 0;
+	if (req_u->req3.tp_retire_blk_tov)
+		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
+	else
+		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
+						req_u->req3.tp_block_size);
+	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
+	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
+
+	prb_init_ft_ops(p1, req_u);
+	prb_setup_retire_blk_timer(po, tx_ring);
+	prb_open_block(p1, pbd);
+}
+
+/*  Do NOT update the last_blk_num first.
+ *  Assumes sk_buff_head lock is held.
+ */
+static void _prb_refresh_rx_retire_blk_timer(struct kbdq_core *pkc)
+{
+	mod_timer(&pkc->retire_blk_timer,
+			jiffies + pkc->tov_in_jiffies);
+	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
+}
+
+/*
+ * Timer logic:
+ * 1) We refresh the timer only when we open a block.
+ *    By doing this we don't waste cycles refreshing the timer
+ *    on packet-by-packet basis.
+ *
+ * With a 1MB block-size, on a 1Gbps line, it will take
+ * i) ~8 ms to fill a block + ii) memcpy etc.
+ * In this cut we are not accounting for the memcpy time.
+ *
+ * So, if the user sets the 'tmo' to 10ms then the timer
+ * will never fire while the block is still getting filled
+ * (which is what we want). However, the user could choose
+ * to close a block early and that's fine.
+ *
+ * But when the timer does fire, we check whether or not to refresh it.
+ * Since the tmo granularity is in msecs, it is not too expensive
+ * to refresh the timer, lets say every '8' msecs.
+ * Either the user can set the 'tmo' or we can derive it based on
+ * a) line-speed and b) block-size.
+ * prb_calc_retire_blk_tmo() calculates the tmo.
+ *
+ */
+static void prb_retire_rx_blk_timer_expired(unsigned long data)
+{
+	struct packet_sock *po = (struct packet_sock *)data;
+	struct kbdq_core *pkc = &po->rx_ring.prb_bdqc;
+	unsigned int frozen;
+	struct block_desc *pbd;
+
+	spin_lock(&po->sk.sk_receive_queue.lock);
+
+	frozen = prb_queue_frozen(pkc);
+	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+	if (unlikely(pkc->delete_blk_timer))
+		goto out;
+
+	/* We only need to plug the race when the block is partially filled.
+	 * tpacket_rcv:
+	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
+	 *		copy_bits() is in progress ...
+	 *		timer fires on other cpu:
+	 *		we can't retire the current block because copy_bits
+	 *		is in progress.
+	 *
+	 */
+	if (BLOCK_NUM_PKTS(pbd)) {
+		while (atomic_read(&pkc->blk_fill_in_prog)) {
+			/* Waiting for skb_copy_bits to finish... */
+			cpu_relax();
+		}
+	}
+
+	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
+		if (!frozen) {
+			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
+			if (!prb_dispatch_next_block(pkc, po))
+				goto refresh_timer;
+			else
+				goto out;
+		} else {
+			/* Case 1. Queue was frozen because user-space was
+			 *	   lagging behind.
+			 */
+			if (prb_curr_blk_in_use(pkc, pbd)) {
+				/*
+				 * Ok, user-space is still behind.
+				 * So just refresh the timer.
+				 */
+				goto refresh_timer;
+			} else {
+			       /* Case 2. queue was frozen,user-space caught up,
+				* now the link went idle && the timer fired.
+				* We don't have a block to close.So we open this
+				* block and restart the timer.
+				* opening a block thaws the queue,restarts timer
+				* Thawing/timer-refresh is a side effect.
+				*/
+				prb_open_block(pkc, pbd);
+				goto out;
+			}
+		}
+	}
+
+refresh_timer:
+	_prb_refresh_rx_retire_blk_timer(pkc);
+
+out:
+	spin_unlock(&po->sk.sk_receive_queue.lock);
+}
+
+static inline void prb_flush_block(struct kbdq_core *pkc1,
+		struct block_desc *pbd1, __u32 status)
+{
+	/* Flush everything minus the block header */
+
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
+	u8 *start, *end;
+
+	start = (u8 *)pbd1;
+
+	/* Skip the block header(we know header WILL fit in 4K) */
+	start += PAGE_SIZE;
+
+	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
+	for (; start < end; start += PAGE_SIZE)
+		flush_dcache_page(pgv_to_page(start));
+
+	smp_wmb();
+#endif
+
+	/* Now update the block status. */
+
+	BLOCK_STATUS(pbd1) = status;
+
+	/* Flush the block header */
+
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
+	start = (u8 *)pbd1;
+	flush_dcache_page(pgv_to_page(start));
+
+	smp_wmb();
+#endif
+}
+
+/*
+ * Side effect:
+ *
+ * 1) flush the block
+ * 2) Increment active_blk_num
+ *
+ * Note:We DONT refresh the timer on purpose.
+ *	Because almost always the next block will be opened.
+ */
+static void prb_close_block(struct kbdq_core *pkc1, struct block_desc *pbd1,
+		struct packet_sock *po, unsigned int stat)
+{
+	__u32 status = TP_STATUS_USER | stat;
+
+	struct tpacket3_hdr *last_pkt;
+	struct hdr_v1 *h1 = &pbd1->hdr.bh1;
+
+	if (po->stats.tp_drops)
+		status |= TP_STATUS_LOSING;
+
+	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
+	last_pkt->tp_next_offset = 0;
+
+	/* Get the ts of the last pkt */
+	if (BLOCK_NUM_PKTS(pbd1)) {
+		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
+		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
+	} else {
+		/* Ok, we tmo'd - so get the current time */
+		struct timespec ts;
+		getnstimeofday(&ts);
+		h1->ts_last_pkt.ts_sec = ts.tv_sec;
+		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
+	}
+
+	smp_wmb();
+
+	/* Flush the block */
+	prb_flush_block(pkc1, pbd1, status);
+
+	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
+}
+
+static inline void prb_thaw_queue(struct kbdq_core *pkc)
+{
+	pkc->reset_pending_on_curr_blk = 0;
+}
+
+/*
+ * Side effect of opening a block:
+ *
+ * 1) prb_queue is thawed.
+ * 2) retire_blk_timer is refreshed.
+ *
+ */
+static void prb_open_block(struct kbdq_core *pkc1, struct block_desc *pbd1)
+{
+	struct timespec ts;
+	struct hdr_v1 *h1 = &pbd1->hdr.bh1;
+
+	smp_rmb();
+
+	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {
+
+		/* We could have just memset this but we will lose the
+		 * flexibility of making the priv area sticky
+		 */
+		BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
+		BLOCK_NUM_PKTS(pbd1) = 0;
+		BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
+		getnstimeofday(&ts);
+		h1->ts_first_pkt.ts_sec = ts.tv_sec;
+		h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
+		pkc1->pkblk_start = (char *)pbd1;
+		pkc1->nxt_offset = (char *)(pkc1->pkblk_start +
+		BLK_PLUS_PRIV(pkc1->blk_sizeof_priv));
+		BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
+		BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
+		pbd1->version = pkc1->version;
+		pkc1->prev = pkc1->nxt_offset;
+		pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
+		prb_thaw_queue(pkc1);
+		_prb_refresh_rx_retire_blk_timer(pkc1);
+
+		smp_wmb();
+
+		return;
+	}
+
+	WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
+		pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
+	dump_stack();
+	BUG();
+}
+
+/*
+ * Queue freeze logic:
+ * 1) Assume tp_block_nr = 8 blocks.
+ * 2) At time 't0', user opens Rx ring.
+ * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
+ * 4) user-space is either sleeping or processing block '0'.
+ * 5) tpacket_rcv is currently filling block '7', since there is no space left,
+ *    it will close block-7,loop around and try to fill block '0'.
+ *    call-flow:
+ *    __packet_lookup_frame_in_block
+ *      prb_retire_current_block()
+ *      prb_dispatch_next_block()
+ *        |->(BLOCK_STATUS == USER) evaluates to true
+ *    5.1) Since block-0 is currently in-use, we just freeze the queue.
+ * 6) Now there are two cases:
+ *    6.1) Link goes idle right after the queue is frozen.
+ *         But remember, the last open_block() refreshed the timer.
+ *         When this timer expires,it will refresh itself so that we can
+ *         re-open block-0 in near future.
+ *    6.2) Link is busy and keeps on receiving packets. This is a simple
+ *         case and __packet_lookup_frame_in_block will check if block-0
+ *         is free and can now be re-used.
+ */
+static inline void prb_freeze_queue(struct kbdq_core *pkc,
+				  struct packet_sock *po)
+{
+	pkc->reset_pending_on_curr_blk = 1;
+	po->stats_u.stats3.tp_freeze_q_cnt++;
+}
+
+#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
+
+/*
+ * If the next block is free then we will dispatch it
+ * and return a good offset.
+ * Else, we will freeze the queue.
+ * So, caller must check the return value.
+ */
+static void *prb_dispatch_next_block(struct kbdq_core *pkc,
+		struct packet_sock *po)
+{
+	struct block_desc *pbd;
+
+	smp_rmb();
+
+	/* 1. Get current block num */
+	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+	/* 2. If this block is currently in_use then freeze the queue */
+	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
+		prb_freeze_queue(pkc, po);
+		return NULL;
+	}
+
+	/*
+	 * 3.
+	 * open this block and return the offset where the first packet
+	 * needs to get stored.
+	 */
+	prb_open_block(pkc, pbd);
+	return (void *)pkc->nxt_offset;
+}
+
+static void prb_retire_current_block(struct kbdq_core *pkc,
+		struct packet_sock *po, unsigned int status)
+{
+	struct block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+	/* retire/close the current block */
+	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
+		/*
+		 * Plug the case where copy_bits() is in progress on
+		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
+		 * have space to copy the pkt in the current block and
+		 * called prb_retire_current_block()
+		 *
+		 * We don't need to worry about the TMO case because
+		 * the timer-handler already handled this case.
+		 */
+		if (!(status & TP_STATUS_BLK_TMO)) {
+			while (atomic_read(&pkc->blk_fill_in_prog)) {
+				/* Waiting for skb_copy_bits to finish... */
+				cpu_relax();
+			}
+		}
+		prb_close_block(pkc, pbd, po, status);
+		return;
+	}
+
+	WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
+	dump_stack();
+	BUG();
+}
+
+static inline int prb_curr_blk_in_use(struct kbdq_core *pkc,
+				      struct block_desc *pbd)
+{
+	return TP_STATUS_USER & BLOCK_STATUS(pbd);
+}
+
+static inline int prb_queue_frozen(struct kbdq_core *pkc)
+{
+	return pkc->reset_pending_on_curr_blk;
+}
+
+static inline void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
+{
+	struct kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
+	atomic_dec(&pkc->blk_fill_in_prog);
+}
+
+static inline void prb_fill_rxhash(struct kbdq_core *pkc,
+			struct tpacket3_hdr *ppd)
+{
+	ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
+}
+
+static inline void prb_clear_rxhash(struct kbdq_core *pkc,
+			struct tpacket3_hdr *ppd)
+{
+	ppd->hv1.tp_rxhash = 0;
+}
+
+static inline void prb_fill_vlan_info(struct kbdq_core *pkc,
+			struct tpacket3_hdr *ppd)
+{
+	if (vlan_tx_tag_present(pkc->skb)) {
+		ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
+		ppd->tp_status = TP_STATUS_VLAN_VALID;
+	} else {
+		ppd->hv1.tp_vlan_tci = ppd->tp_status = 0;
+	}
+}
+
+static void prb_run_all_ft_ops(struct kbdq_core *pkc,
+			struct tpacket3_hdr *ppd)
+{
+	prb_fill_vlan_info(pkc, ppd);
+
+	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
+		prb_fill_rxhash(pkc, ppd);
+	else
+		prb_clear_rxhash(pkc, ppd);
+}
+
+static inline void prb_fill_curr_block(char *curr, struct kbdq_core *pkc,
+					struct block_desc *pbd,
+					unsigned int len)
+{
+	struct tpacket3_hdr *ppd;
+
+	ppd = (struct tpacket3_hdr *)curr;
+	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
+	pkc->prev = curr;
+	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
+	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
+	BLOCK_NUM_PKTS(pbd) += 1;
+	atomic_inc(&pkc->blk_fill_in_prog);
+	prb_run_all_ft_ops(pkc, ppd);
+}
+
+/* Assumes caller has the sk->rx_queue.lock */
+static void *__packet_lookup_frame_in_block(struct packet_sock *po,
+					    struct sk_buff *skb,
+						int status,
+					    unsigned int len
+					    )
+{
+	struct kbdq_core *pkc;
+	struct block_desc *pbd;
+	char *curr, *end;
+
+	pkc = GET_PBDQC_FROM_RB(((struct packet_ring_buffer *)&po->rx_ring));
+	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+	/* Queue is frozen when user space is lagging behind */
+	if (prb_queue_frozen(pkc)) {
+		/*
+		 * Check if that last block which caused the queue to freeze,
+		 * is still in_use by user-space.
+		 */
+		if (prb_curr_blk_in_use(pkc, pbd)) {
+			/* Can't record this packet */
+			return NULL;
+		} else {
+			/*
+			 * Ok, the block was released by user-space.
+			 * Now let's open that block.
+			 * opening a block also thaws the queue.
+			 * Thawing is a side effect.
+			 */
+			prb_open_block(pkc, pbd);
+		}
+	}
+
+	smp_mb();
+	curr = pkc->nxt_offset;
+	pkc->skb = skb;
+	end = (char *) ((char *)pbd + pkc->kblk_size);
+
+	/* first try the current block */
+	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
+		prb_fill_curr_block(curr, pkc, pbd, len);
+		return (void *)curr;
+	}
+
+	/* Ok, close the current block */
+	prb_retire_current_block(pkc, po, 0);
+
+	/* Now, try to dispatch the next block */
+	curr = (char *)prb_dispatch_next_block(pkc, po);
+	if (curr) {
+		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+		prb_fill_curr_block(curr, pkc, pbd, len);
+		return (void *)curr;
+	}
+
+	/*
+	 * No free blocks are available.user_space hasn't caught up yet.
+	 * Queue was just frozen and now this packet will get dropped.
+	 */
+	return NULL;
+}
+
+static inline void *packet_current_rx_frame(struct packet_sock *po,
+					    struct sk_buff *skb,
+					    int status, unsigned int len)
+{
+	char *curr = NULL;
+	switch (po->tp_version) {
+	case TPACKET_V1:
+	case TPACKET_V2:
+		curr = packet_lookup_frame(po, &po->rx_ring,
+					po->rx_ring.head, status);
+		return curr;
+	case TPACKET_V3:
+		return __packet_lookup_frame_in_block(po, skb, status, len);
+	default:
+		WARN(1, "TPACKET version not supported\n");
+		BUG();
+		return 0;
+	}
+}
+
+static inline void *prb_lookup_block(struct packet_sock *po,
+				     struct packet_ring_buffer *rb,
+				     unsigned int previous,
+				     int status)
+{
+	struct kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
+	struct block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);
+
+	if (status != BLOCK_STATUS(pbd))
+		return NULL;
+	return pbd;
+}
+
+static inline int prb_previous_blk_num(struct packet_ring_buffer *rb)
+{
+	unsigned int prev;
+	if (rb->prb_bdqc.kactive_blk_num)
+		prev = rb->prb_bdqc.kactive_blk_num-1;
+	else
+		prev = rb->prb_bdqc.knum_blocks-1;
+	return prev;
+}
+
+/* Assumes caller has held the rx_queue.lock */
+static inline void *__prb_previous_block(struct packet_sock *po,
+					 struct packet_ring_buffer *rb,
+					 int status)
+{
+	unsigned int previous = prb_previous_blk_num(rb);
+	return prb_lookup_block(po, rb, previous, status);
+}
+
+static inline void *packet_previous_rx_frame(struct packet_sock *po,
+					     struct packet_ring_buffer *rb,
+					     int status)
+{
+	if (po->tp_version <= TPACKET_V2)
+		return packet_previous_frame(po, rb, status);
+
+	return __prb_previous_block(po, rb, status);
+}
+
+static inline void packet_increment_rx_head(struct packet_sock *po,
+					    struct packet_ring_buffer *rb)
+{
+	switch (po->tp_version) {
+	case TPACKET_V1:
+	case TPACKET_V2:
+		return packet_increment_head(rb);
+	case TPACKET_V3:
+	default:
+		WARN(1, "TPACKET version not supported.\n");
+		BUG();
+		return;
+	}
+}
+
 static inline void *packet_previous_frame(struct packet_sock *po,
 		struct packet_ring_buffer *rb,
 		int status)
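Everything above is the kernel half of the block abstraction; the user-space half is to request TPACKET_V3 before PACKET_RX_RING, mmap the ring, and then consume whole blocks instead of single frames: wait for a block whose block_status carries TP_STATUS_USER, walk its num_pkts packets chained by tp_next_offset, and hand the block back by writing TP_STATUS_KERNEL. A hedged sketch using the uapi names as they ended up in mainline <linux/if_packet.h> (tpacket_req3, tpacket_block_desc, tpacket3_hdr); the internal names in this patch revision differ slightly and all error handling is omitted:

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

static void walk_block(struct tpacket_block_desc *pbd)
{
	struct tpacket3_hdr *ppd = (struct tpacket3_hdr *)
		((uint8_t *)pbd + pbd->hdr.bh1.offset_to_first_pkt);

	for (unsigned int i = 0; i < pbd->hdr.bh1.num_pkts; i++) {
		/* packet bytes start at (uint8_t *)ppd + ppd->tp_mac */
		ppd = (struct tpacket3_hdr *)((uint8_t *)ppd + ppd->tp_next_offset);
	}
}

int v3_rx_loop(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	int ver = TPACKET_V3;
	struct tpacket_req3 req;
	uint8_t *map;

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 1 << 20;			/* 1 MiB per block */
	req.tp_block_nr   = 8;
	req.tp_frame_size = 2048;			/* only used for sizing in V3 */
	req.tp_frame_nr   = (req.tp_block_size / req.tp_frame_size) * req.tp_block_nr;
	req.tp_retire_blk_tov = 60;			/* ms; 0 = let the kernel derive it */
	req.tp_feature_req_word = 0;			/* or TP_FT_REQ_FILL_RXHASH */

	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
	map = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		   PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	for (unsigned int b = 0; ; b = (b + 1) % req.tp_block_nr) {
		struct tpacket_block_desc *pbd = (struct tpacket_block_desc *)
			(map + (size_t)b * req.tp_block_size);

		while (!(pbd->hdr.bh1.block_status & TP_STATUS_USER))
			usleep(1000);			/* poll(fd) in real code */

		walk_block(pbd);
		pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;	/* return block to kernel */
		__sync_synchronize();
	}
}

Because a block is only handed to user space when it is closed (full, or the retire timer fired), one wakeup now amortizes over many packets, which is the point of the block abstraction.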
@@ -982,12 +1732,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	union {
 		struct tpacket_hdr *h1;
 		struct tpacket2_hdr *h2;
+		struct tpacket3_hdr *h3;
 		void *raw;
 	} h;
 	u8 *skb_head = skb->data;
 	int skb_len = skb->len;
 	unsigned int snaplen, res;
-	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
+	unsigned long status = TP_STATUS_USER;
 	unsigned short macoff, netoff, hdrlen;
 	struct sk_buff *copy_skb = NULL;
 	struct timeval tv;
@@ -1033,37 +1784,46 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 			po->tp_reserve;
 		macoff = netoff - maclen;
 	}
-
-	if (macoff + snaplen > po->rx_ring.frame_size) {
-		if (po->copy_thresh &&
-		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
-		    (unsigned)sk->sk_rcvbuf) {
-			if (skb_shared(skb)) {
-				copy_skb = skb_clone(skb, GFP_ATOMIC);
-			} else {
-				copy_skb = skb_get(skb);
-				skb_head = skb->data;
+	if (po->tp_version <= TPACKET_V2) {
+		if (macoff + snaplen > po->rx_ring.frame_size) {
+			if (po->copy_thresh &&
+			    atomic_read(&sk->sk_rmem_alloc) + skb->truesize
+			    < (unsigned)sk->sk_rcvbuf) {
+				if (skb_shared(skb)) {
+					copy_skb = skb_clone(skb, GFP_ATOMIC);
+				} else {
+					copy_skb = skb_get(skb);
+					skb_head = skb->data;
+				}
+				if (copy_skb)
+					skb_set_owner_r(copy_skb, sk);
 			}
-			if (copy_skb)
-				skb_set_owner_r(copy_skb, sk);
+			snaplen = po->rx_ring.frame_size - macoff;
+			if ((int)snaplen < 0)
+				snaplen = 0;
 		}
-		snaplen = po->rx_ring.frame_size - macoff;
-		if ((int)snaplen < 0)
-			snaplen = 0;
 	}
-
 	spin_lock(&sk->sk_receive_queue.lock);
-	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
+	h.raw = packet_current_rx_frame(po, skb,
+			TP_STATUS_KERNEL, (macoff+snaplen));
 	if (!h.raw)
 		goto ring_is_full;
-	packet_increment_head(&po->rx_ring);
+	if (po->tp_version <= TPACKET_V2) {
+		packet_increment_rx_head(po, &po->rx_ring);
+	/*
+	 * LOSING will be reported till you read the stats,
+	 * because it's COR - Clear On Read.
+	 * Anyways, moving it for V1/V2 only as V3 doesn't need this
+	 * at packet level.
+	 */
+		if (po->stats.tp_drops)
+			status |= TP_STATUS_LOSING;
+	}
 	po->stats.tp_packets++;
 	if (copy_skb) {
 		status |= TP_STATUS_COPY;
 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
 	}
-	if (!po->stats.tp_drops)
-		status &= ~TP_STATUS_LOSING;
 	spin_unlock(&sk->sk_receive_queue.lock);
 
 	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
@@ -1114,6 +1874,29 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		h.h2->tp_padding = 0;
 		hdrlen = sizeof(*h.h2);
 		break;
+	case TPACKET_V3:
+		/* tp_nxt_offset,vlan are already populated above.
+		 * So DONT clear those fields here
+		 */
+		h.h3->tp_status |= status;
+		h.h3->tp_len = skb->len;
+		h.h3->tp_snaplen = snaplen;
+		h.h3->tp_mac = macoff;
+		h.h3->tp_net = netoff;
+		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
+				&& shhwtstamps->syststamp.tv64)
+			ts = ktime_to_timespec(shhwtstamps->syststamp);
+		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
+				&& shhwtstamps->hwtstamp.tv64)
+			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
+		else if (skb->tstamp.tv64)
+			ts = ktime_to_timespec(skb->tstamp);
+		else
+			getnstimeofday(&ts);
+		h.h3->tp_sec  = ts.tv_sec;
+		h.h3->tp_nsec = ts.tv_nsec;
+		hdrlen = sizeof(*h.h3);
+		break;
 	default:
 		BUG();
 	}
@@ -1134,13 +1917,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	{
 		u8 *start, *end;
 
-		end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
-		for (start = h.raw; start < end; start += PAGE_SIZE)
-			flush_dcache_page(pgv_to_page(start));
+		if (po->tp_version <= TPACKET_V2) {
+			end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
+				+ macoff + snaplen);
+			for (start = h.raw; start < end; start += PAGE_SIZE)
+				flush_dcache_page(pgv_to_page(start));
+		}
 		smp_wmb();
 	}
 #endif
-	__packet_set_status(po, h.raw, status);
+	if (po->tp_version <= TPACKET_V2)
+		__packet_set_status(po, h.raw, status);
+	else
+		prb_clear_blk_fill_status(&po->rx_ring);
 
 	sk->sk_data_ready(sk, 0);
 
@@ -1631,7 +2420,7 @@ static int packet_release(struct socket *sock)
 	struct sock *sk = sock->sk;
 	struct packet_sock *po;
 	struct net *net;
-	struct tpacket_req req;
+	union tpacket_req_u req_u;
 
 	if (!sk)
 		return 0;
@@ -1654,13 +2443,13 @@ static int packet_release(struct socket *sock)
 
 	packet_flush_mclist(sk);
 
-	memset(&req, 0, sizeof(req));
+	memset(&req_u, 0, sizeof(req_u));
 
 	if (po->rx_ring.pg_vec)
-		packet_set_ring(sk, &req, 1, 0);
+		packet_set_ring(sk, &req_u, 1, 0);
 
 	if (po->tx_ring.pg_vec)
-		packet_set_ring(sk, &req, 1, 1);
+		packet_set_ring(sk, &req_u, 1, 1);
 
 	fanout_release(sk);
 
@@ -2280,15 +3069,27 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 	case PACKET_RX_RING:
 	case PACKET_TX_RING:
 	{
-		struct tpacket_req req;
+		union tpacket_req_u req_u;
+		int len;
 
-		if (optlen < sizeof(req))
+		switch (po->tp_version) {
+		case TPACKET_V1:
+		case TPACKET_V2:
+			len = sizeof(req_u.req);
+			break;
+		case TPACKET_V3:
+		default:
+			len = sizeof(req_u.req3);
+			break;
+		}
+		if (optlen < len)
 			return -EINVAL;
 		if (pkt_sk(sk)->has_vnet_hdr)
 			return -EINVAL;
-		if (copy_from_user(&req, optval, sizeof(req)))
+		if (copy_from_user(&req_u.req, optval, len))
 			return -EFAULT;
-		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
+		return packet_set_ring(sk, &req_u, 0,
+			optname == PACKET_TX_RING);
 	}
 	case PACKET_COPY_THRESH:
 	{
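Note what this hunk implies for callers: the number of bytes copied from user space is chosen from po->tp_version, so PACKET_VERSION must already be TPACKET_V3 when PACKET_RX_RING is issued, and the option value must be a full struct tpacket_req3, otherwise the optlen check fails with EINVAL. A minimal sketch of the required ordering (error checks omitted):

	int ver = TPACKET_V3;
	struct tpacket_req3 req3 = { 0 };	/* fill in block/frame geometry */

	/* Version first, then the ring request sized for that version. */
	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req3, sizeof(req3));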
@@ -2315,6 +3116,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		switch (val) {
 		case TPACKET_V1:
 		case TPACKET_V2:
+		case TPACKET_V3:
 			po->tp_version = val;
 			return 0;
 		default:
@@ -2424,6 +3226,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 	struct packet_sock *po = pkt_sk(sk);
 	void *data;
 	struct tpacket_stats st;
+	union tpacket_stats_u st_u;
 
 	if (level != SOL_PACKET)
 		return -ENOPROTOOPT;
@@ -2436,15 +3239,27 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 
 	switch (optname) {
 	case PACKET_STATISTICS:
-		if (len > sizeof(struct tpacket_stats))
-			len = sizeof(struct tpacket_stats);
+		if (po->tp_version == TPACKET_V3) {
+			len = sizeof(struct tpacket_stats_v3);
+		} else {
+			if (len > sizeof(struct tpacket_stats))
+				len = sizeof(struct tpacket_stats);
+		}
 		spin_lock_bh(&sk->sk_receive_queue.lock);
-		st = po->stats;
+		if (po->tp_version == TPACKET_V3) {
+			memcpy(&st_u.stats3, &po->stats,
+			sizeof(struct tpacket_stats));
+			st_u.stats3.tp_freeze_q_cnt =
+			po->stats_u.stats3.tp_freeze_q_cnt;
+			st_u.stats3.tp_packets += po->stats.tp_drops;
+			data = &st_u.stats3;
+		} else {
+			st = po->stats;
+			st.tp_packets += st.tp_drops;
+			data = &st;
+		}
 		memset(&po->stats, 0, sizeof(st));
 		spin_unlock_bh(&sk->sk_receive_queue.lock);
-		st.tp_packets += st.tp_drops;
-
-		data = &st;
 		break;
 	case PACKET_AUXDATA:
 		if (len > sizeof(int))
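For a V3 socket, PACKET_STATISTICS now returns the larger tpacket_stats_v3, whose extra tp_freeze_q_cnt counts how many times the ring was frozen because user space had not released the next block; like the V1/V2 counters it is clear-on-read. A short user-space sketch (mainline uapi names, error handling omitted):

	struct tpacket_stats_v3 st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		fprintf(stderr, "packets=%u drops=%u queue freezes=%u\n",
			st.tp_packets, st.tp_drops, st.tp_freeze_q_cnt);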
@@ -2485,6 +3300,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		case TPACKET_V2:
 			val = sizeof(struct tpacket2_hdr);
 			break;
+		case TPACKET_V3:
+			val = sizeof(struct tpacket3_hdr);
+			break;
 		default:
 			return -EINVAL;
 		}
@@ -2641,7 +3459,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
 
 	spin_lock_bh(&sk->sk_receive_queue.lock);
 	if (po->rx_ring.pg_vec) {
-		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
+		if (!packet_previous_rx_frame(po, &po->rx_ring,
+			TP_STATUS_KERNEL))
 			mask |= POLLIN | POLLRDNORM;
 	}
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -2760,7 +3579,7 @@ static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
 	goto out;
 }
 
-static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
+static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		int closing, int tx_ring)
 {
 	struct pgv *pg_vec = NULL;
@@ -2769,7 +3588,15 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 	struct packet_ring_buffer *rb;
 	struct sk_buff_head *rb_queue;
 	__be16 num;
-	int err;
+	int err = -EINVAL;
+	/* Added to avoid minimal code churn */
+	struct tpacket_req *req = &req_u->req;
+
+	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
+	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
+		WARN(1, "Tx-ring is not supported.\n");
+		goto out;
+	}
 
 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -2795,6 +3622,9 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		case TPACKET_V2:
 			po->tp_hdrlen = TPACKET2_HDRLEN;
 			break;
+		case TPACKET_V3:
+			po->tp_hdrlen = TPACKET3_HDRLEN;
+			break;
 		}
 
 		err = -EINVAL;
@@ -2820,6 +3650,17 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		pg_vec = alloc_pg_vec(req, order);
 		if (unlikely(!pg_vec))
 			goto out;
+		switch (po->tp_version) {
+		case TPACKET_V3:
+		/* Transmit path is not supported. We checked
+		 * it above but just being paranoid
+		 */
+			if (!tx_ring)
+				init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
+			break;
+		default:
+			break;
+		}
 	}
 	/* Done */
 	else {
@@ -2872,7 +3713,11 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		register_prot_hook(sk);
 	}
 	spin_unlock(&po->bind_lock);
-
+	if (closing && (po->tp_version > TPACKET_V2)) {
+		/* Because we don't support block-based V3 on tx-ring */
+		if (!tx_ring)
+			prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
+	}
 	release_sock(sk);
 
 	if (pg_vec)